# KDE - Kernel density estimation
Estimate the probability density function of a random variable.     
What are kernels?
![Kernels](data/img/kernels.png)

In [None]:
# http://qingkaikong.blogspot.com/2018/05/kernel-density-estimation-animation.html
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import time
%matplotlib inline

We create some random, noisy data with hotspots at -5 and 10 and plot the relative distribution as a binned histogram.

In [None]:
# Fix the seed so the notebook produces the same samples on every run.
np.random.seed(12)

# Two Gaussian clusters of 100 samples each: one around -5, one around 10.
low_cluster = np.random.normal(loc=-5, scale=2, size=100)
high_cluster = np.random.normal(loc=10, scale=3, size=100)
X = np.concatenate((low_cluster, high_cluster))

# Shuffle the order of the data (this is mostly for the animation later,
# which adds the points one by one).
np.random.shuffle(X)

# Normalised histogram, with every raw sample marked as a star on the x-axis.
fig, ax = plt.subplots()
ax.hist(X, bins=15, density=True)
ax.plot(X, np.zeros_like(X), '*k')
ax.set_ylim(-0.01, 0.1)
ax.set(xlabel='Values', ylabel='Frequencies')
fig.show()

Now we create a function that generates a single Gaussian kernel centered on a given point.

In [None]:
def gaussian_kernel(x, mu, sigma):
    """Evaluate a Gaussian (normal) density centered at ``mu``.

    Args:
        x: Point or numpy array of points at which to evaluate the kernel.
        mu: Center of the kernel.
        sigma: Standard deviation (width) of the kernel.

    Returns:
        The density value(s) at ``x``, with the same shape as ``x``.
    """
    # Normalisation constant that makes the kernel integrate to one.
    normalisation = 1 / (sigma * np.sqrt(2 * np.pi))
    exponent = -(x - mu) ** 2 / (2 * sigma ** 2)
    return normalisation * np.exp(exponent)

# Evaluation grid: 100 evenly spaced points spanning the range of the data.
min_v, max_v = X.min(), X.max()
grid = np.linspace(min_v, max_v, num=100)

# Plot a single kernel on that grid as an example.
example_kernel = gaussian_kernel(grid, mu=5, sigma=1.0)
fig, ax = plt.subplots()
ax.plot(grid, example_kernel)
ax.set_title('Example of a kernel centered at 5')
fig.show()

In [None]:
counter = 0 # reset the graph: number of data points the animation cell below will draw

This animation demonstrates how KDE works. The KDE is the sum of all underlying kernels, divided by the number of data points.

In [None]:
# Draw the KDE built from the first `counter` data points. Re-running this
# cell advances the animation, because `counter` is incremented at the end.
fig, ax = plt.subplots()
fig.show()
fig.canvas.draw()
density_estimation = np.zeros(len(grid))

line1, = ax.plot(grid, density_estimation, 'r-', label = 'Sum of all kernels')
ax.set_ylim(-0.01, 0.1)
ax.legend()
ax.set_xlabel('Values')
ax.set_ylabel('Density')

# Never index past the end of X: once every point has been added, further
# re-runs (or a step size that overshoots) simply show the complete estimate
# instead of raising an IndexError.
n_points = min(counter, len(X))
for c in range(n_points):
    # mark the data point on the x-axis
    ax.plot(X[c], 0, '*k')
    kernel = gaussian_kernel(grid, mu = X[c], sigma = 1.0)
    # each kernel contributes 1/len(X), so the full sum integrates to one
    density_estimation += kernel / len(X)
    # faint outline of the individual kernel; the division by 8 only affects
    # this outline, not the density estimate
    ax.plot(grid, kernel / 8, 'k', alpha = 0.1)
    line1.set_data(grid, density_estimation)
    fig.canvas.draw()
    fig.show()
counter +=1 # you can increase the stepsize

Of course we are not the first ones to use this function. It is already integrated in the statistics module of SciPy.    
This function gives the same result.

In [None]:
from scipy import stats

# A scalar bw_method is used by gaussian_kde as a factor that multiplies the
# sample standard deviation of the data; it is not the kernel width itself.
# Dividing the desired kernel width (1.0, the sigma used for the manual sum
# of kernels) by that standard deviation makes SciPy use the same Gaussian
# kernels and therefore reproduce the same density estimate.
kernel_width = 1.0
kernel = stats.gaussian_kde(X, bw_method=kernel_width / np.std(X, ddof=1))

fig, ax = plt.subplots()
ax.plot(grid, kernel.evaluate(grid), 'r-')
ax.set_xlabel('Values')
ax.set_ylabel('Density')
fig.show()

# KDE in 2D
What if we want to estimate the kernel density of 2D data? That's what generates heatmaps.

In [None]:
# Draw two 2D Gaussian clusters (100 points each) with a fixed seed.
np.random.seed(12)
X = np.concatenate((
    np.random.normal(loc=-5, scale=2, size=(100, 2)),
    np.random.normal(loc=10, scale=3, size=(100, 2)),
))

# Split the samples into their two coordinates and take the bounding box.
m1, m2 = X[:, 0], X[:, 1]
xmin, xmax = m1.min(), m1.max()
ymin, ymax = m2.min(), m2.max()

# Evaluate the KDE on a 100 x 100 mesh covering the bounding box.
# NOTE: from here on X no longer holds the samples; it is rebound to the
# x-coordinates of the mesh (the samples remain available as m1 and m2).
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([m1, m2])
kernel = stats.gaussian_kde(values)
Z = kernel(positions).reshape(X.shape)

# Heatmap of the density with the raw samples drawn on top. The mesh is
# indexed as [x, y], so it is rotated by 90 degrees to put x along the
# horizontal image axis and y increasing upwards.
fig, ax = plt.subplots(figsize=(10, 8))
i = ax.imshow(
    np.rot90(Z),
    cmap=plt.cm.jet,
    extent=[xmin, xmax, ymin, ymax],
)
fig.colorbar(i)
ax.plot(m1, m2, 'k.', markersize=5)
ax.set(
    xlim=[xmin, xmax],
    ylim=[ymin, ymax],
    xlabel='X values',
    ylabel='y values',
)
fig.show()