Investigate a toy problem in which r (the data distribution) is multimodal, while the models p and q can each capture only some of its modes.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%config InlineBackend.figure_format = 'pdf'

import kmod
import kgof
import kgof.goftest as gof
# submodules
from kmod import data, density, kernel, util
from kmod import mctest as mct
import matplotlib
import matplotlib.pyplot as plt
import autograd.numpy as np
import scipy.stats as stats

In [None]:
# Global matplotlib styling shared by every figure in this notebook.
# Only the font size is overridden; family and weight keep their defaults.
font = {'size': 20}
plt.rc('font', **font)
plt.rc('lines', linewidth=2)

# Use font type 42 in both the PDF and PS backends.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

## 1D Gaussian mixture problem

In [None]:
# Centres of the three bumps from which p, q and r are built.
m1, m2, m3 = 0, 15, 20
means = np.array([m1, m2, m3])

# p: a single normal centred at m1.
# NOTE(review): the second argument is presumably the variance -- confirm
# against kmod.density.
p = density.IsotropicNormal(np.array([m1]), 1)

# q: two components, at m1 and at m2 - 2 (means passed as a 2x1 column).
# No pmix is given, so the library's default mixing weights apply.
q = density.IsoGaussianMixture(
    np.array([[m1], [m2 - 2]]),
    np.array([1, 0.1]),
)

# r (the data distribution): two equally weighted components at m2 and m3.
r = density.IsoGaussianMixture(
    np.array([[m2], [m3]]),
    np.array([0.1, 1]),
    pmix=[0.5, 0.5],
)


In [None]:
# Evaluate the three normalized densities on a shared 1D grid that extends
# 4 units left of the smallest mean and 3 units right of the largest one.
dom = np.linspace(means.min() - 4, means.max() + 3, 200).reshape(-1, 1)
denp, denq, denr = [np.exp(P.log_normalized_den(dom)) for P in (p, q, r)]

# Overlay the curves: p in red, q in blue, r in black.
plt.figure(figsize=(8, 5))
for den, fmt, name in zip((denp, denq, denr), ('r-', 'b-', 'k-'), ('p', 'q', 'r')):
    plt.plot(dom, den, fmt, label=name)
plt.legend()

In [None]:
# Draw n points from each of p, q and r. The same seed is passed to all
# three data sources.
n = 200
seed = 35
dsp, dsq, dsr = [P.get_datasource() for P in [p, q, r]]
datp, datq, datr = [ds.sample(n, seed=seed) for ds in [dsp, dsq, dsr]]
X, Y, Z = [D.data() for D in [datp, datq, datr]]

# Overlay normalized histograms of the three samples.
plt.figure(figsize=(8, 5))
a = 0.6  # transparency, so overlapping histograms stay visible
# `density=True` replaces the old `normed=True` keyword, which newer
# Matplotlib releases no longer accept.
plt.hist(X, color='r', alpha=a, density=True, label='X')
plt.hist(Y, color='b', alpha=a, density=True, label='Y')
plt.hist(Z, color='k', alpha=a, density=True, label='Z')
plt.legend()

In [None]:
# Median heuristic on the pooled samples (X with Z, and Y with Z).
medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
medyz = util.meddistance(np.vstack((Y, Z)), subsample=1000)
# The first label used to read 'medzy' although the value shown is medxz.
print('medxz = {:.4g}, medyz = {:.4g}'.format(medxz, medyz))

# Gaussian kernel whose squared bandwidth is the square of the average of
# the two median distances.
mean_med = np.mean([medxz, medyz])
sigma2 = mean_med**2
print('mean_med = {:.4g}'.format(mean_med))
k = kernel.KGauss(sigma2=sigma2)

In [None]:
# Significance level handed to the test.
alpha = 0.05
# Build the SC_MMD test from the p-sample, the q-sample and the kernel k,
# then run it on the r-sample. As the last expression of the cell, the
# returned result is displayed by the notebook.
# NOTE(review): presumably a relative MMD test of p vs. q against r --
# confirm against kmod.mctest.
scmmd = mct.SC_MMD(datp, datq, k, alpha=alpha)
scmmd.perform_test(datr)


## 1D Gaussian mixture problem 2

In [None]:
# Centres of the bumps for the second problem. m2 and m3 coincide.
m1, m2, m3 = 0, 3, 3
means = np.array([m1, m2, m3])

# p: a single normal centred at m1.
# NOTE(review): the second argument is presumably the variance -- confirm
# against kmod.density.
p = density.IsotropicNormal(np.array([m1]), 1)

# q: components at m1 and m2 - 1 (means passed as a 2x1 column), mixed
# with weights 0.2 and 0.8.
q = density.IsoGaussianMixture(
    np.array([[m1], [m2 - 1]]),
    np.array([1, 0.2**2]),
    pmix=[0.2, 0.8],
)

# r (the data distribution): a single narrow normal centred at m3.
r = density.IsotropicNormal(np.array([m3]), 0.2**2)


In [None]:
# Evaluate the three normalized densities on a shared 1D grid that extends
# 4 units left of the smallest mean and 3 units right of the largest one.
dom = np.linspace(means.min() - 4, means.max() + 3, 200).reshape(-1, 1)
denp, denq, denr = [np.exp(P.log_normalized_den(dom)) for P in (p, q, r)]

# Overlay the curves: p in red, q in blue, r in black.
plt.figure(figsize=(8, 5))
for den, fmt, name in zip((denp, denq, denr), ('r-', 'b-', 'k-'), ('p', 'q', 'r')):
    plt.plot(dom, den, fmt, label=name)
plt.legend()

In [None]:
# Draw n points from each of p, q and r. The same seed is passed to all
# three data sources.
n = 300
seed = 35
dsp, dsq, dsr = [P.get_datasource() for P in [p, q, r]]
datp, datq, datr = [ds.sample(n, seed=seed) for ds in [dsp, dsq, dsr]]
X, Y, Z = [D.data() for D in [datp, datq, datr]]

# Overlay normalized histograms of the three samples.
plt.figure(figsize=(8, 5))
a = 0.6  # transparency, so overlapping histograms stay visible
# `density=True` replaces the old `normed=True` keyword, which newer
# Matplotlib releases no longer accept.
plt.hist(X, color='r', alpha=a, density=True, label='X')
plt.hist(Y, color='b', alpha=a, density=True, label='Y')
plt.hist(Z, color='k', alpha=a, density=True, label='Z')
plt.legend()

In [None]:
# Median heuristic on the pooled samples (X with Z, and Y with Z).
medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
medyz = util.meddistance(np.vstack((Y, Z)), subsample=1000)
# The first label used to read 'medzy' although the value shown is medxz.
print('medxz = {:.4g}, medyz = {:.4g}'.format(medxz, medyz))

# Gaussian kernel whose squared bandwidth is the square of the average of
# the two median distances.
mean_med = np.mean([medxz, medyz])
sigma2 = mean_med**2
print('mean_med = {:.4g}'.format(mean_med))
k = kernel.KGauss(sigma2=sigma2)

In [None]:
# Significance level handed to the test.
alpha = 0.05
# Build the SC_MMD test from the p-sample, the q-sample and the kernel k,
# then run it on the r-sample. As the last expression of the cell, the
# returned result is displayed by the notebook.
# NOTE(review): presumably a relative MMD test of p vs. q against r --
# confirm against kmod.mctest.
scmmd = mct.SC_MMD(datp, datq, k, alpha=alpha)
scmmd.perform_test(datr)


## 2d grid of Gaussian mixture components

In [None]:
def rot2d_matrix(angle):
    """Return the 2x2 rotation matrix for `angle` (in radians).

    The result is [[cos, -sin], [sin, cos]], i.e. a counter-clockwise
    rotation when applied to column vectors.
    """
    import math

    cos_a = math.cos(angle)
    sin_a = math.sin(angle)
    return np.array([[cos_a, -sin_a], [sin_a, cos_a]])

def rot2d_cov(angle, cov):
    """Rotate the 2x2 matrix `cov` by `angle` radians.

    Computes R cov R^T, where R = rot2d_matrix(angle).
    """
    rotation = rot2d_matrix(angle)
    half_rotated = np.dot(rotation, cov)
    return np.dot(half_rotated, rotation.T)

In [None]:
# Four component means at the corners (+-5, +-5) of a square.
means = 5 * np.array([[-1.0, 1], [1, 1], [-1, -1], [1, -1]])

# Anisotropic, axis-aligned base covariance (4 along x, 0.5 along y).
base_cov = np.array([[4.0, 0], [0, 0.5]])

# One covariance per component, stacked into a 4x2x2 array. All components
# of a model share one covariance; the models differ only in how far that
# covariance is rotated: r by 0, q by pi/5, p by pi/2.
stack_reps = (4, 1, 1)
covr = np.tile(base_cov, stack_reps)
covq = np.tile(rot2d_cov(np.pi / 5.0, base_cov), stack_reps)
covp = np.tile(rot2d_cov(np.pi / 2.0, base_cov), stack_reps)

p = density.GaussianMixture(means, covp)
q = density.GaussianMixture(means, covq)
r = density.GaussianMixture(means, covr)

In [None]:
# Draw n points from each of p, q and r. The same seed is passed to all
# three data sources.
n = 400
seed = 37
dsp, dsq, dsr = (P.get_datasource() for P in (p, q, r))
datp, datq, datr = (ds.sample(n, seed=seed) for ds in (dsp, dsq, dsr))
X, Y, Z = (D.data() for D in (datp, datq, datr))

In [None]:
# Scatter the three 2D samples on one figure: X red, Y blue, Z black.
plt.figure(figsize=(8, 5))
a = 0.6  # transparency, so overlapping points stay visible
for pts, fmt, name in zip((X, Y, Z), ('r.', 'b.', 'k.'), ('X', 'Y', 'Z')):
    plt.plot(pts[:, 0], pts[:, 1], fmt, label=name, alpha=a)
plt.legend()

In [None]:
# Median heuristic on the pooled samples (X with Z, and Y with Z).
medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
medyz = util.meddistance(np.vstack((Y, Z)), subsample=1000)
# The first label used to read 'medzy' although the value shown is medxz.
print('medxz = {:.4g}, medyz = {:.4g}'.format(medxz, medyz))

# Squared bandwidth suggested by the median heuristic (average of the two
# median distances, squared).
mean_med = np.mean([medxz, medyz])
sigma2 = mean_med**2
print('mean_med = {:.4g}'.format(mean_med))
# NOTE(review): unlike the 1D problems, the kernel here uses a hard-coded
# squared bandwidth of 10**2 and ignores the `sigma2` computed above.
# Left unchanged because it may be deliberate -- confirm.
k = kernel.KGauss(sigma2=10**2)

In [None]:
# Significance level handed to the test.
alpha = 0.05
# Build the SC_MMD test from the p-sample, the q-sample and the kernel k,
# then run it on the r-sample. As the last expression of the cell, the
# returned result is displayed by the notebook.
# NOTE(review): presumably a relative MMD test of p vs. q against r --
# confirm against kmod.mctest.
scmmd = mct.SC_MMD(datp, datq, k, alpha=alpha)
scmmd.perform_test(datr)
