A notebook to illustrate/test `kmod.mctest.SC_MMD`.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%config InlineBackend.figure_format = 'pdf'

import kmod
import kgof
import kgof.goftest as gof
# submodules
from kmod import data, density, kernel, util
from kmod import mctest as mct
import matplotlib
import matplotlib.pyplot as plt
import autograd.numpy as np
import scipy.stats as stats

In [None]:
# Global Matplotlib styling shared by every figure in this notebook.
# Only the font size is overridden; 'family' and 'weight' keep their defaults.
font = dict(size=18)

plt.rc('lines', linewidth=2)
plt.rc('font', **font)

# Use fonttype 42 for both the PDF and the PostScript backends.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

## A simple 1d Gaussian problem

Two models $P = \mathcal{N}(\mu_p, \sigma_p^2)$ and $Q = \mathcal{N}(\mu_q, \sigma^2_q)$. The data generating distribution is $R = \mathcal{N}(0, 1)$.

    H_0: P, Q are equally good
    H_1: Q is better for approximating R

### Case 1: $Q$ is actually better than $P$

In [None]:
# Mean and variance of the two candidate models P and Q.
# Note: Q must not coincide with the data-generating distribution R.
# That would violate the assumption of the test, and the asymptotic
# null distribution would no longer hold.
mp, varp = 0.55, 1.
mq, varq = 0.5, 1.

n = 600  # sample size (same for P, Q and R)
seed = 9

with util.NumpySeedContext(seed=seed):
    # The draw order (X, then Y, then Z) is kept fixed so that a given
    # seed always yields the same three samples.
    X = mp + varp**0.5 * np.random.randn(n, 1)
    Y = mq + varq**0.5 * np.random.randn(n, 1)
    Z = np.random.randn(n, 1)

    # Wrap the raw arrays in the Data containers used by the test.
    datap, dataq, datar = [data.Data(sample) for sample in (X, Y, Z)]

In [None]:
# Plot normalised histograms of the three samples.
# `density=True` replaces the `normed=True` keyword, which was deprecated
# in Matplotlib 2.1 and removed in Matplotlib 3.1.
plt.figure(figsize=(8, 4))
plt.hist(X, color='r', alpha=0.6, density=True, label='X')
plt.hist(Y, color='b', alpha=0.6, density=True, label='Y')
plt.hist(Z, color='k', alpha=0.8, density=True, label='Z')
plt.title('H1: Y is closer to Z')
# Trailing semicolon suppresses the Legend repr in the cell output.
plt.legend();

### Use median heuristic for the Gaussian widths

In [None]:
# Kernel width via the median heuristic: compute the median distance on the
# pooled (X, Z) and (Y, Z) samples, average the two, and use the square of
# that average as the Gaussian kernel's sigma2.
medxz, medyz = [
    util.meddistance(np.vstack(pair), subsample=1000)
    for pair in ((X, Z), (Y, Z))
]
medxyz = np.mean([medxz, medyz])
k = kernel.KGauss(sigma2=medxyz**2)

In [None]:
# Construct an MMD-based relative similarity test and run it on the
# sample from R. The test object is named `scmmd` (the original `scummd`
# was a typo) so that it matches the name used in the rest of the notebook.
alpha = 0.01  # significance level
scmmd = mct.SC_MMD(datap, dataq, k, alpha=alpha)
# Last expression of the cell: the test result is shown as the cell output.
scmmd.perform_test(datar)

### Case 2: Q is not better than P

In [None]:
# P and Q share the same mean and variance in this case, so neither model
# is better than the other.
# Note: Q must not coincide with the data-generating distribution R.
# That would violate the assumption of the test, and the asymptotic
# null distribution would no longer hold.
mp, varp = 0.8, 1.
mq, varq = 0.8, 1.

n = 600  # sample size (same for P, Q and R)
seed = 8

with util.NumpySeedContext(seed=seed):
    # Fixed draw order (X, Y, Z) keeps the samples reproducible per seed.
    X = mp + varp**0.5 * np.random.randn(n, 1)
    Y = mq + varq**0.5 * np.random.randn(n, 1)
    Z = np.random.randn(n, 1)

    # Wrap the raw arrays in the Data containers used by the test.
    datap, dataq, datar = [data.Data(sample) for sample in (X, Y, Z)]

In [None]:
# Median heuristic for the Gaussian kernel width: the median distances of
# the pooled (X, Z) and (Y, Z) samples are averaged, and the squared
# average is used as sigma2.
medxz, medyz = [
    util.meddistance(np.vstack(pair), subsample=1000)
    for pair in ((X, Z), (Y, Z))
]
medxyz = np.mean([medxz, medyz])
k = kernel.KGauss(sigma2=medxyz**2)

In [None]:
# Construct an MMD similarity test from the two model samples (datap, dataq)
# and the kernel k, then run it on the sample drawn from R (datar).
alpha = 0.01 # significance level of the test
scmmd = mct.SC_MMD(datap, dataq, k, alpha=alpha)
# Last expression of the cell: the test result is shown as the cell output.
scmmd.perform_test(datar)

# Debugging

The code below is not needed for normal usage; it is kept only for checking the implementation during development.

Check the asymptotic distribution of the SC_MMD statistic.

In [None]:
def gen_test_samples(n, seed, mp=1.3, varp=1.0, mq=1.0, varq=1.0):
    """
    Draw three 1-d Gaussian samples of size n: X ~ N(mp, varp) from model P,
    Y ~ N(mq, varq) from model Q, and Z ~ N(0, 1) from the data-generating
    distribution R.

    n: number of points in each of the three samples
    seed: seed passed to util.NumpySeedContext so that the draws are
        repeatable for a given seed
    mp, varp: mean and variance of P (defaults: 1.3 and 1.0)
    mq, varq: mean and variance of Q (defaults: 1.0 and 1.0)

    The default values reproduce the problem that was previously hard-coded
    in this function, so existing calls gen_test_samples(n, seed) are
    unaffected.

    Return datap, dataq, datar
    """
    with util.NumpySeedContext(seed=seed):
        # Draw order (X, then Y, then Z) is fixed so that a given seed
        # always produces the same three samples.
        X = np.random.randn(n, 1)*varp**0.5 + mp
        Y = np.random.randn(n, 1)*varq**0.5 + mq
        Z = np.random.randn(n, 1)

        datap = data.Data(X)
        dataq = data.Data(Y)
        datar = data.Data(Z)
    return datap, dataq, datar

In [None]:
# Draw one realisation of the debugging problem and keep the raw arrays
# around for plotting and for the median heuristic.
n = 300  # sample size
seed = 988
datap, dataq, datar = gen_test_samples(n, seed)
X, Y, Z = datap.data(), dataq.data(), datar.data()

In [None]:
# Plot normalised histograms of the three samples.
# `density=True` replaces the `normed=True` keyword, which was deprecated
# in Matplotlib 2.1 and removed in Matplotlib 3.1.
plt.figure(figsize=(8, 4))
plt.hist(X, color='r', alpha=0.6, density=True, label='X')
plt.hist(Y, color='b', alpha=0.6, density=True, label='Y')
plt.hist(Z, color='k', alpha=0.8, density=True, label='Z')
plt.title('H1: Y is closer to Z')
# Trailing semicolon suppresses the Legend repr in the cell output.
plt.legend();

In [None]:
# Median heuristic: sigma2 is the square of the average of the median
# distances computed on the stacked (X, Z) and (Z, Y) samples.
medxz, medyz = [
    util.meddistance(np.vstack(pair), subsample=1000)
    for pair in ((X, Z), (Z, Y))
]
sigma2 = np.mean([medxz, medyz]) ** 2
k = kernel.KGauss(sigma2=sigma2)

In [None]:
# Repeat the experiment on freshly drawn samples and record the test
# statistic of every repetition.
trials = 100   # number of independent problems (new samples each time)
alpha = 0.05   # significance level
null_stats = np.zeros(trials)

# Trial t uses seed t + 83; the kernel k is shared across all trials.
trial_samples = (gen_test_samples(n, seed=s) for s in range(83, 83 + trials))
for t, (datap, dataq, datar) in enumerate(trial_samples):
    scmmd = mct.SC_MMD(datap, dataq, k, alpha=alpha)
    null_stats[t] = scmmd.compute_stat(datar)

# Run the full test on the samples left over from the final trial.
results = scmmd.perform_test(datar)
display(results)

In [None]:
# Variance used for the asymptotic null distribution, estimated from the
# samples of the last trial (the mean returned alongside it is discarded).
_, var_h0 = scmmd.get_H1_mean_variance(datar, return_variance=True)

# Evaluation grid: slightly wider than the range of the simulated statistics.
dom = np.linspace(null_stats.min() - 1, null_stats.max() + 2, num=500)

# Density of the zero-mean normal with the estimated variance on that grid.
null_dist = stats.norm(loc=0, scale=var_h0**0.5)
ph0_values = null_dist.pdf(dom)

In [None]:
# Compare the simulated statistics with the asymptotic null density.
plt.figure(figsize=(8, 5))

plt.plot(dom, ph0_values, 'r-', label='Asymp. null dist.')
# `density=True` replaces the `normed=True` keyword, which was deprecated
# in Matplotlib 2.1 and removed in Matplotlib 3.1.
plt.hist(null_stats, label='Empirical ground truth', alpha=0.7, bins=15, density=True)
# Trailing semicolon suppresses the Legend repr in the cell output.
plt.legend();

When $H_0$ is true, the asymptotic null distribution is expected to lie to the right of the empirically obtained statistics. This means that the type-I error will be lower than $\alpha$, at the cost of a small loss in test power.