In [None]:
import sys
import os

# Make the parent directory importable (added to sys.path only once).
module_path = os.path.abspath('..')
if not (module_path in sys.path):
    sys.path.append(module_path)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import torch
import gpytorch
from tqdm.notebook import trange
import heapq
import math
import pickle
from algorithms.cd import con_div
from algorithms.ccr import con_conv_rate
from utils.class_imbalance import get_classes, class_proportion

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print()

# CUDA only: report the device name and the current memory usage in GB.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    for label, used_bytes in (('Allocated:', torch.cuda.memory_allocated(0)),
                              ('Cached:   ', torch.cuda.memory_reserved(0))):
        print(label, round(used_bytes/1024**3,1), 'GB')

## Dataset

In [None]:
def sample_GMM(means, covs, num_samples):
    """
    Draws points from an equally weighted mixture of normal distributions.

    For every point a component is first picked uniformly at random, then one point is sampled
    from that component's normal distribution.
    :param means: array of shape (n, d), one mean per component
    :param covs: array of shape (n, d, d), one covariance matrix per component
    :param num_samples: number of points to draw
    :return: (samples, clusters): array of shape (num_samples, d) and int32 array of shape
    (num_samples,) holding the component index each point was drawn from
    """
    num_components, dim = means.shape[0], means.shape[1]
    assert num_components == covs.shape[0]
    assert dim == covs.shape[1]
    assert covs.shape[1] == covs.shape[2]

    samples = np.zeros((num_samples, dim))
    clusters = np.zeros(num_samples, dtype=np.int32)

    for idx in range(num_samples):
        # Component index is drawn before the point so the RNG is consumed in a fixed order.
        chosen = np.random.randint(num_components)
        samples[idx] = np.random.multivariate_normal(means[chosen], covs[chosen], check_valid='raise')
        clusters[idx] = chosen

    return samples, clusters

In [None]:
# Experiment sizes: number of Gaussian clusters, data dimensionality, points drawn per cluster.
num_clusters, d, num_samples = 5, 2, 1000

In [None]:
# Seed NumPy's global RNG so the cluster means and all samples drawn with np.random below are reproducible.
np.random.seed(2)

In [None]:
# Cluster means are drawn uniformly from [0, 1)^d; every cluster uses the same isotropic covariance I/200.
means = np.random.uniform(size=(num_clusters, d))
covs = np.tile(np.eye(d)/200, (num_clusters, 1, 1))

In [None]:
# Pre-allocate one (num_samples, d) slab per cluster for the train and the test split.
train_sets = np.zeros((num_clusters, num_samples, d))
test_sets = np.zeros_like(train_sets)

In [None]:
# Fill the train and test splits with num_samples points from each cluster's normal distribution.
# For every cluster the train draw happens before the test draw, which fixes the order in which
# the global NumPy RNG is consumed.
for i in range(num_clusters):
    train_sets[i] = np.random.multivariate_normal(means[i], covs[i], size=(num_samples), check_valid='raise')
    test_sets[i] = np.random.multivariate_normal(means[i], covs[i], size=(num_samples), check_valid='raise')

In [None]:
from utils.mmd import perm_sampling, mmd
import scipy.stats as stats

In [None]:
def neg_mmd_biased(X, Y, k):
    """
    Calculates the negated biased MMD^2 between X and Y with the pairwise-YY term left out.

    A, B and C are the pairwise-XX, pairwise-XY and pairwise-YY summation terms of the biased
    MMD^2 estimate respectively, i.e. MMD^2 = A + B + C. The first returned value is -(A + B);
    C does not enter it and is only returned (negated) as the last element.
    :param X: array of shape (n, d)
    :param Y: array of shape (m, d)
    :param k: GPyTorch kernel
    :return: tuple of Python floats (-(A + B), -A, -B, -C)
    """
    n = X.shape[0]
    m = Y.shape[0]
    # as_tensor accepts NumPy arrays and tensors alike, without the copy warning torch.tensor emits for tensors.
    X_tens = torch.as_tensor(X, dtype=torch.float32)
    Y_tens = torch.as_tensor(Y, dtype=torch.float32)

    # Only plain floats are returned, so there is no need to record an autograd graph
    # through the (n x n), (n x m) and (m x m) kernel matrices.
    with torch.no_grad():
        A = (1 / (n ** 2)) * torch.sum(k(X_tens).evaluate())
        B = -(2 / (n * m)) * torch.sum(k(X_tens, Y_tens).evaluate())
        C = (1 / (m ** 2)) * torch.sum(k(Y_tens).evaluate())

    return -(A + B).item(), -A.item(), -B.item(), -C.item()

In [None]:
def perm_sampling_neg_biased(P, Q, k, num_perms=200, eta=1.0):
    """
    Shuffles two datasets together, takes two disjoint equally sized parts of this mix, then calculates the first
    value returned by neg_mmd_biased on them to simulate P=Q. Does this num_perms number of times.
    :param P: First dataset, array of shape (n, d)
    :param Q: Second dataset, array of shape (m, d)
    :param k: GPyTorch kernel
    :param num_perms: Number of permutations done to get range of values.
    :param eta: Fraction of samples taken in each shuffle; each of the two parts holds int(eta * (n + m) // 2)
    points. The larger this parameter, the smaller the variance in the estimate. Must satisfy 0 < eta <= 1.
    Defaults to 1.0, i.e. (n + m) // 2 points per part.
    :return: List of the num_perms values, sorted in ascending order.
    :raises ValueError: if eta yields an empty part or needs more than the n + m available points.
    """
    total = P.shape[0] + Q.shape[0]
    num_samples = int(eta * total // 2)
    # Without this check eta > 1 silently gives unequal (or empty) parts and a tiny eta gives
    # an empty part, which fails later with a division by zero.
    if num_samples < 1 or 2 * num_samples > total:
        raise ValueError(
            "eta={} gives {} points per part, which is not possible with n + m = {} points".format(
                eta, num_samples, total))
    XY = np.concatenate((P, Q))

    mmds = []
    for _ in trange(num_perms, desc="Permutation sampling"):
        p = np.random.permutation(len(XY))
        # Two disjoint slices of the same permutation form the simulated "P" and "Q".
        X = XY[p[:num_samples]]
        Y = XY[p[num_samples:num_samples*2]]
        mmds.append(neg_mmd_biased(X, Y, k)[0])
    return sorted(mmds)

In [None]:
num_candidate_points = 10000
num_parties = 10

# Every entry is an independent (samples, cluster_labels) draw from the full mixture via sample_GMM,
# not the points of one single cluster.
gmm_clusters = [sample_GMM(means, covs, num_candidate_points) for _ in range(num_clusters)]
# X and Y are the sample arrays of the first two draws.
X, Y = gmm_clusters[0][0], gmm_clusters[1][0]

In [None]:
# Scaled RBF kernel with one lengthscale per input dimension (ARD).
kernel = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=d))
# One unit lengthscale per dimension; sized from d so it stays consistent with ard_num_dims.
kernel.base_kernel.lengthscale = [1.0] * d
kernel.outputscale = 1

In [None]:
# Test set 1 compared with itself (X and Y are the same points).
neg_mmd_biased(test_sets[1], test_sets[1], kernel)

In [None]:
# Same self-comparison, using only the first 100 points of test set 1.
neg_mmd_biased(test_sets[1][:100], test_sets[1][:100], kernel)

In [None]:
# Test sets of two different clusters (0 and 1) compared with each other.
neg_mmd_biased(test_sets[0], test_sets[1], kernel)

In [None]:
# Ten eta values spaced evenly on a log scale between 0.025 and 1.
log_etas = np.linspace(np.log(0.025), np.log(1.), 10)
etas = np.exp(log_etas)

In [None]:
# Display the eta grid.
etas

In [None]:
# One sorted list of 1000 permutation values per eta, computed on the first 4000 points of X and Y.
all_samps = [
    perm_sampling_neg_biased(X[:4000], Y[:4000], kernel, num_perms=1000, eta=eta)
    for eta in etas
]

In [None]:
# Inspect the sorted permutation values obtained for the first (smallest) eta.
all_samps[0]

In [None]:
# Keep a handle on the full permutation results before all_samps is rebuilt.
# NOTE: this is an alias, not a copy. Re-running this cell after all_samps has been
# replaced by the filtered lists makes this name point at the filtered data instead.
all_samps_untruncated = all_samps

In [None]:
# Rebuild all_samps, keeping for every eta only the values that do not exceed the cutoff 0.9425499439239502.
all_samps = [
    [val for val in samp if val <= 0.9425499439239502]
    for samp in all_samps_untruncated
]

In [None]:
# Number of entries of all_samps processed by the loops below (presumably one per eta value; verify against len(etas)).
num_curves = 10

In [None]:
# Persist all_samps to disk; the context manager closes the file as soon as the dump is done.
with open("all_samps.p", "wb") as f:
    pickle.dump(all_samps, f)

In [None]:
# For each curve, build an extended bin grid and a Gaussian KDE from its permutation values.
all_x = []
all_density = []
for i in range(num_curves):
    bins = np.histogram(all_samps[i], bins=50)[1]
    interval = bins[1] - bins[0]
    # Prepend 700 extra, equally spaced bin edges to the left of the smallest edge.
    # The comprehension variable is named `step` so it does not reuse the outer loop index name.
    bins = np.concatenate(([bins[0] - interval*step for step in range(700, 0, -1)], bins))
    density = stats.gaussian_kde(all_samps[i])
    # Draw the histogram of the same curve the bins and the KDE come from (it was always
    # all_samps[0] before). x is the array of bin edges returned by plt.hist.
    n, x, _ = plt.hist(all_samps[i], bins=bins,
                       histtype=u'step', density=True)
    all_x.append(x)
    all_density.append(density)

In [None]:
# Persist the bin grids to disk; the context manager closes the file as soon as the dump is done.
with open("all_x.p", "wb") as f:
    pickle.dump(all_x, f)

In [None]:
# Inspect the bin edges stored for curve 6.
all_x[6]

In [None]:
# Evaluate the KDE of curve 6 at the single point 0.936.
all_density[6](0.936)

In [None]:
# Overwrites etas[6] in place with a 3-decimal value, presumably so the legend label
# built from etas[i] in the plot stays short (TODO confirm). NOTE: this mutates etas, so any
# later use of etas[6] sees 0.292 rather than the value computed from log_etas.
etas[6] = 0.292

In [None]:
# Typeset all figure text with LaTeX, in a serif font.
plt.rcParams["text.usetex"] = True
plt.rcParams["font.family"] = "serif"

In [None]:
plt.figure(figsize=(8, 6), dpi=300)
# Raw strings keep the LaTeX backslashes (\eta, \widehat) from being read as Python escape sequences.
plt.title(r"Effect of $\eta$ on variance of $\widehat{MMD}^2$ distribution")

# pyplot.get_cmap replaces matplotlib.cm.get_cmap, which recent matplotlib releases no longer provide.
spectral = plt.get_cmap('Spectral')

for i in range(num_curves):
    # Only three of the curves are drawn: the first, curve 6 and the last of the ten.
    if i not in (0, 6, 9):
        continue
    x = all_x[i]
    density = all_density[i]
    # x is an array of bin edges; the KDE is evaluated and plotted on exactly that grid.
    plt.plot(x, density(x), label=r"$\eta = {}$".format(etas[i]), color=spectral(i*0.1), linewidth=2)

# Axis decoration only needs to be set once, after all curves have been added.
plt.legend()
plt.ylabel("Unnormalized density")
plt.xlabel("$v(X)$")
plt.xlim(left=0.9325, right=0.9425499439239502)

In [None]:
# Value on the first 4000 points of X and Y, the same subsets passed to perm_sampling_neg_biased.
neg_mmd_biased(X[:4000], Y[:4000], kernel)

In [None]:
# Test sets of clusters 0 and 1 compared with each other.
neg_mmd_biased(test_sets[0], test_sets[1], kernel)

In [None]:
# Test set 1 compared with itself.
neg_mmd_biased(test_sets[1], test_sets[1], kernel)

In [None]:
# First 100 points of test set 1 compared with the whole of test set 1.
neg_mmd_biased(test_sets[1][:100], test_sets[1], kernel)

In [None]:
# Full X and Y arrays (num_candidate_points points each).
neg_mmd_biased(X, Y, kernel)

In [None]:
# First 1000 points of X and of Y.
neg_mmd_biased(X[:1000], Y[:1000], kernel)

In [None]:
# First 1000 points of Y compared with themselves.
neg_mmd_biased(Y[:1000], Y[:1000], kernel)

In [None]:
# Same inputs as the neg_mmd_biased check on X[:1000], Y[:1000], but passed to the imported utils.mmd.mmd.
mmd(X[:1000], Y[:1000], kernel)

In [None]:
# Was `mmd(Y[1000], Y[], kernel)`, which is a syntax error (`Y[]`) and also passed a single row.
# Compare the first 1000 points of Y with themselves, mirroring the neg_mmd_biased self-check.
mmd(Y[:1000], Y[:1000], kernel)

In [None]:
# Scratch 5x5 table in which every row sums to 1. The literal is only displayed, not assigned to a name.
# NOTE(review): looks like a per-row proportion table (e.g. class proportions); confirm intent.
[[0.2, 0.2, 0.2, 0.2, 0.2],
 [0.2, 0.2, 0.2, 0.2, 0.2],
 [0.6, 0.4, 0.0, 0.0, 0.0],
 [0.0, 0.2, 0.6, 0.2, 0.0],
 [0.0, 0.0, 0.0, 0.4, 0.6]]

In [None]:
# Scratch 5x10 table. Only the first two rows sum to 1; the third sums to 0.8 and the last two
# are all zeros. NOTE(review): looks unfinished; confirm before relying on it.
lol = np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                [0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])

In [None]:
# Row sums of lol (one value per row).
np.sum(lol, axis=1)