In [None]:
import torch
import pandas as pd
import numpy as np
import random 
import math
from tqdm import tqdm
from IPython.display import clear_output
import itertools 
import operator
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, LinearLocator, AutoMinorLocator

import seaborn as sns
from scipy import stats
from copy import deepcopy
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from statsmodels.stats.multitest import multipletests


### Description:
This notebook contains code to test the similarity of distance matrices. We test whether the embedding matrices produced with different subsets of the data are statistically similar.

In [None]:
## Analysis configuration: output version, subset versions to compare, and input paths.
## NOTE(review): this header previously said "version 1.33" while `v` is set to "4.02" — confirm which version is intended.
v = "4.02"#"1.33"
subset_versions = ["9.0" , "9.11", "9.21"]
# Directory (relative to the notebook) where figures are saved.
save_path = r"../%s/" %v
# NOTE(review): `data_path` contains no "%s" placeholder, so `data_path % sv` below raises
# TypeError ("not all arguments converted during string formatting"). Presumably the path
# should embed the subset version — confirm the intended template before running.
data_path = r"../token_embeddings/tensors.tsv"
vocab_path = r"../global_set/result.tsv"
# One embedding file per subset version.
paths = [data_path %sv for sv in subset_versions]

# Vocabulary table indexed by its "ID" column; its row count is used when centering the embeddings.
vocab = pd.read_csv(vocab_path, sep="\t").set_index("ID")


In [None]:
def upper(df):
    '''Returns the upper triangle of a square matrix, excluding the diagonal.

    You can use scipy.spatial.distance.squareform to recreate the matrix
    from the returned values.

    Args:
      df: pandas DataFrame or numpy ndarray holding a square
          (e.g. correlation or distance) matrix. Subclasses are accepted.
    Returns:
      1-D numpy array with the values strictly above the diagonal,
      in row-major order.
    Raises:
      TypeError: if `df` is neither a np.ndarray nor a pd.DataFrame.
    '''
    # Explicit isinstance checks instead of `assert` inside a bare `except`:
    # asserts are stripped under `python -O`, which silently disabled the
    # validation, and the bare `except` could mask unrelated errors.
    if isinstance(df, pd.DataFrame):
        df = df.values
    elif not isinstance(df, np.ndarray):
        raise TypeError('Must be np.ndarray or pd.DataFrame')
    # k=1 skips the main diagonal.
    mask = np.triu_indices(df.shape[0], k=1)
    return df[mask]

In [None]:
###
def get_random_pairs(numbers:list, num_pairs: int, seed: int = 0):
    """Return a reproducible random sample of unordered pairs from `numbers`.

    All 2-element combinations of `numbers` are enumerated, shuffled with a
    dedicated random generator and truncated to `num_pairs` entries.

    Args:
      numbers: items to pair up.
      num_pairs: maximum number of pairs to return; fewer are returned when
        `numbers` yields fewer combinations.
      seed: seed of the private generator (the default 0 reproduces the
        historical output of this function).
    Returns:
      list of 2-tuples.
    """
    # A private Random instance yields the same shuffle as
    # `random.seed(seed); random.shuffle(...)` without resetting the global
    # `random` state that other code may rely on.
    rng = random.Random(seed)
    pairs = list(itertools.combinations(numbers, 2))
    rng.shuffle(pairs)
    return pairs[:num_pairs]

In [None]:
### cosine distances
def dot_product2(v1, v2):
    """Sum of the element-wise products of two sequences.

    Elements are paired positionally; extra elements of the longer input are ignored.
    """
    return sum(a * b for a, b in zip(v1, v2))


def vector_cos(v1, v2):
    """Cosine of the angle between `v1` and `v2`.

    Computed as dot(v1, v2) / (|v1| * |v2|), i.e. the cosine *similarity*
    (1 for parallel vectors, 0 for orthogonal ones), not 1 - similarity.
    """
    norm_product = math.sqrt(dot_product2(v1, v1)) * math.sqrt(dot_product2(v2, v2))
    return dot_product2(v1, v2) / norm_product

In [None]:
def pairwise_distances(x, pairs):
    """Compute per-pair Euclidean distances and cosine values.

    Args:
      x: indexable collection of vectors (indexed by the ids stored in `pairs`).
      pairs: iterable of index pairs; only the first two entries of each are used.
    Returns:
      (euclidean_dist, cosine_dist): two lists aligned with `pairs`. The second
      list holds the values returned by `vector_cos` (the cosine of the angle
      between the two vectors).
    """
    euclidean_dist, cosine_dist = [], []
    for pair in pairs:
        first, second = x[pair[0]], x[pair[1]]
        euclidean_dist.append(np.linalg.norm(first - second))
        cosine_dist.append(vector_cos(first, second))
    return euclidean_dist, cosine_dist
    

In [None]:
vocab_size = 2043
# Token ids below 7 are left out of the sampling — presumably reserved/special tokens; TODO confirm.
candidate_ids = list(range(7, vocab_size))
pairs = get_random_pairs(candidate_ids, num_pairs=10000)

In [None]:
# Seed NumPy's global generator so the permuted and random baselines are reproducible.
np.random.seed(0)
embeddings = list()
for path in paths:
    e = pd.read_csv(path, sep="\t", header=None).values
    # Center every dimension. NOTE(review): the divisor is vocab.shape[0] - 6 rather than
    # the number of rows in `e` — presumably 6 rows do not contribute to the sum; confirm.
    mu = e.sum(0) / (vocab.shape[0] - 6)
    e= e - mu
    embeddings.append(e)

# Baseline 1: shuffle each dimension of the first embedding independently, which keeps the
# per-dimension value distribution but breaks the association between rows.
permuted = deepcopy(embeddings[0])
# Iterate over the columns of `permuted` itself instead of relying on the loop variable `e`
# leaking out of the loading loop above.
for i in range(permuted.shape[1]):
    permuted[:,i] = np.random.permutation(permuted[:,i])
# Baseline 2: Gaussian noise matching the overall mean and standard deviation of `permuted`.
randome = np.random.normal(permuted.mean(), permuted.std(), size=permuted.shape)

## Robustness

In [None]:
# Euclidean / cosine values over the same sampled pairs for the three embedding sets,
# the Gaussian baseline (r) and the column-permuted baseline (p).
(
    (eucl_1, cos_1),
    (eucl_2, cos_2),
    (eucl_3, cos_3),
    (eucl_r, cos_r),
    (eucl_p, cos_p),
) = [
    pairwise_distances(matrix, pairs)
    for matrix in (embeddings[0], embeddings[1], embeddings[2], randome, permuted)
]

In [None]:
# Number of sampled pairs (taken from the front of each distance list) used for the
# scatter plots and for the linear fits.
max_points = 500

In [None]:
def outlier_detection(coefs, x, y, max_accept_deviation: int = 1):
    """Flag points that deviate too much from a fitted polynomial.

    Args:
      coefs: polynomial coefficients, highest power first (as accepted by np.poly1d).
      x: abscissae at which the polynomial is evaluated.
      y: observed values to compare against the polynomial.
      max_accept_deviation: largest absolute residual that is still accepted.
    Returns:
      Boolean result of `|y - p(x)| > max_accept_deviation` (strict inequality).
    """
    predicted = np.poly1d(coefs)(x)
    deviation = np.abs(y - predicted)
    return deviation > max_accept_deviation

In [None]:
### Linear fits (degree 1) on the first `max_points` sampled pairs
# Set 1 -> Set 2
coef_1 = np.polyfit(eucl_1[:max_points], eucl_2[:max_points], 1)
poly_1_fn = np.poly1d(coef_1)
# Set 1 -> Set 3. The polynomial must be built from coef_2; building it from coef_1
# would draw the Set 1 vs Set 2 fit twice.
coef_2 = np.polyfit(eucl_1[:max_points], eucl_3[:max_points], 1)
poly_2_fn = np.poly1d(coef_2)

##### Two-panel figure: distance comparison (left) and distance distributions (right)
fig, ax = plt.subplots(1, 2, figsize=(20, 7))
scatter_ax, hist_ax = ax

# --- Left panel: Set 1 distances against the other sets ---
scatter_ax.set_title("Pairwise Euclidean Distances (distance comparison)")
scatter_ax.set_xlabel("Distance (Set X)")
scatter_ax.set_ylabel("Distance (Set Y)")
eucl_1_head = eucl_1[:max_points]
for other_set in (eucl_2, eucl_3):
    scatter_ax.scatter(eucl_1_head, other_set[:max_points], s=5)
scatter_ax.scatter(eucl_1_head, eucl_p[:max_points], s=5, alpha=0.3, marker="*")
# The legend is created before the fit lines are drawn so the labels map to the scatters.
scatter_ax.legend(["Set 1 vs Set 2", "Set 1 vs Set 3", "Set 1 vs Permuted"])

for fit_fn in (poly_1_fn, poly_2_fn):
    scatter_ax.plot(eucl_1, fit_fn(eucl_1), linestyle="dashed", alpha=0.5)
scatter_ax.axis("scaled")

for tick_kind, tick_len in (("major", 6), ("minor", 3)):
    scatter_ax.tick_params(axis="both", which=tick_kind, width=1, length=tick_len,
                           direction="out", color="gray")
for scatter_axis in (scatter_ax.yaxis, scatter_ax.xaxis):
    scatter_axis.set_major_locator(MultipleLocator(0.5))
    scatter_axis.set_minor_locator(AutoMinorLocator(5))

# --- Right panel: normalized histograms of the distances ---
hist_ax.set_title("Distribution of Pairwise Euclidean Distances")
hist_ax.set_xlabel("Euclidean Distance")
hist_ax.set_ylabel("Frequency")
bins = np.linspace(start=0, stop=12, num=50)

hist_series = (
    (eucl_r, "red"),
    (eucl_p, "orange"),
    (eucl_1, "gray"),
    (eucl_2, "green"),
    (eucl_3, "blue"),
)
# Translucent fills first, then solid outlines on top (same drawing order for both passes).
for dist_values, dist_color in hist_series:
    hist_ax.hist(dist_values, density=True, bins=bins, histtype="stepfilled",
                 alpha=0.3, color=dist_color)
for dist_values, dist_color in hist_series:
    hist_ax.hist(dist_values, density=True, bins=bins, histtype="step",
                 linewidth=2.5, color=dist_color)

hist_ax.legend(["Random", "Permuted", "Set 1", "Set 2", "Set 3"])

for tick_kind, tick_len in (("major", 6), ("minor", 3)):
    hist_ax.tick_params(axis="both", which=tick_kind, width=1, length=tick_len,
                        direction="out", color="gray")

hist_ax.yaxis.set_major_locator(MultipleLocator(0.5))
hist_ax.yaxis.set_minor_locator(AutoMinorLocator(5))

hist_ax.xaxis.set_major_locator(MultipleLocator(1))
hist_ax.xaxis.set_minor_locator(AutoMinorLocator(5))

plt.tight_layout()
sns.despine()
plt.savefig(save_path + "/life_emb_pairwise_euclidean.svg", format="svg")
plt.show()

In [None]:
### Linear fits (degree 1) between the cosine value lists, using the first `max_points` pairs
cos_1_head = cos_1[:max_points]
cos_2_head = cos_2[:max_points]
cos_3_head = cos_3[:max_points]
cos_p_head = cos_p[:max_points]

coef_1 = np.polyfit(cos_1_head, cos_2_head, 1)  # Set 1 vs Set 2
coef_2 = np.polyfit(cos_1_head, cos_3_head, 1)  # Set 1 vs Set 3
coef_3 = np.polyfit(cos_2_head, cos_3_head, 1)  # Set 2 vs Set 3
coef_4 = np.polyfit(cos_1_head, cos_p_head, 1)  # Set 1 vs Permuted

poly_1_fn, poly_2_fn, poly_3_fn, poly_4_fn = (
    np.poly1d(fit_coefs) for fit_coefs in (coef_1, coef_2, coef_3, coef_4)
)


# Two-panel figure: cosine value comparison between sets (left) and their distributions (right).
# NOTE(review): the cos_* lists are produced with vector_cos, which returns
# dot(a, b) / (|a| * |b|) (cosine similarity), while the labels below say "distance" —
# confirm the intended wording.
fig, ax = plt.subplots(1,2, figsize=(20,7))
ax[0].set_title("Pairwise Cosine Distances (distance comparison)")
ax[0].set_xlabel("Distance (Set X)")
ax[0].set_ylabel("Distance (Set Y)")
ax[0].scatter(cos_1[:max_points], cos_2[:max_points], marker=".", s=5)
ax[0].scatter(cos_1[:max_points], cos_3[:max_points], marker=".", s=5)
ax[0].scatter(cos_2[:max_points], cos_3[:max_points], marker=".", s=5)
ax[0].scatter(cos_1[:max_points], cos_p[:max_points], alpha=0.3, s=5, marker=".", color="grey")
# Attach the legend to the scatter panel explicitly: after plt.subplots(1, 2) the current
# axes is the last one created (ax[1]), so plt.legend(...) would target the empty histogram
# panel and leave this panel without a legend. It is created before the fit lines are drawn
# so that the labels map to the four scatters.
ax[0].legend(["Set 1 vs Set 2", "Set 1 vs Set 3", "Set 2 vs Set 3", "Set 1 vs Permuted"])


# Fitted lines, evaluated over all sampled pairs.
ax[0].plot(cos_1, poly_1_fn(cos_1), linestyle="dashed", alpha = 0.2)
ax[0].plot(cos_1, poly_2_fn(cos_1), linestyle="dashed", alpha = 0.2)
ax[0].plot(cos_2, poly_3_fn(cos_2), linestyle="dashed", alpha = 0.2)
ax[0].plot(cos_1, poly_4_fn(cos_1), linestyle="dashed", alpha = 0.2)
ax[0].axis("scaled")


ax[1].set_title("Distribution of Pairwise Cosine Distances")
ax[1].set_xlabel("Pairwise Distance")
ax[1].set_ylabel("Distribution")

ax[0].tick_params(axis= "both", which="major", width=1, length = 6, direction="out", color="gray")
ax[0].tick_params(axis= "both", which="minor", width=1, length =3, direction="out", color="gray")

ax[0].yaxis.set_major_locator(MultipleLocator(0.1))
ax[0].yaxis.set_minor_locator(AutoMinorLocator(5))

ax[0].xaxis.set_major_locator(MultipleLocator(0.1))
ax[0].xaxis.set_minor_locator(AutoMinorLocator(5))

bins = np.linspace(start=-0.5, stop=1, num=50)

# Translucent fills first ...
ax[1].hist(cos_r, density = True, bins=bins, histtype="stepfilled", alpha=0.3, color="red")
ax[1].hist(cos_p,  density = True, bins=bins, histtype="stepfilled", alpha= 0.3, color="orange")
ax[1].hist(cos_1, density = True, bins=bins, histtype="stepfilled", alpha=0.3, color="gray")
ax[1].hist(cos_2, density = True, bins=bins, histtype="stepfilled", alpha=0.3, color="green")
ax[1].hist(cos_3, density = True, bins=bins, histtype="stepfilled", alpha=0.3, color="blue")


# ... then solid outlines of the same histograms on top.
ax[1].hist(cos_r, density = True, bins=bins, histtype="step", linewidth=2.5, color="red")
ax[1].hist(cos_p,  density = True, bins=bins, histtype="step", linewidth=2.5, color="orange")
ax[1].hist(cos_1, density = True, bins=bins, histtype="step", linewidth=2.5, color="gray")
ax[1].hist(cos_2, density = True, bins=bins, histtype="step", linewidth=2.5, color="green")
ax[1].hist(cos_3, density = True, bins=bins, histtype="step", linewidth=2.5, color="blue")

ax[1].legend(["Random Embedding", "Permuted Embedding", "Set 1", "Set 2", "Set 3"])

ax[1].tick_params(axis= "both", which="major", width=1, length = 6, direction="out", color="gray")
ax[1].tick_params(axis= "both", which="minor", width=1, length =3, direction="out", color="gray")

ax[1].yaxis.set_major_locator(MultipleLocator(1))
ax[1].yaxis.set_minor_locator(AutoMinorLocator(5))

ax[1].xaxis.set_major_locator(MultipleLocator(0.2))
ax[1].xaxis.set_minor_locator(AutoMinorLocator(5))

plt.tight_layout()
sns.despine()
plt.savefig(save_path + "/life_emb_pairwise_cosine.svg", format="svg")
plt.show()

### Permutation Test (Statistical Significance)
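Comparing: c1 vs. c2, c3, cr, cp (full cosine-distance matrices of embedding sets 1–3, the random baseline and the permuted baseline).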

In [None]:
# Full pairwise cosine-distance matrices: the three embedding sets, then the permuted (cp)
# and random (cr) baselines.
c1, c2, c3, cp, cr = (
    cosine_distances(matrix)
    for matrix in (embeddings[0], embeddings[1], embeddings[2], permuted, randome)
)

In [None]:
# Rank correlation between the upper triangles of the distance matrices:
# Set 1 vs Set 2, and Set 1 vs the permuted baseline.
spearman_1_vs_2 = stats.spearmanr(upper(c1), upper(c2))
spearman_1_vs_p = stats.spearmanr(upper(c1), upper(cp))
spearman_1_vs_2, spearman_1_vs_p

In [None]:
def permutation_test(a, b, n_iter: int = 5000, seed: int = 0):
    """Monte Carlo matrix-permutation test on the Spearman correlation of two matrices.

    The observed statistic is the Spearman rho between the upper triangles of
    `a` and `b`. The null distribution is built by applying the same random
    permutation to the rows and columns of `a` and recomputing rho against the
    unchanged upper triangle of `b`.

    Args:
      a, b: square matrices (anything pd.DataFrame accepts) of equal size.
      n_iter: number of random permutations.
      seed: seed of the private random generator (the default 0 reproduces the
        permutation sequence this function has always used).
    Returns:
      Two-tailed p-value, (#{|rho_perm| >= |rho_obs|} + 1) / (n_iter + 1).
    """
    m1 = pd.DataFrame(a)
    m2 = pd.DataFrame(b)
    # A private RandomState yields the same permutation stream as
    # `np.random.seed(seed)` followed by `np.random.shuffle`, without resetting
    # NumPy's global generator as a hidden side effect of every call.
    rng = np.random.RandomState(seed)
    rhos = []
    true_rho, _ = stats.spearmanr(upper(m1), upper(m2))
    # matrix permutation, shuffle the groups
    m_ids = list(m1.columns)
    m2_v = upper(m2)  # loop-invariant: only m1 is permuted
    for i in tqdm(range(n_iter)):
        rng.shuffle(m_ids)  # in-place; permutations are cumulative across iterations
        r, _ = stats.spearmanr(upper(m1.loc[m_ids, m_ids]), m2_v)
        rhos.append(r)
    # +1 in numerator and denominator counts the observed statistic itself,
    # so the estimate can never be exactly zero.
    return ((np.sum(np.abs(true_rho) <= np.abs(rhos)))+1)/(n_iter+1) # two-tailed test

In [None]:
# Run the five permutation tests in sequence. While a test runs, the output shows the
# p-values obtained so far followed by the label of the running comparison.
comparisons = [
    ("1 vs 2", c1, c2),
    ("1 vs 3", c1, c3),
    ("2 vs 3", c2, c3),
    ("1 vs R", c1, cr),
    ("1 vs P", c1, cp),
]
p_so_far = []
clear_output()
for step, (comparison_label, left_matrix, right_matrix) in enumerate(comparisons):
    print(comparison_label)
    p_so_far.append(permutation_test(left_matrix, right_matrix))
    # The output is not cleared after the last test, so its label stays visible.
    if step < len(comparisons) - 1:
        clear_output()
    print(*p_so_far)
p12, p13, p23, p1r, p1p = p_so_far

In [None]:
# Raw permutation-test p-values, in the order: 1 vs 2, 1 vs 3, 2 vs 3, 1 vs R, 1 vs P.
p_vals = [p12, p13, p23, p1r, p1p]
p_vals

In [None]:
# Benjamini-Hochberg FDR correction across the five comparisons; the printed p-values
# are the corrected ones.
labels = ["1 vs 2", "1 vs 3", "2 vs 3", "1 vs R", "1 vs P"]
reject, p_corr, alpha_sidak, alpha_bf = multipletests(p_vals, method="fdr_bh")
for comparison_label, p_adjusted, is_rejected in zip(labels, p_corr, reject):
    print("%s (p = %.4f) || Reject: %s" % (comparison_label, p_adjusted, is_rejected))