In [None]:
# Mount Google Drive at /content/drive; the data and output paths used by the
# later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install RAPIDS by cloning rapidsai-csp-utils and running its Colab pip installer.
# Colab warns and provides remediation steps if the GPU is not compatible with RAPIDS.

!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [None]:
import math
import os
import sys
import time
from itertools import product

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy
import numpy as np
import scipy
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from matplotlib import cm
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.cluster.hierarchy import dendrogram, leaves_list
from scipy.spatial.distance import pdist
from sklearn.manifold import TSNE, MDS
# import umap

# RAPIDS GPU libraries (installed by the previous cell); kept after torch as before.
import cupy as cp
import cuml

In [None]:
from cuml.kernel_ridge import KernelRidge

# Make the project helper module stored on Drive importable.
# NOTE(review): the star import hides which names come from
# distance_functions_final; prefer explicit imports once the used names are known.
sys.path.append(os.path.abspath("/content/drive/MyDrive/UKP/"))
from distance_functions_final import *

# Run torch tensors on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Regularisation grid: powers of ten from 1e-7 to 1e1, followed by 0.
lmbda_range = np.concatenate((np.power(10.0, range(-7, 2)), [0]))
# Kernel-width grid: powers of ten from 1e-3 to 1e2.
sigma_range = np.power(10.0, range(-3, 3))

import scipy.stats

folder = "/content/drive/MyDrive/UKP/mnist_experiments/distances/widthdepth/5000_train/"
folder_categorized = "/content/drive/MyDrive/UKP/mnist_experiments/distances/widthdepth/5000_train_categorized"

# Open one pairwise distance file; the names of its entries are the distance
# names used throughout the rest of the notebook.
data = np.load(f"{folder}/width100_depth2_seed2_rep_width100_depth4_seed2_rep.npz")
distnames = data.files
print(distnames)

# Each distance was extracted into its own array ahead of time.
## This was done on Google Colab with the notebook "separating distance into separate files.ipynb" under mnist_experiments.

# Folders holding the saved representations (train and test).
rep_folder_prefix = '/content/drive/MyDrive/UKP/mnist_experiments/'
train_reps_folder = rep_folder_prefix + 'reps/train/5000_eval/'
val_reps_folder = rep_folder_prefix + 'reps/test/5000_eval/'

# Model names are taken from the files in the train representation folder.
# The listing used to point at ".../UKP-/..." while every other path (including
# the folder the very same files are loaded from below) uses ".../UKP/..."; list
# from the folder we actually load from so each listed model is loadable.
reps_folder = train_reps_folder
filenames = os.listdir(reps_folder)

# Keep seed-2 models only and skip saved checkpoints and depths 3, 6 and 8.
# (The depth-8 test previously read `and "depth8"`, which is always true, so
# depth-8 models were never filtered out.)
excluded_tags = ("saved", "depth3", "depth6", "depth8")
model_names = []
for filename in filenames:
    if "seed2" in filename and not any(tag in filename for tag in excluded_tags):
        # Drop the 4-character extension (".npy") to get the model name.
        model_names.append(filename[:-4])

model_names = np.sort(model_names)
num_models = len(model_names)

# Pre-computed distances, one entry per distance name.
dist_array = np.load(f'{folder_categorized}/all_distances_categorized.npz')

# Number of samples (columns) of each representation to keep.
dist_n = 5000


def _load_normalized_rep(path, num_samples):
    """Load one representation matrix and normalise it.

    The array at ``path`` is truncated to its first ``num_samples`` columns,
    each row has its mean (taken across columns) subtracted, and the result is
    rescaled so that its Frobenius norm equals ``sqrt(number of columns)``.

    Assumes the stored array is (feature dimension x number of datapoints),
    as the regression helpers in this notebook do.
    """
    rep = np.load(path)
    rep = rep[:, :num_samples]
    rep = rep - rep.mean(axis=1, keepdims=True)
    rep = rep / np.linalg.norm(rep)
    return rep * np.sqrt(rep.shape[1])


# Load the MNIST representations of every selected model.
reps_train = {}  # Train dataset
reps_test = {}  # Test dataset
try:
    for model_name in model_names:
        print(model_name)
        reps_train[model_name] = _load_normalized_rep(train_reps_folder + model_name + ".npy", dist_n)
        reps_test[model_name] = _load_normalized_rep(val_reps_folder + model_name + ".npy", dist_n)
except FileNotFoundError:
    print('WARNING: IN ORDER TO RUN THIS CODE, THE MNIST REPRESENTATIONS MUST BE COMPUTED. SEE README.')
    raise



In [None]:
def find_best_pred(y, lmbda, sigma, reps, kernelstr):
    """Fit a kernel ridge regressor on one model's representations.

    Parameters
    ----------
    y : targets, one row per datapoint.
    lmbda : ridge regularisation strength (passed as ``alpha``).
    sigma : kernel width.
    reps : representation matrix, assumed (dimension x number of datapoints);
        it is transposed before fitting.
    kernelstr : ``'lap'`` for ``CustomKernelRidge`` or ``'rbf'`` for the
        ``KernelRidge`` RBF kernel.

    Returns
    -------
    The fitted regressor.

    Raises
    ------
    ValueError
        If ``kernelstr`` is neither ``'lap'`` nor ``'rbf'`` (previously this
        surfaced as an UnboundLocalError on ``krr``).
    """
    if kernelstr == 'lap':
        krr = CustomKernelRidge(alpha=lmbda, sigma=sigma)
    elif kernelstr == 'rbf':
        # NOTE(review): this is 1 / (4 * sigma**2); the usual RBF convention is
        # 1 / (2 * sigma**2). Kept as is; confirm which one is intended.
        gamma = 1 / ((2 * sigma) ** 2)
        krr = KernelRidge(alpha=lmbda, kernel='rbf', gamma=gamma)
    else:
        raise ValueError(f"Unknown kernelstr {kernelstr!r}; expected 'lap' or 'rbf'.")

    krr.fit(reps.T, y)
    return krr


def find_best_lin_pred(y, lmbda, reps):
    """Solve the linear ridge system for one representation matrix.

    ``reps`` is assumed to be (dimension x number of datapoints). Returns the
    solution ``w`` of ``(lmbda * I + reps @ reps.T / n) w = reps @ y``, where
    ``n`` is the number of datapoints.
    """
    dim, n_points = reps.shape[0], reps.shape[1]

    # Regularised, sample-averaged second-moment matrix of the representation.
    system = lmbda * np.eye(dim) + (reps @ reps.T) / n_points
    rhs = reps @ y

    return np.linalg.solve(system, rhs)


def symmetrize(A):
    """Return a copy of ``A`` whose lower triangle mirrors its upper triangle.

    ``A`` itself is left untouched.
    """
    size = A.shape[0]
    lower_rows, lower_cols = np.tril_indices(size)

    mirrored = A.copy()
    # Entry (i, j) of the lower triangle takes the value found at (j, i).
    mirrored[lower_rows, lower_cols] = A[lower_cols, lower_rows]
    return mirrored

def dist_from_upper_tri_vec(vec, num_models):
    """Rebuild a symmetric (num_models x num_models) matrix from ``vec``.

    ``vec`` supplies the strictly-upper-triangular entries in the order given
    by ``np.triu_indices(num_models, k=1)``; the diagonal is left at zero and
    the lower triangle is filled in by ``symmetrize``.
    """
    upper_rows, upper_cols = np.triu_indices(num_models, k=1)

    full = np.zeros((num_models, num_models))
    full[upper_rows, upper_cols] = vec

    return symmetrize(full)

def flatten_upper_right_triangle(curr_mat):
    """Return the strictly-upper-triangular entries of ``curr_mat`` as a 1-D array.

    Entries are read row by row (``(0, 1), (0, 2), ..., (1, 2), ...``), which is
    the order of ``np.triu_indices(n, k=1)`` and therefore the same order that
    ``dist_from_upper_tri_vec`` uses when it fills a matrix from a vector.

    ``curr_mat`` must be square with one row per entry of ``model_names``.
    """
    assert (curr_mat.shape[0] == curr_mat.shape[1])
    assert (curr_mat.shape[0] == len(model_names))

    # Vectorised replacement for the nested index loop.
    rows, cols = np.triu_indices(curr_mat.shape[0], k=1)
    return np.asarray(curr_mat[rows, cols])

def laplace_kernel(X, Y, sigma = 1.0):
    """Laplacian kernel matrix ``exp(-||x - y||_1 / sigma)`` between rows of X and Y.

    Parameters
    ----------
    X : array with ``n`` entries along its first axis.
    Y : array with ``m`` entries along its first axis.
    sigma : kernel width.

    Returns
    -------
    An ``(n, m)`` array.

    For 1-D inputs this is the element-wise ``exp(-|x_i - y_j| / sigma)`` as
    before. For inputs with feature axes, the absolute differences are now
    summed over those axes; previously they were left un-reduced, so the result
    was an ``(n, m, d)`` tensor rather than a kernel matrix.
    """
    abs_diff = np.abs(X[:, np.newaxis] - Y[np.newaxis, :])
    if abs_diff.ndim > 2:
        # Collapse every feature axis into a single L1 distance per (x, y) pair.
        abs_diff = abs_diff.sum(axis=tuple(range(2, abs_diff.ndim)))
    return np.exp(-abs_diff / sigma)

# Expand every stored upper-triangular distance vector into a full symmetric
# matrix, keyed by distance name, printing each one as it is built.
distances = {}
for i, distname in enumerate(distnames):
    print(distname)
    dist_matrix = dist_from_upper_tri_vec(dist_array[distname], num_models)
    distances[distname] = dist_matrix
    print(dist_matrix)

### Up to this point: okay
def get_collected_correlations_lintasks(lmbda, numtrials=50, numtrainsamples=5000):
    """Correlate prediction disagreement on random linear tasks with each distance.

    For every trial a random target vector is drawn, a linear ridge predictor
    is fitted on each model's train representation, and the norm of the
    difference between the models' test predictions is computed for every pair
    of models. The resulting pairwise vector is Spearman-correlated with each
    stored distance.

    Parameters
    ----------
    lmbda : ridge regularisation strength passed to ``find_best_lin_pred``.
    numtrials : number of random targets to draw.
    numtrainsamples : number of train columns (and target rows) to use.

    Returns
    -------
    labels : distance names (all keys of ``distances`` except
        ``'predictor_dist_range'``).
    collected_correlations : one list per trial holding the Spearman
        correlation for each label, in label order.
    """
    labels = [ky for ky in distances.keys() if ky != 'predictor_dist_range']
    num = len(model_names)

    collected_correlations = []
    for tri in range(numtrials):
        print(f'Trial {tri}')

        y = np.random.randn(numtrainsamples, 1) + 1

        # Fit each model once and evaluate it on its test representation once;
        # the pairwise loop below then only compares these cached predictions
        # instead of recomputing them for every pair.
        test_preds = []
        for model_name in model_names:
            weights = find_best_lin_pred(y, lmbda, reps_train[model_name][:, 0:numtrainsamples])
            test_preds.append(weights.T @ reps_test[model_name])

        # For each pair, the norm of the difference between test predictions.
        errs = np.zeros((num, num))
        for ind1 in range(num - 1):
            for ind2 in range(ind1 + 1, num):
                errs[ind1, ind2] = np.linalg.norm(test_preds[ind1] - test_preds[ind2])
                errs[ind2, ind1] = errs[ind1, ind2]
        err_vec = flatten_upper_right_triangle(errs)

        correlations = [
            scipy.stats.spearmanr(err_vec, dist_array[distname]).correlation
            for distname in labels
        ]
        collected_correlations.append(correlations)

    return labels, collected_correlations



In [None]:
import gc

In [None]:
# Output directory for the per-trial pairwise error vectors.
err_folder = "/content/drive/MyDrive/UKP/mnist_experiments/err_folder/"

In [None]:
def get_collected_errs_kertasks(lmbda, sigma, numtrials=30, numtrainsamples=5000,numtestsamples=5000):
    """Save pairwise prediction-disagreement vectors for random RBF kernel tasks.

    For every trial a random target vector is drawn, an RBF kernel ridge
    regressor is fitted on each model's train representation (via
    ``find_best_pred``), and the norm of the difference between the models'
    test predictions is computed for every pair of models. The flattened upper
    triangle of that matrix is written to ``err_folder`` as one ``.npy`` file
    per trial. Nothing is returned.

    Parameters
    ----------
    lmbda : ridge regularisation strength.
    sigma : kernel width.
    numtrials : number of random targets to draw.
    numtrainsamples : number of train columns (and target rows) to use.
    numtestsamples : number of test columns to predict on.
    """
    num = len(model_names)

    # Create the output directory up front so a missing folder cannot make
    # np.save fail only after a full trial has been computed.
    os.makedirs(err_folder, exist_ok=True)

    for tri in range(numtrials):
        print(f'Trial {tri}')

        y = np.random.randn(numtrainsamples, 1) + 1

        # Fit and predict once per model. Previously each model's test
        # prediction was recomputed for every pair it took part in, and all
        # fitted regressors were kept alive for the whole trial.
        test_preds_RBF = []
        for ind, model_name in enumerate(model_names):
            print(f"Doing i={ind}")
            train_data = torch.tensor(reps_train[model_name][:, 0:numtrainsamples]).to(device)
            krr = find_best_pred(y, lmbda, sigma, train_data, 'rbf')

            test_data = torch.tensor(reps_test[model_name][:, 0:numtestsamples]).T.to(device)
            test_preds_RBF.append(cp.asnumpy(krr.predict(test_data)))

            # Release this model's tensors and regressor before the next one.
            del krr, train_data, test_data
            gc.collect()
            if torch.cuda.is_available():
                # Ensure all GPU operations are done, then clear the cache.
                torch.cuda.synchronize()
                torch.cuda.empty_cache()

        # For each pair, the norm of the difference between test predictions.
        errs_RBF = np.zeros((num, num))
        for ind1 in range(num - 1):
            for ind2 in range(ind1 + 1, num):
                errs_RBF[ind1, ind2] = np.linalg.norm(test_preds_RBF[ind1] - test_preds_RBF[ind2])
                errs_RBF[ind2, ind1] = errs_RBF[ind1, ind2]

        err_RBF_vec = flatten_upper_right_triangle(errs_RBF)

        np.save(f"{err_folder}/{lmbda:7e}_{sigma:7e}_tri{tri}_numtrial{numtrials}_numtrainsamples{numtrainsamples}_err_RBF_vec.npy", err_RBF_vec)
        # Report the trial index; this used to print an unrelated global `i`.
        print(f"Trial {tri} Done")

In [None]:
# Narrow the sweep to two values per hyper-parameter. This replaces the wider
# lmbda_range / sigma_range grids assigned earlier in the notebook.
lmbda_range = np.array([np.power(10.0, exponent) for exponent in (-2, 0)])
sigma_range = np.array([np.power(10.0, exponent) for exponent in (-1, 0)])

In [None]:
# Seed once, then run the kernel-task experiment for every (lmbda, sigma)
# combination, with lmbda as the outer loop.
np.random.seed(42)
for lmbda_value in lmbda_range:
    for sigma_value in sigma_range:
        get_collected_errs_kertasks(
            lmbda_value,
            sigma_value,
            numtrials=30,
            numtrainsamples=5000,
            numtestsamples=5000,
        )