In [1]:
import pandas as pd
import numpy
import torch
import numpy as np
from tqdm import tqdm

In [2]:
# Load the interim parquet splits. Train and validation each come as an
# X/y pair; the test split only has an X frame.
train_X, train_y = (
    pd.read_parquet('../data/interim/train_X.pq'),
    pd.read_parquet('../data/interim/train_y.pq'),
)
val_X, val_y = (
    pd.read_parquet('../data/interim/val_X.pq'),
    pd.read_parquet('../data/interim/val_y.pq'),
)
test_X = pd.read_parquet('../data/interim/test_X.pq')

In [3]:
# Preview the boolean column mask: True where the column name contains a
# match for the regex 'ft.+_0' (the pattern is unanchored, so it can match
# anywhere in the name).
# NOTE(review): presumably this is meant to pick the "index 0" feature
# columns only -- confirm the pattern cannot also hit names like 'ft1_01'.
train_X.columns.str.contains('ft.+_0')

array([False, False,  True, ..., False, False, False])

In [4]:
# Stack the train and validation rows selected by each split's `defense`
# label column, keeping only the columns whose name matches 'ft.+_0'.
# NOTE(review): assumes `defense` is a boolean column aligned with the
# feature frame's index -- confirm against the label files.
defense_train_val = pd.concat([
    features.loc[labels['defense'], features.columns.str.contains('ft.+_0')]
    for features, labels in ((train_X, train_y), (val_X, val_y))
])

In [5]:
# Release the references to the full train/validation frames and labels;
# the later cells in this notebook only use defense_train_val.
del train_X, train_y, val_X, val_y

In [6]:
# Keep every test row but only the columns whose name matches 'ft.+_0',
# i.e. the same column filter that was applied to the train/val frames.
center_test_X = test_X.loc[:, test_X.columns.str.contains('ft.+_0')]

In [7]:
# Release the reference to the full test frame; center_test_X is what the
# later cells use.
del test_X

In [8]:
def sim_matrix(a, b, eps=1e-8):
    """Pairwise cosine similarity between the rows of two 2-D tensors.

    Args:
        a: tensor of shape (n, d).
        b: tensor of shape (m, d).
        eps: lower bound applied to each row norm before dividing, so rows
            with (near-)zero norm do not cause a division by zero. Such rows
            end up with a similarity of 0 to everything.

    Returns:
        Tensor of shape (n, m); entry [i, j] is the cosine similarity
        between a[i] and b[j].
    """
    def _unit_rows(mat):
        # L2 norm per row, kept as a column so it broadcasts across features.
        row_norms = mat.norm(dim=1)[:, None]
        return mat / torch.clamp(row_norms, min=eps)

    return torch.mm(_unit_rows(a), _unit_rows(b).transpose(0, 1))

In [9]:
# For every test row, find the highest cosine similarity to any "defense"
# train/val row.
#
# The defense rows are processed in chunks and only a running element-wise
# maximum is kept on the device. The full (n_defense x n_test) similarity
# matrix is never materialised or copied to the host: the rest of this
# notebook only needs its column-wise maximum, and max is associative, so
# the result is identical to concatenating every chunk and reducing once.
chunk_size = 1_000
# Fall back to the CPU so the cell still runs on a machine without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

defense_train_val_np = defense_train_val.to_numpy()
n_defense_rows = defense_train_val_np.shape[0]
if n_defense_rows == 0:
    # Without any reference rows there is no maximum to report.
    raise ValueError('defense_train_val is empty: nothing to compare the test rows against')

# The test matrix is moved to the device once and reused for every chunk.
B = torch.from_numpy(center_test_X.to_numpy()).to(device)

running_max = None
for start in tqdm(range(0, n_defense_rows, chunk_size), position=0):
    # Slicing past the end is safe, so the last chunk is simply shorter.
    A = torch.from_numpy(defense_train_val_np[start:start + chunk_size, :]).to(device)
    # sim_matrix (defined in an earlier cell) returns (chunk rows, test rows);
    # reducing over dim 0 leaves one value per test row.
    chunk_max = sim_matrix(A, B).max(dim=0).values
    running_max = chunk_max if running_max is None else torch.max(running_max, chunk_max)

# One value per test row, in the same order as center_test_X.
test_max_similarity = running_max.cpu().numpy()

In [13]:
# Sanity check: there should be exactly one maximum per test row, i.e. the
# displayed length should equal center_test_X.shape[0].
len(test_max_similarity)

176371

In [14]:
# Wrap the per-test-row maxima in a single-column frame that carries the
# same index as the test features, so each score stays tied to its row.
out_df = pd.DataFrame(
    {'cosine_similarity': test_max_similarity},
    index=center_test_X.index,
)

In [15]:
# Write the single-column similarity frame (indexed like the test features)
# to the interim data directory as parquet.
out_df.to_parquet('../data/interim/test_predictions_cosine_similarity.pq')