In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from tqdm import tqdm

from collections import Counter
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

from collections import defaultdict

In [2]:
import bz2
import pickle
import _pickle as cPickle

def compressed_pickle(title, data):
    with bz2.BZ2File(title + '.pbz2', 'w') as f:
        cPickle.dump(data, f)
        
def decompress_pickle(file):
    data = bz2.BZ2File(file, 'rb')
    data = cPickle.load(data)
    return data

In [3]:
team_files = [
    "exp16v6AugmixFullB4_fold0.pbz2",
]

In [4]:
FOLD=0

In [5]:
fold2 = pd.read_csv("/kaggle/happywhale/input/whale-21-3-2022/fold2.csv")
fold2.shape

(51033, 8)

In [6]:
Counter(fold2.fold)

Counter({-1: 9258, 1: 8319, 2: 8289, 3: 8374, 4: 8421, 0: 8372})

In [7]:
valid = fold2.query("fold == @FOLD").reset_index()
train = fold2.query("fold != @FOLD").reset_index()

In [8]:
valid.shape,train.shape

((8372, 9), (42661, 9))

In [9]:
valid_ind = (fold2.fold == FOLD).values
train_ind = (fold2.fold != FOLD).values

In [10]:
def get_embeddings(files):
    DIRNAME ="../input/whale-21-3-2022/"
  
    train_emb_lis = []
    test_emb_lis  = []
    
    for fname in files:
        print(fname)
        data = decompress_pickle(DIRNAME + fname)
        
        train_emb = data["emb_val"]#[new_index,:]
        test_emb  = data["emb_test"]

        train_emb_lis.append(train_emb)
        test_emb_lis.append(test_emb)
        
        print("    ",train_emb.shape,test_emb.shape)
    
    ret_train = np.concatenate(train_emb_lis,axis=1)
    ret_test  = np.concatenate(test_emb_lis,axis=1)
        
    return ret_train,ret_test
    

In [11]:
def get_valid_score(distances):
    
    K=1000
    predicted_positions = np.argpartition(distances, K, axis=1)[:, :K]

    
    score_lis = []

    train_inddividual_id_lis = train["individual_id"].values
    valid_inddividual_id_lis = valid["individual_id"].values

    for i in tqdm(range(predicted_positions.shape[0])):
        nearest = [(train_inddividual_id_lis[j], distances[i, j]) for j in predicted_positions[i]]
        nearest.sort(key=lambda x: x[1])

        prediction = [index_id for index_id, d in nearest]

        done = set()
        pred_list = []
        for pred_id in prediction:
            if pred_id in done:
                continue
            done.add(pred_id)
            pred_list.append(pred_id)
            if len(pred_list)==5:
                break

        if valid_inddividual_id_lis[i] in pred_list:
            s = 1/(pred_list.index(valid_inddividual_id_lis[i]) + 1)
            score_lis.append(s)
        else:
            score_lis.append(0)    
            
    return score_lis

### euclidean distance with normalize

In [12]:
# this returns unnormalized embeddings
train_emb, _ = get_embeddings(team_files)

exp16v6AugmixFullB4_fold0.pbz2
     (51033, 1024) (27956, 1024)


In [13]:
train_emb.shape

(51033, 1024)

In [14]:
train_emb=normalize(train_emb)

In [15]:
valid_embeddings = train_emb[valid_ind]
train_embeddings = train_emb[train_ind]

In [16]:
distances = pairwise_distances(valid_embeddings,train_embeddings)

In [17]:
score_lis = get_valid_score(distances)

100%|██████████| 8372/8372 [00:07<00:00, 1138.70it/s]


In [18]:
np.mean(score_lis)

0.8401596591813983

In [19]:
Counter(score_lis)

Counter({1.0: 6764,
         0: 930,
         0.5: 367,
         0.2: 77,
         0.25: 85,
         0.3333333333333333: 149})

## cosine similarity WITHOUT normalize

In [20]:
def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [21]:
# this returns unnormalized embeddings
train_emb, _ = get_embeddings(team_files)

exp16v6AugmixFullB4_fold0.pbz2
     (51033, 1024) (27956, 1024)


In [22]:
valid_embeddings = train_emb[valid_ind]
train_embeddings = train_emb[train_ind]

In [23]:
valid_embeddings.shape,train_embeddings.shape

((8372, 1024), (42661, 1024))

In [24]:
distances = cos_sim(valid_embeddings,train_embeddings.T)

In [25]:
# i want distances to be smaller is near
distances= -distances

In [26]:
score_lis = get_valid_score(distances)

100%|██████████| 8372/8372 [00:07<00:00, 1120.22it/s]


In [27]:
np.mean(score_lis)

0.7861562350692786

In [28]:
Counter(score_lis)

Counter({0.5: 463,
         1.0: 6219,
         0: 1220,
         0.25: 128,
         0.2: 111,
         0.3333333333333333: 231})

## cosine similarity WITH normalize

In [29]:
def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [30]:
# this returns unnormalized embeddings
train_emb, _ = get_embeddings(team_files)

exp16v6AugmixFullB4_fold0.pbz2
     (51033, 1024) (27956, 1024)


In [31]:
train_emb=normalize(train_emb)

In [32]:
valid_embeddings = train_emb[valid_ind]
train_embeddings = train_emb[train_ind]

In [33]:
valid_embeddings.shape,train_embeddings.shape

((8372, 1024), (42661, 1024))

In [34]:
distances = cos_sim(valid_embeddings,train_embeddings.T)

In [35]:
# i want distances to be smaller is near
distances= -distances

In [36]:
score_lis = get_valid_score(distances)

100%|██████████| 8372/8372 [00:07<00:00, 1135.25it/s]


In [37]:
np.mean(score_lis)

0.8401596591813983

In [38]:
Counter(score_lis)

Counter({1.0: 6764,
         0: 930,
         0.5: 367,
         0.2: 77,
         0.25: 85,
         0.3333333333333333: 149})