Majority vote from team's CV

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import glob

from tqdm import tqdm
import seaborn as sns
from collections import Counter

import cv2

In [2]:
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

In [3]:
from collections import Counter
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

In [4]:
from collections import defaultdict

In [5]:
import bz2
import pickle
import _pickle as cPickle

def compressed_pickle(title, data):
    """Pickle ``data`` into a bz2-compressed file named ``title + '.pbz2'``.

    Args:
        title: Output path without the ``.pbz2`` suffix.
        data: Any picklable object.
    """
    out_path = title + '.pbz2'
    with bz2.BZ2File(out_path, 'w') as sink:
        cPickle.dump(data, sink)
        
def decompress_pickle(file):
    """Load and return the object stored in a bz2-compressed pickle file.

    Args:
        file: Path of the bz2-compressed pickle (e.g. a ``.pbz2`` file).

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code; only call this on trusted files.
    """
    # The context manager closes the file handle even if unpickling fails.
    with bz2.BZ2File(file, 'rb') as f:
        return cPickle.load(f)

In [6]:
def get_embeddings(files):
    """Load, normalise and concatenate the stored embeddings of several experiments.

    For every name in ``files`` the file ``<DIR><name>.pbz2`` is read, its
    ``"emb_val"`` and ``"emb_test"`` arrays are normalised with
    ``normalize`` and collected. The collected arrays are then concatenated
    along the feature axis and normalised once more.

    Args:
        files: Iterable of experiment names (file names without ``.pbz2``).

    Returns:
        tuple: ``(ret_train, ret_test)`` built from the ``"emb_val"`` and
        ``"emb_test"`` entries respectively.
    """
    base_dir = "../input/whale-21-3-2022/"
    val_blocks, test_blocks = [], []

    for exp_name in files:
        print(exp_name)
        payload = decompress_pickle(base_dir + exp_name + ".pbz2")

        # Normalise each model's embeddings before they are stacked.
        val_block = normalize(payload["emb_val"])
        test_block = normalize(payload["emb_test"])
        val_blocks.append(val_block)
        test_blocks.append(test_block)

        print("    ", val_block.shape, test_block.shape)

    # Stack along the feature axis, then re-normalise the combined vectors.
    merged_val = normalize(np.concatenate(val_blocks, axis=1))
    merged_test = normalize(np.concatenate(test_blocks, axis=1))
    return merged_val, merged_test
    

In [7]:
def get_valid_score(distances):
    """Score nearest-neighbour predictions for every validation row.

    For each row of ``distances`` the (up to) ``K`` closest training rows are
    looked up, ordered by distance and collapsed to the first five distinct
    ``individual_id`` values. A row scores ``1 / rank`` when the true id of
    the validation row is among those five, otherwise ``0``.

    Relies on the notebook-level ``train`` and ``valid`` frames: column ``j``
    of ``distances`` is matched to ``train`` row ``j`` and row ``i`` to
    ``valid`` row ``i``.

    Args:
        distances: 2-D array, one row per validation sample and one column
            per training sample.

    Returns:
        list: One score per row of ``distances``.
    """
    K=1000
    # np.argpartition requires kth < number of columns; clamp it so that a
    # gallery with K or fewer training rows is handled instead of raising.
    kth = min(K, distances.shape[1] - 1)
    predicted_positions = np.argpartition(distances, kth, axis=1)[:, :K]

    train_inddividual_id_lis = train["individual_id"].values
    valid_inddividual_id_lis = valid["individual_id"].values

    score_lis = []
    for i in tqdm(range(predicted_positions.shape[0])):
        # Order the shortlisted training rows from closest to farthest
        # (stable sort, so ties keep their argpartition order).
        ranked = sorted(predicted_positions[i], key=lambda j: distances[i, j])

        # Keep the first five distinct individuals in distance order.
        seen = set()
        pred_list = []
        for j in ranked:
            pred_id = train_inddividual_id_lis[j]
            if pred_id in seen:
                continue
            seen.add(pred_id)
            pred_list.append(pred_id)
            if len(pred_list) == 5:
                break

        true_id = valid_inddividual_id_lis[i]
        if true_id in pred_list:
            score_lis.append(1 / (pred_list.index(true_id) + 1))
        else:
            score_lis.append(0)

    return score_lis

In [8]:
# Experiment name -> score. Presumably each model's own fold-0 validation
# score (TODO confirm); the cells shown here only use the keys, via `exps`.
# NOTE(review): the two "AugmixB4Step2" entries carry the identical value and
# the "AugmixFullB4" value looks rounded — verify they were not copy-pasted.
d={
    "exp16v4sampler_fold0": 0.8045250039815257,
    "exp16v6AugmixFullB4_fold0": 0.845,
    "exp16v6AugmixB4Step2_fold0": 0.8076166587036152,
    "exp16v6AugmixB4Step2_fold0_remove_background": 0.8076166587036152,
    "exp16v6AugmixB4_fold0": 0.808689679885332,
    "exp16v6Augmix_fold0": 0.8142837235228539,
    "exp16v6sampler_fold0": 0.8119127249562033,
    "exp21b6_fold0": 0.7997491638795986,}

In [9]:
# Experiment names in the insertion order of `d`; get_embeddings reads the
# file "<name>.pbz2" for each of them.
exps = list(d.keys())
exps

['exp16v4sampler_fold0',
 'exp16v6AugmixFullB4_fold0',
 'exp16v6AugmixB4Step2_fold0',
 'exp16v6AugmixB4Step2_fold0_remove_background',
 'exp16v6AugmixB4_fold0',
 'exp16v6Augmix_fold0',
 'exp16v6sampler_fold0',
 'exp21b6_fold0']

In [10]:
# Held-out fold: rows of fold2 with fold == FOLD form the validation set,
# all other rows the training set.
FOLD=0

In [11]:
# Fold assignment table; provides the `fold` and `individual_id` columns used below.
# NOTE(review): this is an absolute path while get_embeddings reads from the
# relative "../input/whale-21-3-2022/" — confirm both resolve to the same dataset.
fold2 = pd.read_csv("/kaggle/happywhale/input/whale-21-3-2022/fold2.csv")
fold2.shape

(51033, 8)

In [12]:
# Split fold2 into the held-out fold (valid) and the remaining rows (train).
# reset_index() without drop=True keeps the original row label as an
# `index` column and gives both frames a fresh 0..n-1 index.
valid = fold2[fold2["fold"] == FOLD].reset_index()
train = fold2[fold2["fold"] != FOLD].reset_index()

In [13]:
# Boolean row masks over fold2 (validation / training rows), used to slice the
# embedding matrix; assumes embedding rows are stored in fold2 row order — TODO confirm.
valid_ind = (fold2.fold == FOLD).values
train_ind = (fold2.fold != FOLD).values

### concat result

In [14]:
# Concatenated embeddings of all experiments; the second return value
# (built from "emb_test") is not needed here and is discarded.
train_emb, _ = get_embeddings(exps)

exp16v4sampler_fold0
     (51033, 512) (27956, 512)
exp16v6AugmixFullB4_fold0
     (51033, 1024) (27956, 1024)
exp16v6AugmixB4Step2_fold0
     (51033, 512) (27956, 512)
exp16v6AugmixB4Step2_fold0_remove_background
     (51033, 512) (27956, 512)
exp16v6AugmixB4_fold0
     (51033, 512) (27956, 512)
exp16v6Augmix_fold0
     (51033, 512) (27956, 512)
exp16v6sampler_fold0
     (51033, 512) (27956, 512)
exp21b6_fold0
     (51033, 512) (27956, 512)


In [15]:
# Split the combined embedding matrix into validation and training rows
# using the fold masks (the "*_preds" names hold embeddings, not predictions).
valid_preds = train_emb[valid_ind]
train_preds = train_emb[train_ind]

In [16]:
# Sanity check: shapes of the training and validation embedding matrices.
train_preds.shape,valid_preds.shape

((42661, 4608), (8372, 4608))

In [17]:
# Distance from every validation embedding (rows) to every training embedding
# (columns), using pairwise_distances' default metric.
distances = pairwise_distances(valid_preds,train_preds)

In [18]:
# Per-validation-row scores for the concatenated-embedding approach.
score_lis = get_valid_score(distances)

100%|██████████| 8372/8372 [00:08<00:00, 994.59it/s] 


In [19]:
# Mean score over all validation rows for the concatenated-embedding approach.
np.mean(score_lis)

0.8523590539894887

### majority vote result

In [20]:
# Label arrays read as globals by do_vote and by the vote-scoring cell:
# position j / i holds the individual_id of train row j / valid row i.
train_inddividual_id_lis = train["individual_id"].values
valid_inddividual_id_lis = valid["individual_id"].values
def do_vote(exps, vote_result):
    """Add rank-weighted votes from one embedding set to ``vote_result``.

    Loads the embeddings of ``exps`` via ``get_embeddings``, computes
    validation-to-training distances and, for every validation row, takes the
    first 20 distinct ``individual_id`` values among its (up to) ``K``
    nearest training rows. The id at rank ``r`` (1-based) receives a vote of
    ``1 / r``.

    Relies on the notebook-level ``valid_ind``, ``train_ind`` and
    ``train_inddividual_id_lis``.

    Args:
        exps: List of experiment names passed on to ``get_embeddings``.
        vote_result: Nested mapping ``row index -> individual_id -> weight``
            that supports ``+=`` on missing keys (e.g. a ``defaultdict`` of
            ``defaultdict``). It is updated in place.

    Returns:
        None. The votes are accumulated into ``vote_result``.
    """
    train_emb, _ = get_embeddings(exps)

    valid_preds = train_emb[valid_ind]
    train_preds = train_emb[train_ind]

    distances = pairwise_distances(valid_preds, train_preds)

    K=1000
    # np.argpartition requires kth < number of columns; clamp it so that a
    # gallery with K or fewer training rows is handled instead of raising.
    kth = min(K, distances.shape[1] - 1)
    predicted_positions = np.argpartition(distances, kth, axis=1)[:, :K]

    for i in tqdm(range(predicted_positions.shape[0])):
        # Order the shortlisted training rows from closest to farthest
        # (stable sort, so ties keep their argpartition order).
        ranked = sorted(predicted_positions[i], key=lambda j: distances[i, j])

        # Keep the first 20 distinct individuals in distance order.
        seen = set()
        pred_list = []
        for j in ranked:
            pred_id = train_inddividual_id_lis[j]
            if pred_id in seen:
                continue
            seen.add(pred_id)
            pred_list.append(pred_id)
            if len(pred_list) == 20:
                break

        # Reciprocal-rank weighting: rank 1 adds 1, rank 2 adds 1/2, ...
        for rank, pred_id in enumerate(pred_list, start=1):
            vote_result[i][pred_id] += 1 / rank
        

In [21]:
# Accumulate votes model by model: every experiment is scored on its own
# embeddings and adds its rank-weighted votes to the shared vote_result.
vote_result = defaultdict(lambda: defaultdict(int))
for exp_name in exps:
    do_vote([exp_name], vote_result)

exp16v4sampler_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 975.58it/s] 


exp16v6AugmixFullB4_fold0
     (51033, 1024) (27956, 1024)


100%|██████████| 8372/8372 [00:08<00:00, 940.06it/s] 


exp16v6AugmixB4Step2_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 968.73it/s] 


exp16v6AugmixB4Step2_fold0_remove_background
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 958.09it/s] 


exp16v6AugmixB4_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 962.26it/s] 


exp16v6Augmix_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 978.69it/s] 


exp16v6sampler_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 972.55it/s] 


exp21b6_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 970.35it/s] 


In [22]:
# Score the vote ensemble: for every validation row take the five ids with
# the largest accumulated vote weight and award 1/rank if the true id is
# among them, otherwise 0.
score_lis = []
pred_ids = []  # not populated in this cell

for i in range(len(valid)):
    # Sort ascending by weight, then reverse, so ties resolve exactly as
    # before. Plain slicing keeps the ids as-is (no numpy string coercion)
    # and yields an empty list when a row has no votes.
    ranked = sorted(vote_result[i].items(), key=lambda item: item[1])
    pred_list = [pred_id for pred_id, _ in ranked[::-1][:5]]

    true_id = valid_inddividual_id_lis[i]
    if true_id in pred_list:
        s = 1/(pred_list.index(true_id) + 1)
        score_lis.append(s)
    else:
        score_lis.append(0)

In [23]:
# Mean score over all validation rows for the rank-weighted vote ensemble.
np.mean(score_lis)

0.8485985029463291