Majority vote from team's CV

In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import glob

from tqdm import tqdm
import seaborn as sns
from collections import Counter

import cv2

In [13]:
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

In [14]:
from collections import Counter
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

In [15]:
from collections import defaultdict

In [16]:
import bz2
import pickle
import _pickle as cPickle

def compressed_pickle(title, data):
    """Pickle ``data`` into a bz2-compressed file named ``title + '.pbz2'``.

    Args:
        title: Output path without the ``.pbz2`` extension.
        data: Any picklable object.
    """
    target_path = title + '.pbz2'
    archive = bz2.BZ2File(target_path, 'w')
    try:
        cPickle.dump(data, archive)
    finally:
        # Always release the handle, even if pickling fails part-way.
        archive.close()
        
def decompress_pickle(file):
    """Load and return the object stored in a bz2-compressed pickle file.

    Args:
        file: Path to the ``.pbz2`` file (full path, including extension).

    Returns:
        The unpickled Python object.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    # The context manager closes the file handle once loading finishes or fails.
    with bz2.BZ2File(file, 'rb') as f:
        return cPickle.load(f)

In [30]:
def get_embeddings_weighted(files,weight):
    """Load, weight and concatenate the embeddings of several experiments.

    For every experiment the ``emb_val`` and ``emb_test`` matrices are
    row-normalized, multiplied by the squared weight of that experiment and
    stacked column-wise. The stacked matrices are row-normalized once more.

    Args:
        files: Experiment names; each maps to ``DIRNAME + name + ".pbz2"``.
        weight: One weight per experiment, in the same order as ``files``.

    Returns:
        Tuple ``(ret_train, ret_test)`` built from ``emb_val`` and ``emb_test``.

    Raises:
        ValueError: If ``files`` and ``weight`` differ in length.
    """
    DIRNAME ="../input/whale-21-3-2022/"

    # zip() would silently drop trailing experiments on a length mismatch,
    # so validate up front instead of producing a quietly smaller ensemble.
    files = list(files)
    weight = list(weight)
    if len(files) != len(weight):
        raise ValueError(
            "files and weight must have the same length, got %d and %d"
            % (len(files), len(weight))
        )

    train_emb_lis = []
    test_emb_lis  = []

    for fname,w in zip(files,weight):
        wei=w**2
        print(fname,wei)
        data = decompress_pickle(DIRNAME + fname + ".pbz2")

        # normalize() returns new arrays here, so the in-place scaling below
        # does not touch the arrays stored in ``data``.
        train_emb = normalize(data["emb_val"])
        test_emb  = normalize(data["emb_test"])

        train_emb*=wei
        test_emb*=wei

        train_emb_lis.append(train_emb)
        test_emb_lis.append(test_emb)

        print("    ",train_emb.shape,test_emb.shape)

    ret_train = normalize(np.concatenate(train_emb_lis,axis=1))
    ret_test  = normalize(np.concatenate(test_emb_lis,axis=1))

    return ret_train,ret_test
    

In [31]:
def get_embeddings(files):
    """Load and concatenate the embeddings of several experiments (unweighted).

    Each experiment's ``emb_val`` / ``emb_test`` matrix is row-normalized,
    the per-experiment matrices are stacked column-wise, and the stacked
    result is row-normalized again.

    Args:
        files: Experiment names; each maps to ``DIRNAME + name + ".pbz2"``.

    Returns:
        Tuple ``(ret_train, ret_test)`` built from ``emb_val`` and ``emb_test``.
    """
    DIRNAME ="../input/whale-21-3-2022/"

    val_blocks, test_blocks = [], []

    for fname in files:
        print(fname)
        payload = decompress_pickle(DIRNAME + fname + ".pbz2")

        raw_val, raw_test = payload["emb_val"], payload["emb_test"]
        val_block = normalize(raw_val)
        test_block = normalize(raw_test)

        val_blocks.append(val_block)
        test_blocks.append(test_block)

        print("    ",val_block.shape,test_block.shape)

    stacked_val = np.concatenate(val_blocks, axis=1)
    ret_train = normalize(stacked_val)
    stacked_test = np.concatenate(test_blocks, axis=1)
    ret_test = normalize(stacked_test)

    return ret_train, ret_test
    

In [32]:
def get_valid_score(distances, train_ids=None, valid_ids=None, n_candidates=1000, top_k=5):
    """Score nearest-neighbour predictions with a reciprocal-rank metric.

    For each validation row the closest train rows are ranked by distance,
    collapsed to distinct individual ids, and cut to ``top_k`` ids. The row
    scores ``1 / rank`` of the true id within that list, or 0 if it is absent.

    Args:
        distances: 2-D array, one row per validation sample and one column
            per train sample.
        train_ids: Individual id per train column. Defaults to the notebook
            global ``train["individual_id"].values``.
        valid_ids: True individual id per validation row. Defaults to the
            notebook global ``valid["individual_id"].values``.
        n_candidates: Number of nearest train rows to consider per query.
        top_k: Number of distinct ids kept per query.

    Returns:
        List with one score per validation row.
    """
    if train_ids is None:
        train_ids = train["individual_id"].values
    if valid_ids is None:
        valid_ids = valid["individual_id"].values

    # argpartition requires kth < number of columns; clamp it so train sets
    # with n_candidates or fewer rows no longer raise. With more columns than
    # n_candidates this is identical to partitioning at n_candidates.
    n_train = distances.shape[1]
    kth = min(n_candidates, max(n_train - 1, 0))
    predicted_positions = np.argpartition(distances, kth, axis=1)[:, :n_candidates]

    score_lis = []

    for i in tqdm(range(predicted_positions.shape[0])):
        # argpartition does not order the selected candidates, so sort them.
        nearest = [(train_ids[j], distances[i, j]) for j in predicted_positions[i]]
        nearest.sort(key=lambda x: x[1])

        # Keep the first occurrence of every id, closest first.
        done = set()
        pred_list = []
        for pred_id, _ in nearest:
            if pred_id in done:
                continue
            done.add(pred_id)
            pred_list.append(pred_id)
            if len(pred_list)==top_k:
                break

        if valid_ids[i] in pred_list:
            s = 1/(pred_list.index(valid_ids[i]) + 1)
            score_lis.append(s)
        else:
            score_lis.append(0)

    return score_lis

In [33]:
# Experiment name -> score table.
# Keys are used as file stems (DIRNAME + key + ".pbz2") when embeddings are
# loaded; values are used as per-model weights (squared) in the weighted runs.
# NOTE(review): values are presumably each model's fold-0 validation score — confirm.
# NOTE(review): "exp16v6AugmixB4Step2_fold0" and its "_remove_background"
# variant carry the exact same value, and 0.845 looks rounded — verify that
# these were not copied/typed by hand.
d={
    "exp16v4sampler_fold0": 0.8045250039815257,
    "exp16v6AugmixFullB4_fold0": 0.845,
    "exp16v6AugmixB4Step2_fold0": 0.8076166587036152,
    "exp16v6AugmixB4Step2_fold0_remove_background": 0.8076166587036152,
    "exp16v6AugmixB4_fold0": 0.808689679885332,
    "exp16v6Augmix_fold0": 0.8142837235228539,
    "exp16v6sampler_fold0": 0.8119127249562033,
    "exp21b6_fold0": 0.7997491638795986,}

In [34]:
exps = list(d.keys())
exps

['exp16v4sampler_fold0',
 'exp16v6AugmixFullB4_fold0',
 'exp16v6AugmixB4Step2_fold0',
 'exp16v6AugmixB4Step2_fold0_remove_background',
 'exp16v6AugmixB4_fold0',
 'exp16v6Augmix_fold0',
 'exp16v6sampler_fold0',
 'exp21b6_fold0']

In [35]:
weights = list(d.values())

In [36]:
weights

[0.8045250039815257,
 0.845,
 0.8076166587036152,
 0.8076166587036152,
 0.808689679885332,
 0.8142837235228539,
 0.8119127249562033,
 0.7997491638795986]

In [37]:
FOLD=0

In [38]:
fold2 = pd.read_csv("/kaggle/happywhale/input/whale-21-3-2022/fold2.csv")
fold2.shape

(51033, 8)

In [39]:
# Split fold2 by fold: rows of fold FOLD form the validation set, all other
# rows form the train set.
# reset_index() without drop=True renumbers rows from 0 and keeps the
# original fold2 row label in a new "index" column.
valid = fold2.query("fold == @FOLD").reset_index()
train = fold2.query("fold != @FOLD").reset_index()

In [40]:
# Boolean row masks over fold2 (True where the row belongs / does not belong
# to fold FOLD); used to slice the loaded embedding matrices into valid/train.
# NOTE(review): assumes the "emb_val" rows are stored in fold2 row order — confirm.
valid_ind = (fold2.fold == FOLD).values
train_ind = (fold2.fold != FOLD).values

### concat result not weighted

In [41]:
train_emb, _ = get_embeddings(exps)

exp16v4sampler_fold0
     (51033, 512) (27956, 512)
exp16v6AugmixFullB4_fold0
     (51033, 1024) (27956, 1024)
exp16v6AugmixB4Step2_fold0
     (51033, 512) (27956, 512)
exp16v6AugmixB4Step2_fold0_remove_background
     (51033, 512) (27956, 512)
exp16v6AugmixB4_fold0
     (51033, 512) (27956, 512)
exp16v6Augmix_fold0
     (51033, 512) (27956, 512)
exp16v6sampler_fold0
     (51033, 512) (27956, 512)
exp21b6_fold0
     (51033, 512) (27956, 512)


In [42]:
valid_preds = train_emb[valid_ind]
train_preds = train_emb[train_ind]

In [43]:
train_preds.shape,valid_preds.shape

((42661, 4608), (8372, 4608))

In [44]:
distances = pairwise_distances(valid_preds,train_preds)

In [45]:
score_lis = get_valid_score(distances)

100%|██████████| 8372/8372 [00:08<00:00, 1001.95it/s]


In [46]:
np.mean(score_lis)

0.8523590539894887

### concat result weighted

In [48]:
train_emb, _ = get_embeddings_weighted(exps,weights)

exp16v4sampler_fold0 0.6472604820314739
     (51033, 512) (27956, 512)
exp16v6AugmixFullB4_fold0 0.7140249999999999
     (51033, 1024) (27956, 1024)
exp16v6AugmixB4Step2_fold0 0.6522446674155917
     (51033, 512) (27956, 512)
exp16v6AugmixB4Step2_fold0_remove_background 0.6522446674155917
     (51033, 512) (27956, 512)
exp16v6AugmixB4_fold0 0.6539789983530407
     (51033, 512) (27956, 512)
exp16v6Augmix_fold0 0.6630579823942436
     (51033, 512) (27956, 512)
exp16v6sampler_fold0 0.6592022729458074
     (51033, 512) (27956, 512)
exp21b6_fold0 0.639598725126117
     (51033, 512) (27956, 512)


In [49]:
valid_preds = train_emb[valid_ind]
train_preds = train_emb[train_ind]

In [50]:
train_preds.shape,valid_preds.shape

((42661, 4608), (8372, 4608))

In [51]:
distances = pairwise_distances(valid_preds,train_preds)

In [52]:
score_lis = get_valid_score(distances)

100%|██████████| 8372/8372 [00:08<00:00, 979.00it/s] 


In [53]:
np.mean(score_lis)

0.8536610129001433

### majority vote result not weighted

In [54]:
# Individual ids aligned with the rows of `train` / `valid`; read by do_vote
# and by the scoring cell that follows the vote.
train_inddividual_id_lis = train["individual_id"].values
valid_inddividual_id_lis = valid["individual_id"].values
def do_vote(exps, vote_result):
    """Add rank-based votes from one embedding set to ``vote_result``.

    Embeddings for ``exps`` are loaded with ``get_embeddings``, split into
    valid/train rows with the global masks, and each validation row's nearest
    train rows are collapsed to at most 20 distinct ids. The id at 0-based
    rank ``j`` receives ``1 / (j + 1)`` votes.

    Args:
        exps: Experiment names whose embeddings are concatenated.
        vote_result: Mapping ``valid row -> {id: accumulated vote}``; updated
            in place (nothing is returned).
    """
    train_emb, _ = get_embeddings(exps)

    valid_preds = train_emb[valid_ind]
    train_preds = train_emb[train_ind]

    distances = pairwise_distances(valid_preds,train_preds)

    K=1000
    # argpartition requires kth < number of columns; clamp it so train sets
    # with K or fewer rows no longer raise. With more than K columns this is
    # identical to partitioning at K.
    kth = min(K, max(distances.shape[1] - 1, 0))
    predicted_positions = np.argpartition(distances, kth, axis=1)[:, :K]

    for i in tqdm(range(predicted_positions.shape[0])):
        # argpartition does not order the selected candidates, so sort them.
        nearest = [(train_inddividual_id_lis[j], distances[i, j]) for j in predicted_positions[i]]
        nearest.sort(key=lambda x: x[1])

        # Keep the first occurrence of every id, closest first.
        done = set()
        pred_list = []
        for pred_id, _ in nearest:
            if pred_id in done:
                continue
            done.add(pred_id)
            pred_list.append(pred_id)
            if len(pred_list)==20:
                break

        for j,p in enumerate(pred_list):
            vote_result[i][p] += 1/(j+1)
        

In [55]:
# Fresh vote table: valid row -> {individual id -> accumulated vote}.
vote_result = defaultdict(lambda: defaultdict(int))
# Every experiment votes on its own; all votes land in the same table.
for exp_name in exps:
    do_vote([exp_name], vote_result)

exp16v4sampler_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 953.19it/s] 


exp16v6AugmixFullB4_fold0
     (51033, 1024) (27956, 1024)


100%|██████████| 8372/8372 [00:08<00:00, 997.21it/s] 


exp16v6AugmixB4Step2_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 980.08it/s] 


exp16v6AugmixB4Step2_fold0_remove_background
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 985.34it/s] 


exp16v6AugmixB4_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 988.78it/s] 


exp16v6Augmix_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 992.57it/s] 


exp16v6sampler_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 973.98it/s] 


exp21b6_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 988.54it/s] 


In [56]:
score_lis = []
pred_ids = []

for i in range(len(valid)):
    # Rank ids by accumulated vote, highest first. Sorting ascending and then
    # reversing keeps the original tie order. Ids are taken straight from the
    # tuples: routing them through np.array would coerce ids and scores to one
    # dtype and raise IndexError for a row that received no votes.
    ranked = sorted(vote_result[i].items(), key=lambda item: item[1])[::-1]
    pred_list = [pred_id for pred_id, _ in ranked[:5]]

    # Reciprocal rank of the true id within the top 5, or 0 if it is missing.
    if valid_inddividual_id_lis[i] in pred_list:
        s = 1/(pred_list.index(valid_inddividual_id_lis[i]) + 1)
        score_lis.append(s)
    else:
        score_lis.append(0)

In [57]:
np.mean(score_lis)

0.8485985029463291

### majority vote result weighted

In [64]:
# Individual ids aligned with the rows of `train` / `valid`; read by
# do_vote_weighted and by the scoring cell that follows the vote.
train_inddividual_id_lis = train["individual_id"].values
valid_inddividual_id_lis = valid["individual_id"].values
def do_vote_weighted(exps, vote_result,weight):
    """Add weighted rank-based votes from one embedding set to ``vote_result``.

    Same procedure as the unweighted vote, except that the id at 0-based rank
    ``j`` receives ``weight**2 / (j + 1)`` votes.

    Args:
        exps: Experiment names whose embeddings are concatenated.
        vote_result: Mapping ``valid row -> {id: accumulated vote}``; updated
            in place (nothing is returned).
        weight: Weight of this embedding set; it is squared before use.
    """
    wei=weight**2
    print(exps,wei)

    train_emb, _ = get_embeddings(exps)

    valid_preds = train_emb[valid_ind]
    train_preds = train_emb[train_ind]

    distances = pairwise_distances(valid_preds,train_preds)

    K=1000
    # argpartition requires kth < number of columns; clamp it so train sets
    # with K or fewer rows no longer raise. With more than K columns this is
    # identical to partitioning at K.
    kth = min(K, max(distances.shape[1] - 1, 0))
    predicted_positions = np.argpartition(distances, kth, axis=1)[:, :K]

    for i in tqdm(range(predicted_positions.shape[0])):
        # argpartition does not order the selected candidates, so sort them.
        nearest = [(train_inddividual_id_lis[j], distances[i, j]) for j in predicted_positions[i]]
        nearest.sort(key=lambda x: x[1])

        # Keep the first occurrence of every id, closest first.
        done = set()
        pred_list = []
        for pred_id, _ in nearest:
            if pred_id in done:
                continue
            done.add(pred_id)
            pred_list.append(pred_id)
            if len(pred_list)==20:
                break

        for j,p in enumerate(pred_list):
            # Evaluates as (wei * 1) / (j + 1); kept as written so the
            # floating-point results match earlier runs exactly.
            vote_result[i][p] += wei * 1/(j+1)
        

In [65]:
# Fresh vote table: valid row -> {individual id -> accumulated vote}.
vote_result = defaultdict(lambda: defaultdict(int))
# Every experiment votes on its own, scaled by the weight at the same position.
for idx, exp_name in enumerate(exps):
    do_vote_weighted([exp_name], vote_result, weights[idx])

['exp16v4sampler_fold0'] 0.6472604820314739
exp16v4sampler_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 980.21it/s] 


['exp16v6AugmixFullB4_fold0'] 0.7140249999999999
exp16v6AugmixFullB4_fold0
     (51033, 1024) (27956, 1024)


100%|██████████| 8372/8372 [00:08<00:00, 992.15it/s] 


['exp16v6AugmixB4Step2_fold0'] 0.6522446674155917
exp16v6AugmixB4Step2_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 970.57it/s] 


['exp16v6AugmixB4Step2_fold0_remove_background'] 0.6522446674155917
exp16v6AugmixB4Step2_fold0_remove_background
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 992.62it/s] 


['exp16v6AugmixB4_fold0'] 0.6539789983530407
exp16v6AugmixB4_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 983.37it/s] 


['exp16v6Augmix_fold0'] 0.6630579823942436
exp16v6Augmix_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 984.96it/s] 


['exp16v6sampler_fold0'] 0.6592022729458074
exp16v6sampler_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 992.88it/s] 


['exp21b6_fold0'] 0.639598725126117
exp21b6_fold0
     (51033, 512) (27956, 512)


100%|██████████| 8372/8372 [00:08<00:00, 984.65it/s] 


In [66]:
score_lis = []
pred_ids = []

for i in range(len(valid)):
    # Rank ids by accumulated vote, highest first. Sorting ascending and then
    # reversing keeps the original tie order. Ids are taken straight from the
    # tuples: routing them through np.array would coerce ids and scores to one
    # dtype and raise IndexError for a row that received no votes.
    ranked = sorted(vote_result[i].items(), key=lambda item: item[1])[::-1]
    pred_list = [pred_id for pred_id, _ in ranked[:5]]

    # Reciprocal rank of the true id within the top 5, or 0 if it is missing.
    if valid_inddividual_id_lis[i] in pred_list:
        s = 1/(pred_list.index(valid_inddividual_id_lis[i]) + 1)
        score_lis.append(s)
    else:
        score_lis.append(0)

In [67]:
np.mean(score_lis)

0.8494286510590858