Majority vote from team's CV

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import glob

from tqdm import tqdm
import seaborn as sns
from collections import Counter

import cv2

In [2]:
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

In [3]:
from collections import Counter
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

In [4]:
from collections import defaultdict

In [5]:
import bz2
import pickle
import _pickle as cPickle

def compressed_pickle(title, data):
    """Pickle ``data`` into a bz2-compressed file named ``<title>.pbz2``.

    Args:
        title: Output path without the ``.pbz2`` extension.
        data: Any picklable object.
    """
    target_path = title + '.pbz2'
    with bz2.BZ2File(target_path, 'w') as sink:
        cPickle.dump(data, sink)
        
def decompress_pickle(file):
    """Load and return the object stored in a bz2-compressed pickle file.

    Args:
        file: Path to the compressed pickle (e.g. a ``.pbz2`` file written by
            ``compressed_pickle``).

    Returns:
        The unpickled Python object.

    Warning:
        Unpickling can execute arbitrary code; only load files you trust.
    """
    # The context manager closes the handle even if unpickling raises.
    with bz2.BZ2File(file, 'rb') as f:
        return cPickle.load(f)

In [6]:
# Experiment name -> score; the values are later used as per-model voting weights.
# NOTE(review): presumably each model's fold-0 CV score — confirm.
# NOTE(review): the two "AugmixB4Step2" entries carry the exact same value;
# verify this is not a copy-paste of the score.
d = dict([
    ("exp16v4sampler_fold0", 0.8045250039815257),
    ("exp16v6AugmixFullB4_fold0", 0.845),
    ("exp16v6AugmixB4Step2_fold0", 0.8076166587036152),
    ("exp16v6AugmixB4Step2_fold0_remove_background", 0.8076166587036152),
    ("exp16v6AugmixB4_fold0", 0.808689679885332),
    ("exp16v6Augmix_fold0", 0.8142837235228539),
    ("exp16v6sampler_fold0", 0.8119127249562033),
    ("exp21b6_fold0", 0.7997491638795986),
])

In [7]:
# Experiment names in the insertion order of `d`; this order defines the model
# order used everywhere below.
exps = [*d]
exps

['exp16v4sampler_fold0',
 'exp16v6AugmixFullB4_fold0',
 'exp16v6AugmixB4Step2_fold0',
 'exp16v6AugmixB4Step2_fold0_remove_background',
 'exp16v6AugmixB4_fold0',
 'exp16v6Augmix_fold0',
 'exp16v6sampler_fold0',
 'exp21b6_fold0']

In [8]:
# One weight per experiment, in the same order as the keys of `d`.
weights = [d[name] for name in d]

In [9]:
# Display the per-model weights taken from the values of `d`.
weights

[0.8045250039815257,
 0.845,
 0.8076166587036152,
 0.8076166587036152,
 0.808689679885332,
 0.8142837235228539,
 0.8119127249562033,
 0.7997491638795986]

In [10]:
# Fold whose rows are held out as the validation set; all other folds form the
# training (gallery) set.
FOLD = 0

In [11]:
# Load the fold-assignment table; the cells below use its `fold` and
# `individual_id` columns.
# NOTE(review): this is an absolute path, while get_result_each reads the
# embeddings from the relative "../input/whale-21-3-2022/" — confirm both point
# at the same dataset directory.
fold2 = pd.read_csv("/kaggle/happywhale/input/whale-21-3-2022/fold2.csv")
fold2.shape

(51033, 8)

In [12]:
# Split the table by fold. reset_index() keeps the original row label as an
# "index" column, so each frame is renumbered from 0 without losing provenance.
valid = fold2.loc[fold2["fold"] == FOLD].reset_index()
train = fold2.loc[fold2["fold"] != FOLD].reset_index()

In [13]:
# Boolean row masks over the full table; get_result_each uses them to slice the
# embedding matrix into validation and training rows.
valid_ind = fold2["fold"].eq(FOLD).to_numpy()
train_ind = fold2["fold"].ne(FOLD).to_numpy()

In [14]:
# Identity label of every training / validation row, in the row order of the
# `train` / `valid` frames.
train_inddividual_id_lis = train.individual_id.values
valid_inddividual_id_lis = valid.individual_id.values

def get_result_each(files, dirname="../input/whale-21-3-2022/", k=1000, top_n=20):
    """Rank the nearest training identities for every validation row, per model.

    For each experiment the stored embeddings are loaded, normalised with
    sklearn's ``normalize``, split with the module-level ``valid_ind`` /
    ``train_ind`` masks, and each validation embedding is compared against all
    training embeddings with ``pairwise_distances``.

    Args:
        files: Iterable of experiment names; ``<dirname><name>.pbz2`` is loaded
            for each one.
        dirname: Directory prefix (including the trailing slash) of the pickles.
        k: Number of nearest training rows considered per validation row before
            de-duplicating identities.
        top_n: Maximum number of distinct identities kept per validation row.

    Returns:
        list: One entry per experiment, in input order. Each entry is a list
        with one item per validation row: up to ``top_n`` distinct individual
        ids, nearest first.

    Notes:
        Reads the module-level ``valid_ind``, ``train_ind`` and
        ``train_inddividual_id_lis``.
        Assumes the rows of ``data["emb_val"]`` are aligned with the rows of
        ``fold2`` — TODO confirm.
    """
    ret = []

    for fname in files:
        print(fname)
        # NOTE(review): unpickling can execute arbitrary code; only point this
        # at trusted files.
        data = decompress_pickle(dirname + fname + ".pbz2")

        emb = normalize(data["emb_val"])
        valid_emb = emb[valid_ind]
        train_emb = emb[train_ind]

        distances = pairwise_distances(valid_emb, train_emb)
        n_valid, n_train = distances.shape

        if k < n_train:
            # Unordered set of the k closest training rows for each validation row.
            candidates = np.argpartition(distances, k, axis=1)[:, :k]
        else:
            # argpartition needs kth < n_train; with a small training set every
            # training row is a candidate.
            candidates = np.broadcast_to(np.arange(n_train), (n_valid, n_train))

        all_pred_lis = []
        for i in tqdm(range(n_valid)):
            cand = candidates[i]
            # A stable sort keeps equal distances in candidate order.
            order = np.argsort(distances[i, cand], kind="stable")
            ranked_ids = train_inddividual_id_lis[cand[order]]
            # dict.fromkeys drops repeated identities while preserving rank order.
            all_pred_lis.append(list(dict.fromkeys(ranked_ids))[:top_n])

        ret.append(all_pred_lis)

    return ret
    

### Get the ranked predictions of each model

In [15]:
# ret[m][i] is the ranked list of distinct individual ids predicted by model
# exps[m] for validation row i.
ret = get_result_each(exps)

exp16v4sampler_fold0


100%|██████████| 8372/8372 [00:08<00:00, 998.19it/s] 


exp16v6AugmixFullB4_fold0


100%|██████████| 8372/8372 [00:08<00:00, 1004.01it/s]


exp16v6AugmixB4Step2_fold0


100%|██████████| 8372/8372 [00:08<00:00, 992.96it/s] 


exp16v6AugmixB4Step2_fold0_remove_background


100%|██████████| 8372/8372 [00:08<00:00, 987.29it/s] 


exp16v6AugmixB4_fold0


100%|██████████| 8372/8372 [00:08<00:00, 996.72it/s] 


exp16v6Augmix_fold0


100%|██████████| 8372/8372 [00:08<00:00, 1003.82it/s]


exp16v6sampler_fold0


100%|██████████| 8372/8372 [00:08<00:00, 981.05it/s] 


exp21b6_fold0


100%|██████████| 8372/8372 [00:08<00:00, 1000.57it/s]


In [16]:
# Sanity check: (number of models, validation rows, ids kept for the first row).
len(ret),len(ret[0]),len(ret[0][0])

(8, 8372, 20)

In [17]:
def get_score(weights, preds_per_model=None, true_ids=None, top_k=5):
    """Score a weighted rank-vote ensemble of the per-model predictions.

    Every model votes for each id in its ranked list with ``weight / rank``
    (rank starting at 1). Per validation row the ids are ordered by total vote,
    the best ``top_k`` are kept, and the row scores ``1 / position`` of the true
    id within them (0 if it is absent).

    Args:
        weights: One weight per model, in the same order as the predictions.
        preds_per_model: ``preds_per_model[m][i]`` is the ranked id list of
            model ``m`` for row ``i``. Defaults to the module-level ``ret``.
        true_ids: True id of every row. Defaults to the module-level
            ``valid_inddividual_id_lis``.
        top_k: Number of ensemble predictions considered per row.

    Returns:
        Mean of the per-row scores.

    Raises:
        ValueError: If the number of weights differs from the number of models
            (``zip`` would otherwise silently ignore the surplus).
    """
    if preds_per_model is None:
        preds_per_model = ret
    if true_ids is None:
        true_ids = valid_inddividual_id_lis

    if len(weights) != len(preds_per_model):
        raise ValueError(
            "expected %d weights (one per model), got %d"
            % (len(preds_per_model), len(weights))
        )

    # vote_result[row][candidate id] -> accumulated weighted vote
    vote_result = defaultdict(lambda: defaultdict(float))
    for preds, w in zip(preds_per_model, weights):
        for i, row_preds in enumerate(preds):
            for rank, pred_id in enumerate(row_preds, start=1):
                vote_result[i][pred_id] += w / rank

    score_lis = []
    for i in range(len(preds_per_model[0])):
        # Ascending stable sort followed by a reversal: the highest vote comes
        # first and tied ids appear in reverse insertion order.
        ranked = sorted(vote_result[i].items(), key=lambda item: item[1])[::-1]
        pred_list = [pred_id for pred_id, _ in ranked[:top_k]]

        if true_ids[i] in pred_list:
            score_lis.append(1 / (pred_list.index(true_ids[i]) + 1))
        else:
            score_lis.append(0)

    return np.mean(score_lis)

In [18]:
# Baseline: every model votes with the same weight.
get_score([1] * len(ret))

0.8485985029463291

In [19]:
# Squaring the weights widens the relative gap between models.
square_weights = np.asarray(weights) ** 2
square_weights

array([0.64726048, 0.714025  , 0.65224467, 0.65224467, 0.653979  ,
       0.66305798, 0.65920227, 0.63959873])

In [20]:
# Current weighting scheme: ensemble score with the squared per-model weights.
get_score(square_weights)

0.8494286510590858

Optimize the ensemble weights with Optuna

In [21]:
import optuna

In [22]:
def objective(trial):
    """Optuna objective: ensemble score for one sampled set of model weights.

    The weight of the first model (``exps[0]``) is pinned to 1 as the reference
    scale; multiplying all weights by a common factor does not change the vote
    ranking, so one weight can be fixed. Every other model gets a weight sampled
    uniformly from [0, 4], registered under its experiment name.

    Args:
        trial: The ``optuna`` trial used to sample the weights.

    Returns:
        The value of ``get_score`` for the sampled weights (to be maximised).
    """
    # suggest_float(name, low, high) is the supported replacement for the
    # deprecated suggest_uniform and samples from the same distribution.
    sampled = [trial.suggest_float(name, 0, 4) for name in exps[1:]]

    return get_score([1] + sampled)

In [23]:
# Silence Optuna's per-trial log lines; only CRITICAL messages are shown.
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

In [24]:
%%time

# Seed the sampler so the sequence of suggested weights is repeatable.
# NOTE: the search stops after a time budget, so the number of completed trials
# (and therefore the best result) can still differ between machines.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, timeout=300)

CPU times: user 5min, sys: 1.02 s, total: 5min 2s
Wall time: 5min


In [26]:
# All trials sorted by objective value in ascending order, so the best trials
# are at the bottom of the table.
study.trials_dataframe().sort_values("value")

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_exp16v6AugmixB4Step2_fold0,params_exp16v6AugmixB4Step2_fold0_remove_background,params_exp16v6AugmixB4_fold0,params_exp16v6AugmixFullB4_fold0,params_exp16v6Augmix_fold0,params_exp16v6sampler_fold0,params_exp21b6_fold0,state
12,12,0.837888,2022-04-14 10:39:35.787712,2022-04-14 10:39:37.139674,0 days 00:00:01.351962,0.084647,0.099079,0.750099,0.032923,2.456566,0.943456,0.128483,COMPLETE
60,60,0.841346,2022-04-14 10:40:40.768276,2022-04-14 10:40:42.122046,0 days 00:00:01.353770,0.711353,1.613619,0.122017,0.352236,0.938728,0.553158,0.675886,COMPLETE
6,6,0.841472,2022-04-14 10:39:27.774516,2022-04-14 10:39:29.103371,0 days 00:00:01.328855,2.955323,0.272507,2.322118,0.677254,1.340452,1.027816,3.648354,COMPLETE
220,220,0.841951,2022-04-14 10:44:15.186656,2022-04-14 10:44:16.555970,0 days 00:00:01.369314,0.468293,0.233640,0.273210,0.020262,1.153548,1.030179,1.146683,COMPLETE
9,9,0.844394,2022-04-14 10:39:31.761399,2022-04-14 10:39:33.088843,0 days 00:00:01.327444,3.928293,0.690110,3.264435,2.043607,1.416588,2.033151,3.312023,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,176,0.866529,2022-04-14 10:43:16.390586,2022-04-14 10:43:17.724527,0 days 00:00:01.333941,0.501546,0.025801,0.371485,3.683933,1.165693,1.067964,1.480508,COMPLETE
156,156,0.866549,2022-04-14 10:42:49.548612,2022-04-14 10:42:50.871360,0 days 00:00:01.322748,0.167212,0.172105,0.753264,3.882390,1.258520,1.467853,1.676132,COMPLETE
174,174,0.866553,2022-04-14 10:43:13.710877,2022-04-14 10:43:15.048891,0 days 00:00:01.338014,0.469135,0.018991,0.394244,3.581148,1.193448,1.006749,1.471483,COMPLETE
222,222,0.866597,2022-04-14 10:44:17.890017,2022-04-14 10:44:19.220022,0 days 00:00:01.330005,0.476479,0.121882,0.350760,3.586031,1.083781,1.111681,1.421671,COMPLETE


In [31]:
# Sampled weights of the best trial, keyed by experiment name. The first model
# is absent because its weight is fixed to 1 inside `objective`.
study.best_params

{'exp16v6AugmixFullB4_fold0': 3.566004627617216,
 'exp16v6AugmixB4Step2_fold0': 0.5719668582219408,
 'exp16v6AugmixB4Step2_fold0_remove_background': 0.08667861670876892,
 'exp16v6AugmixB4_fold0': 0.3485647134305221,
 'exp16v6Augmix_fold0': 1.0612858569221244,
 'exp16v6sampler_fold0': 1.1133698495346793,
 'exp21b6_fold0': 1.4286739060092204}

In [32]:
# Look each weight up by experiment name instead of relying on the dict order of
# best_params; exps[0] is skipped because its weight is fixed to 1 in `objective`.
best_weights = [study.best_params[name] for name in exps[1:]]

In [34]:
# Re-score the ensemble with the tuned weights; the leading 1 is the fixed
# weight of the first model.
get_score([1] + best_weights)

0.8666328236980411