Majority vote from s_shohei's CV

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import glob

from tqdm import tqdm
import seaborn as sns
from collections import Counter

import cv2

In [2]:
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

In [3]:
from collections import Counter
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

In [4]:
# Read the training metadata table; the trailing expression shows (rows, columns).
TRAIN_CSV_PATH = "/kaggle/happywhale/input/train.csv"
train = pd.read_csv(TRAIN_CSV_PATH)
train.shape

(51033, 3)

In [5]:
# Fold misspelled / aliased species labels into one canonical name each.
SPECIES_FIXES = {
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale",
    "kiler_whale": "killer_whale",
    "bottlenose_dolpin": "bottlenose_dolphin",
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column accessor: that form is chained assignment, which warns on pandas 2.x
# and silently leaves `train` unchanged once Copy-on-Write is active.
train["species"] = train["species"].replace(SPECIES_FIXES)

In [6]:
# Index of the cross-validation fold that is held out as the validation split below.
FOLD=4

In [7]:
# Round-robin fold assignment: the row at position p goes to fold p % 5.
fold_lis = [row_pos % 5 for row_pos in range(len(train))]
train["fold"] = fold_lis

In [8]:
# For every fold, collect the individuals that occur only in that fold's
# validation part (never in its training part), keyed as "<fold>_<individual_id>".
all_only_val_set = set()

for fold in range(5):
    in_val = train.fold == fold
    val_ids = set(train.loc[in_val, "individual_id"])
    train_ids = set(train.loc[~in_val, "individual_id"])

    all_only_val_set.update(
        str(fold) + "_" + val_id for val_id in val_ids - train_ids
    )

In [9]:
# Split on the selected fold: rows of fold FOLD form the validation frame,
# all other rows the training frame. Both get a fresh 0..n-1 index.
is_valid_fold = train["fold"] == FOLD
train_df = train.loc[~is_valid_fold].copy().reset_index(drop=True)
valid_df = train.loc[is_valid_fold].copy().reset_index(drop=True)

In [10]:
# A validation row is flagged True unless its individual is one of those that
# appear only in this fold's validation part (see all_only_val_set).
use_val_id_during_traing = [
    (str(FOLD) + "_" + val_id) not in all_only_val_set
    for val_id in valid_df.individual_id.values
]

valid_df["use_in_train"] = use_val_id_during_traing

In [11]:
from collections import defaultdict

# Dense integer labels for individuals, numbered in order of first appearance
# in train_df. `d` answers -1 for ids it has never been given a label for.
d = defaultdict(lambda: -1)
now = 0
new_label_group = []

for individual_id in train_df["individual_id"].values:
    if individual_id not in d:
        d[individual_id] = now
        now += 1
    new_label_group.append(d[individual_id])

In [12]:
# Attach the integer label computed for each train_df row (one list entry per row).
train_df["new_individual_id"]  = new_label_group

In [13]:
# Keep only the validation rows flagged use_in_train, re-indexed from 0.
valid_df_for_train = (
    valid_df.loc[valid_df["use_in_train"]]
    .copy()
    .reset_index(drop=True)
)

In [14]:
# Look up the integer label of every kept validation row; ids that were never
# labelled come back as -1 from the defaultdict.
new_individual_id_lis = [
    d[val_id] for val_id in valid_df_for_train.individual_id.values
]

In [15]:
# Store the looked-up integer labels next to the validation rows they belong to.
valid_df_for_train["new_individual_id"] = new_individual_id_lis

In [16]:
from sklearn.preprocessing import normalize
from scipy.spatial import distance

In [17]:
import pickle

def load_emb(exp_names,pickle_name):
    """Load one pickled embedding matrix per experiment and fuse them.

    For every entry ``exp`` of ``exp_names`` the file
    ``<exp>/<pickle_name>.pickle`` is unpickled. All loaded matrices are cut to
    the row count of the shortest one, concatenated along axis 1, and the
    result is passed through ``normalize`` with its default settings.

    The original in-code note lists ``train``/``valid`` and
    ``train_all``/``test`` as the pickle names in use.

    NOTE(review): ``pickle.load`` can execute arbitrary code contained in the
    file; only point this at experiment outputs you trust.
    """
    loaded = []
    for exp in exp_names:
        with open(exp + '/' + pickle_name + '.pickle', 'rb') as f:
            loaded.append(pickle.load(f))

    # Common row count shared by every matrix (the shortest one wins).
    shortest = min([10**10] + [len(emb) for emb in loaded])

    trimmed = [emb[:shortest] for emb in loaded]
    return normalize(np.concatenate(trimmed, axis=1))
        

In [18]:
# Experiment directories holding the embedding pickles, one row per fold:
# exps_all[f] is the list used when FOLD == f.
exps_all = [
    ["aws_exp004","exp101","aws_exp010","aws_exp026","exp117"], # fold 0
    ["aws_exp003","exp100","aws_exp009","aws_exp025","exp116"], # fold 1
    ["aws_exp002","exp099","aws_exp008","aws_exp024","exp115"], # fold 2
    ["aws_exp001","exp098","aws_exp007","aws_exp023","exp114"], # fold 3
    ["aws_exp000","exp093","aws_exp006","aws_exp022","exp113"], # fold 4
]
    
# Experiments belonging to the fold selected by FOLD.
exps = exps_all[FOLD]
    # NOTE(review): the original note here read
    # "backfin, fullbody, ensemble, backfin_ensemble_species" -- presumably the model
    # variants behind the experiments, but it names four items for five entries; confirm.

### concat result

In [19]:
# Fused embeddings of all selected experiments: first the "train" pickles,
# then the "valid" pickles.
train_preds, valid_preds = (
    load_emb(exps, split_name) for split_name in ("train", "valid")
)

In [20]:
# Sanity check: shapes of the fused train and validation embedding matrices.
train_preds.shape,valid_preds.shape

((40827, 2560), (8041, 2560))

In [21]:
%%time

# Dense distance matrix: one row per validation embedding, one column per
# train embedding (Euclidean is pairwise_distances' default, spelled out here).
distances = pairwise_distances(X=valid_preds, Y=train_preds, metric="euclidean")

CPU times: user 56 s, sys: 43.4 s, total: 1min 39s
Wall time: 8.78 s


In [22]:
%%time

# Number of nearest train embeddings kept per validation row (unordered here;
# they are sorted by distance later).
K=1000
# np.argpartition requires kth to be a valid column index, so clamp it: with
# fewer than K+1 train rows the original call raised ValueError. After the
# clamp, the [:, :K] slice simply returns every column in that case.
predicted_positions = np.argpartition(
    distances, min(K, distances.shape[1] - 1), axis=1
)[:, :K]

CPU times: user 2.58 s, sys: 278 ms, total: 2.86 s
Wall time: 2.86 s


In [23]:
# Score the concatenated-embedding retrieval: for each validation row take the
# first five distinct individuals among its nearest train embeddings and
# credit 1/(rank) when the true individual is among them, else 0.
score_lis, pred_ids, rank_lis = [], [], []

hit_species = defaultdict(int)    # summed scores per species (hits only)
count_species = defaultdict(int)  # number of validation rows per species

train_inddividual_id_lis = train_df["individual_id"].values
valid_inddividual_id_lis = valid_df_for_train["individual_id"].values

valid_species_lis = valid_df_for_train["species"].values

for i in tqdm(range(predicted_positions.shape[0])):
    # Candidate train positions for this row, closest first.
    row_distances = distances[i]
    ranked_positions = sorted(predicted_positions[i], key=lambda j: row_distances[j])

    # Walk the ranking and keep the first five distinct individuals.
    pred_list = []
    seen_ids = set()
    for j in ranked_positions:
        pred_id = train_inddividual_id_lis[j]
        if pred_id not in seen_ids:
            seen_ids.add(pred_id)
            pred_list.append(pred_id)
            if len(pred_list) == 5:
                break
    pred_ids.append(pred_list)

    true_id = valid_inddividual_id_lis[i]
    species = valid_species_lis[i]

    if true_id in pred_list:
        hit_rank = pred_list.index(true_id)
        s = 1 / (hit_rank + 1)
        score_lis.append(s)
        hit_species[species] += s
        rank_lis.append(hit_rank)
    else:
        score_lis.append(0)
        rank_lis.append(5)  # 5 marks "not in the top five"

    count_species[species] += 1

100%|██████████| 8041/8041 [00:08<00:00, 932.17it/s]


In [24]:
# Mean of the per-row top-5 reciprocal-rank scores collected in score_lis.
np.mean(score_lis)

0.8603366082162252

In [25]:
# Per-species mean score = summed scores of the species / number of its rows.
# NOTE: this rebinds score_lis (previously the per-row scores) to per-species
# scores; later cells read it under this name, so the name is kept.
spe_lis = []
score_lis = []
count_lis = []

# Iterate over count_species (every species that was evaluated) rather than
# hit_species: the latter only has keys for species with at least one hit, so
# species scoring 0 used to vanish from the table and from the later merge.
for k, count in count_species.items():
    spe_lis.append(k)
    score_lis.append(hit_species.get(k, 0) / count)
    count_lis.append(count)

In [26]:
# Per-species result table for the concatenated-embedding approach.
original_df = pd.DataFrame(
    {"species": spe_lis, "score": score_lis, "count": count_lis}
)

### majority vote result

In [27]:
def do_vote(exps, vote_result, top_n=20, k=None, train_ids=None):
    """Add rank-weighted nearest-neighbour votes from ``exps`` to ``vote_result``.

    The "train" and "valid" embeddings of ``exps`` are loaded with ``load_emb``.
    For each validation row the ``k`` closest train embeddings are ordered by
    distance and reduced to the first ``top_n`` distinct individuals. The
    individual at 1-based rank ``r`` then receives a vote of ``1 / r``.

    Parameters
    ----------
    exps : list of str
        Experiment directories handed to ``load_emb``.
    vote_result : mapping
        Updated in place as ``vote_result[row_index][individual_id] += weight``.
        Missing keys must default to a number (the notebook uses nested
        ``defaultdict``).
    top_n : int, default 20
        Number of distinct individuals that receive a vote per row (positive).
    k : int, optional
        Number of nearest train embeddings considered per row. Defaults to the
        notebook-level ``K``.
    train_ids : sequence, optional
        Individual id for every train embedding row. Defaults to the
        notebook-level ``train_inddividual_id_lis``.

    Returns
    -------
    None
    """
    if k is None:
        k = K
    if train_ids is None:
        train_ids = train_inddividual_id_lis

    train_preds = load_emb(exps, "train")
    valid_preds = load_emb(exps, "valid")

    distances = pairwise_distances(valid_preds, train_preds)

    # argpartition needs kth < number of columns; clamp it so a gallery with
    # k or fewer rows yields all columns instead of raising ValueError.
    kth = min(k, distances.shape[1] - 1)
    predicted_positions = np.argpartition(distances, kth, axis=1)[:, :k]

    for i in tqdm(range(predicted_positions.shape[0])):
        # Candidate train positions ordered from closest to farthest.
        row_distances = distances[i]
        ranked_positions = sorted(predicted_positions[i], key=lambda j: row_distances[j])

        # First top_n distinct individuals along that ranking.
        pred_list = []
        seen_ids = set()
        for j in ranked_positions:
            pred_id = train_ids[j]
            if pred_id in seen_ids:
                continue
            seen_ids.add(pred_id)
            pred_list.append(pred_id)
            if len(pred_list) == top_n:
                break

        # Reciprocal-rank weighting: best match votes 1, second 1/2, ...
        for rank, pred_id in enumerate(pred_list, start=1):
            vote_result[i][pred_id] += 1 / rank
        

In [28]:
# vote_result[row_index][individual_id] -> accumulated vote weight.
# Every experiment of the fold votes on its own (single-element list).
vote_result = defaultdict(lambda: defaultdict(int))
for single_exp in exps:
    do_vote([single_exp], vote_result)

100%|██████████| 8041/8041 [00:08<00:00, 943.56it/s]
100%|██████████| 8041/8041 [00:08<00:00, 935.41it/s]
100%|██████████| 8041/8041 [00:08<00:00, 920.38it/s]
100%|██████████| 8041/8041 [00:08<00:00, 921.53it/s]
100%|██████████| 8041/8041 [00:08<00:00, 928.85it/s]


In [29]:
# Score the majority-vote predictions the same way as the concatenated run:
# 1/(rank) if the true individual is among the five highest-voted ids, else 0.
score_lis = []
pred_ids = []       # NOTE(review): reset here but never filled in this cell -- confirm intent
corredt_ids  = []   # NOTE(review): unused in the visible code
hit_species=defaultdict(int)
count_species=defaultdict(int)

for i in range(len(valid_df_for_train)):
    # Ascending sort followed by a reversal keeps the original tie order.
    # Work on the (id, weight) pairs directly instead of round-tripping through
    # np.array, which turned every id into a string and raised IndexError when
    # a row had received no votes.
    ranked_votes = sorted(vote_result[i].items(), key=lambda item: item[1])[::-1]
    pred_list = [pred_id for pred_id, _ in ranked_votes[:5]]

    if valid_inddividual_id_lis[i] in pred_list:
        s = 1/(pred_list.index(valid_inddividual_id_lis[i]) + 1)
        score_lis.append(s)
        hit_species[valid_species_lis[i]]+=s
    else:
        score_lis.append(0)

    count_species[valid_species_lis[i]]+=1

In [30]:
# Mean of the per-row top-5 reciprocal-rank scores of the voting predictions.
np.mean(score_lis)

0.8528271773825808

In [31]:
# Per-species mean score = summed scores of the species / number of its rows.
# NOTE: this rebinds score_lis (previously the per-row scores) to per-species
# scores; later cells read it under this name, so the name is kept.
spe_lis = []
score_lis = []
count_lis = []

# Iterate over count_species (every species that was evaluated) rather than
# hit_species: the latter only has keys for species with at least one hit, so
# species scoring 0 used to vanish from the table and from the later merge.
for k, count in count_species.items():
    spe_lis.append(k)
    score_lis.append(hit_species.get(k, 0) / count)
    count_lis.append(count)

In [32]:
# Per-species result table for the majority-vote approach.
vote_df = pd.DataFrame(
    {"species": spe_lis, "score": score_lis, "count": count_lis}
)

In [33]:
# Put both approaches side by side per species; a positive diff means the
# concatenated embeddings scored higher than the vote.
merged_df = pd.merge(
    left=original_df,
    right=vote_df,
    how="inner",
    on=["species", "count"],
    suffixes=["_concat", "_vote"],
)
merged_df["diff"] = merged_df["score_concat"] - merged_df["score_vote"]

In [34]:
# Per-species scores of both approaches; diff = score_concat - score_vote.
merged_df

Unnamed: 0,species,score_concat,count,score_vote,diff
0,humpback_whale,0.866028,1044,0.85811,0.007918
1,bottlenose_dolphin,0.975196,2124,0.971288,0.003908
2,minke_whale,0.932195,322,0.931418,0.000776
3,false_killer_whale,0.992611,609,0.992611,0.0
4,beluga,0.751809,1382,0.739689,0.01212
5,blue_whale,0.548128,730,0.528995,0.019132
6,long_finned_pilot_whale,0.967742,31,0.967742,0.0
7,killer_whale,0.933555,451,0.926275,0.00728
8,spinner_dolphin,0.940855,226,0.936209,0.004646
9,common_dolphin,0.830357,56,0.857143,-0.026786


In [35]:
# Restrict the comparison to species with more than 100 validation rows.
merged_df[merged_df["count"] > 100]

Unnamed: 0,species,score_concat,count,score_vote,diff
0,humpback_whale,0.866028,1044,0.85811,0.007918
1,bottlenose_dolphin,0.975196,2124,0.971288,0.003908
2,minke_whale,0.932195,322,0.931418,0.000776
3,false_killer_whale,0.992611,609,0.992611,0.0
4,beluga,0.751809,1382,0.739689,0.01212
5,blue_whale,0.548128,730,0.528995,0.019132
7,killer_whale,0.933555,451,0.926275,0.00728
8,spinner_dolphin,0.940855,226,0.936209,0.004646
10,dusky_dolphin,0.873109,119,0.848319,0.02479
11,gray_whale,0.789469,201,0.781095,0.008375
