In [1]:
# =========================
# Library
# =========================
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys, os
from transformers import DistilBertModel, DistilBertTokenizer,AutoModel,AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import logging
import sys
from contextlib import contextmanager
import time
import random
import math
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import gc
import transformers
import optuna
transformers.logging.set_verbosity_error()
%env TOKENIZERS_PARALLELISM=true
%matplotlib inline

2022-07-01 03:57:03.535500: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


env: TOKENIZERS_PARALLELISM=true


In [2]:
# =========================
# Constant
# =========================
TRAIN_PATH = "../data/train.csv"
TARGET = "point_of_interest"

In [3]:
# =========================
# Settings
# =========================
exp = "113"
if not os.path.exists(f"../output/exp/ex{exp}"):
    os.makedirs(f"../output/exp/ex{exp}")
    os.makedirs(f"../output/exp/ex{exp}/model")
LOGGER_PATH = f"../output/exp/ex{exp}/ex_{exp}.txt"
MODEL_PATH_BASE = f"../output/exp/ex{exp}/model/ex{exp}"

In [7]:
# =========================
# Function
# =========================
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame):
    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    return scores.mean()

def join(df):
    x = [str(e) for e in list(df)]
    return " ".join(x)


def objective(trial):
    w1 = trial.suggest_float('w1', 0, 1)
    w2 = trial.suggest_float('w2', 0, 1)
    w3 = trial.suggest_float('w3', 0, 1)
    w4 = trial.suggest_float('w4', 0, 1)
    w_sum1 = w1 + w2 + w3 + w4
    w1 /= w_sum1
    w2 /= w_sum1
    w3 /= w_sum1
    w4 /= w_sum1
    train_fold0["ensemble"] = train_fold0["pred"]*w1 + roberta.reshape(-1)*w2 + deberta.reshape(-1)*w3 + train_fold0["cat_pred"]*w4
    val_tr_ = train_fold0[train_fold0["ensemble"] >= 0.5].reset_index(drop=True)
    val_id = train_raw[train_raw["set"] == 0]["id"].unique()
    val_id_match = pd.DataFrame()
    val_id_match["id"] = val_id
    val_id_match["near_id"] = val_id
    val_all = pd.concat([val_id_match,val_tr_[["id","near_id"]]]).reset_index(drop=True)
    val_all_ = val_all.copy()
    val_all_.columns = ["near_id","id"]
    val_all = pd.concat([val_all,val_all_]).reset_index(drop=True)
    val_all = val_all.drop_duplicates(subset=["id","near_id"]).reset_index(drop=True)
    del val_all_
    gc.collect()
    val_all = val_all.merge(train_raw[["id","point_of_interest"]],how="left",on="id").reset_index(drop=True)
    docs = val_all.groupby("id")["near_id"].apply(join)
    docs = docs.reset_index()
    docs.columns = ["id","matches"]
    score = get_score(docs)
    return score * -1

In [9]:
# ============================
# Main
# ============================
train = pd.read_csv(TRAIN_PATH)
train_fold0 = pd.read_csv("../output/exp/ex062/ex062_pred.csv")
train_fold0 = train_fold0.merge(train[["id","country"]],how="left",on="id")

In [11]:
cat = pd.read_csv("../output/exp/ex087/ex087_pred.csv")
cat.columns = ["id","near_id","cat_pred"]
train_fold0 = train_fold0.merge(cat,how="left",on=["id","near_id"])

In [14]:
deberta = np.load(f"../output/exp/ex_charm_05/oof.npy")
roberta = np.load(f"../output/exp/ex100/oof.npy")

In [15]:
train_raw = pd.read_csv(TRAIN_PATH)
kf = GroupKFold(n_splits=2)
for i, (trn_idx, val_idx) in enumerate(kf.split(train_raw, train_raw[TARGET],
                                                train_raw[TARGET])):
    train_raw.loc[val_idx, "set"] = i

In [18]:
# optuna前のcv check
w1 = 0.2
w2 = 0.25
w3 = 0.25
w4 = 0.3
train_fold0["ensemble"] = train_fold0["pred"]*w1 + roberta.reshape(-1)*w2 + deberta.reshape(-1)*w3 + train_fold0["cat_pred"]*w4
val_tr_ = train_fold0[train_fold0["ensemble"] >= 0.5].reset_index(drop=True)
val_id = train_raw[train_raw["set"] == 0]["id"].unique()
val_id_match = pd.DataFrame()
val_id_match["id"] = val_id
val_id_match["near_id"] = val_id
val_all = pd.concat([val_id_match,val_tr_[["id","near_id"]]]).reset_index(drop=True)
val_all_ = val_all.copy()
val_all_.columns = ["near_id","id"]
val_all = pd.concat([val_all,val_all_]).reset_index(drop=True)
val_all = val_all.drop_duplicates(subset=["id","near_id"]).reset_index(drop=True)
del val_all_
gc.collect()
val_all = val_all.merge(train_raw[["id","point_of_interest"]],how="left",on="id").reset_index(drop=True)
id2poi = get_id2poi(val_all)
poi2ids = get_poi2ids(val_all)
docs = val_all.groupby("id")["near_id"].apply(join)
docs = docs.reset_index()
docs.columns = ["id","matches"]
score = get_score(docs)
print(score)

2022-07-01 03:58:06,969 - INFO - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-07-01 03:58:06,969 - INFO - NumExpr defaulting to 8 threads.


0.9095475145804721


In [20]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[32m[I 2022-07-01 03:58:51,518][0m A new study created in memory with name: no-name-0f34bb36-0640-4f22-b576-8c59fbc2472e[0m
[32m[I 2022-07-01 03:59:14,404][0m Trial 0 finished with value: -0.910981667589833 and parameters: {'w1': 0.9915165653375533, 'w2': 0.5847268590721764, 'w3': 0.9992870138671404, 'w4': 0.31013564779891445}. Best is trial 0 with value: -0.910981667589833.[0m
[32m[I 2022-07-01 03:59:36,351][0m Trial 1 finished with value: -0.9112399525230302 and parameters: {'w1': 0.9080613543677548, 'w2': 0.8863496118802231, 'w3': 0.8458574250239812, 'w4': 0.03455666449006689}. Best is trial 1 with value: -0.9112399525230302.[0m
[32m[I 2022-07-01 03:59:58,179][0m Trial 2 finished with value: -0.9058204647252057 and parameters: {'w1': 0.5301661727522211, 'w2': 0.054414111823996025, 'w3': 0.8921582630650577, 'w4': 0.6274623657227628}. Best is trial 1 with value: -0.9112399525230302.[0m
[32m[I 2022-07-01 04:00:20,821][0m Trial 3 finished with value: -0.9051662540115951 an

In [21]:
w1 = 0.02571305035427461
w2 = 0.5589925211363063
w3 = 0.7415007534811872
w4 = 0.6180437945722406
wsum = w1 + w2 + w3 + w4
w1 /= wsum
w2 /= wsum
w3 /= wsum
w4 /= wsum

In [22]:
print(w1,w2,w3,w4)

0.013225176172449034 0.28751060139700985 0.3813813593361608 0.31788286309438024


In [23]:
# cv check
train_fold0["ensemble"] = train_fold0["pred"]*w1 + roberta.reshape(-1)*w2 + deberta.reshape(-1)*w3 + train_fold0["cat_pred"]*w4
val_tr_ = train_fold0[train_fold0["ensemble"] >= 0.5].reset_index(drop=True)
val_id = train_raw[train_raw["set"] == 0]["id"].unique()
val_id_match = pd.DataFrame()
val_id_match["id"] = val_id
val_id_match["near_id"] = val_id
val_all = pd.concat([val_id_match,val_tr_[["id","near_id"]]]).reset_index(drop=True)
val_all_ = val_all.copy()
val_all_.columns = ["near_id","id"]
val_all = pd.concat([val_all,val_all_]).reset_index(drop=True)
val_all = val_all.drop_duplicates(subset=["id","near_id"]).reset_index(drop=True)
del val_all_
gc.collect()
val_all = val_all.merge(train_raw[["id","point_of_interest"]],how="left",on="id").reset_index(drop=True)
id2poi = get_id2poi(val_all)
poi2ids = get_poi2ids(val_all)
docs = val_all.groupby("id")["near_id"].apply(join)
docs = docs.reset_index()
docs.columns = ["id","matches"]
score = get_score(docs)
print(score)

0.9116932360410808


In [24]:
# pp
docs["matches_len"] = docs["matches"].map(lambda x:len(x.split(" ")))
df_docs_3 = docs[docs["matches_len"] > 3].reset_index(drop=True)
near_id_value = df_docs_3["matches"].values
id_list = df_docs_3["id"].values
key_id_list = []
for n,key_id in tqdm(enumerate(id_list),total=len(id_list)):
    if key_id in(key_id_list):
        pass
    else:
        len_list = []
        key_near_id = near_id_value[n]
        a = len(key_near_id.split(" "))
        for near_id in near_id_value:
            b = len(near_id.split(" "))
            c = len(set(key_near_id.split(" ")) & set(near_id.split(" ")))
            len_list.append([a,b,c])
        df = pd.DataFrame(len_list)
        df.columns = ["id_len","near_id_len","common_len"]
        df["id_rate"] =  df["common_len"] / df["id_len"]
        df["near_id_rate"] = df["common_len"] / df["near_id_len"]
        df["id"] = id_list
        df = df[df["common_len"] != 0].reset_index(drop=True)
        df = df[(df["id_rate"] >= 0.5) | (df["near_id_rate"] >= 0.5)].reset_index(drop=True)
        if len(df) > 1:
            for k in df["id"]:
                key_id_list.append(k)
            all_id = near_id_value[df_docs_3["id"].isin(df["id"])]
            all_id_concat = []
            for i in all_id:
                all_id_concat += i.split(" ")
            all_id_unique = list(set(all_id_concat))
            near_id_value[df_docs_3["id"].isin(df["id"])] = " ".join(all_id_unique)

100%|██████████| 31243/31243 [25:11<00:00, 20.67it/s] 


In [25]:
df_docs_3_ = df_docs_3.copy()
df_docs_3_["matches"] = near_id_value
df_docs_under_3 = docs[docs["matches_len"] <= 3].reset_index(drop=True)
df_docs = pd.concat([df_docs_under_3[["id","matches"]],df_docs_3_[["id","matches"]]]).reset_index(drop=True)
score = get_score(df_docs)
print(score)

0.915899871335202


In [26]:
df_docs_3_.to_csv(f"../output/exp/ex{exp}/exp{exp}_pp_only.csv",index=False)
df_docs.to_csv(f"../output/exp/ex{exp}/exp{exp}_pp.csv",index=False)

In [27]:
new_pair = []
id_array = df_docs_3_["id"].values
match_array = df_docs_3_["matches"].values
for i in tqdm(range(len(df_docs_3_))):
    id_ = id_array[i]
    match_ = match_array[i]
    df_ = pd.DataFrame()
    df_["near_id"] = match_.split(" ")
    df_["id"] = id_
    new_pair.append(df_)

100%|██████████| 31243/31243 [00:44<00:00, 706.20it/s]


In [28]:
val_all["id_near_id"] = val_all["id"].astype(str) + "-" + val_all["near_id"].astype(str)
new_pair = pd.concat(new_pair).reset_index(drop=True)
new_pair["id_near_id"] = new_pair["id"].astype(str) + "-" + new_pair["near_id"].astype(str)
new_pair = new_pair[~new_pair["id_near_id"].isin(val_all["id_near_id"])].reset_index(drop=True)

In [29]:
new_pair = new_pair.merge(train_raw[["id","point_of_interest"]],how="left",on="id")
train_raw_ = train_raw[["id","point_of_interest"]].copy()
train_raw_.columns = ["near_id","near_point_of_interest"]
new_pair = new_pair.merge(train_raw_[["near_id","near_point_of_interest"]],how="left",on="near_id")
new_pair["target"] = new_pair["point_of_interest"] == new_pair["near_point_of_interest"]
new_pair["target"] = new_pair["target"].astype(int)

In [31]:
new_pair.to_csv("../output/exp/ex113/ex113_pp_new_pair_fold0.csv",index=False)