In [1]:
# =========================
# Library
# =========================
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys, os
from transformers import DistilBertModel, DistilBertTokenizer,AutoModel,AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import logging
import sys
from contextlib import contextmanager
import time
import random
import math
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import gc
import transformers
transformers.logging.set_verbosity_error()
%env TOKENIZERS_PARALLELISM=true
%matplotlib inline

2022-07-07 20:48:29.726571: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


env: TOKENIZERS_PARALLELISM=true


In [2]:
# =========================
# Constant
# =========================
# Location of the training CSV, relative to the notebook's working directory;
# it is read with pd.read_csv further down.
TRAIN_PATH = "../data/train.csv"
# Column name handed to GroupKFold.split as both `y` and `groups` further down.
TARGET = "point_of_interest"

In [3]:
# =========================
# Settings
# =========================
# Experiment tag; every output path below is derived from it.
exp = "113"
# makedirs(..., exist_ok=True) creates the experiment directory and its model/
# subdirectory when either one is missing and is a no-op when both exist, so
# the cell is safe to re-run and also repairs a half-created output tree.
os.makedirs(f"../output/exp/ex{exp}/model", exist_ok=True)
# Log file handed to setup_logger.
LOGGER_PATH = f"../output/exp/ex{exp}/ex_{exp}.txt"
# Filename prefix inside the model/ directory for this experiment.
MODEL_PATH_BASE = f"../output/exp/ex{exp}/model/ex{exp}"
# Model identifier string (NOTE(review): presumably a Hugging Face hub id — it
# is not used in the visible part of the notebook).
MODEL_PATH = "microsoft/mdeberta-v3-base"
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

val_fold = 0

In [4]:
# ===============
# Functions
# ===============
def setup_logger(out_file=None, stderr=True, stderr_level=logging.INFO, file_level=logging.DEBUG):
    """Reset the module-level ``LOGGER`` and attach freshly configured handlers.

    Relies on two module-level names that must exist before the call:
    ``LOGGER`` (the logger to configure) and ``FORMATTER`` (applied to every
    handler created here).

    Args:
        out_file: Path of a log file to write to; ``None`` skips the file handler.
        stderr: When true, also attach a handler that writes to ``sys.stderr``.
        stderr_level: Level threshold of the stderr handler.
        file_level: Level threshold of the file handler.

    Returns:
        The configured ``LOGGER`` object.
    """
    # Detach AND close whatever was attached before. Merely clearing the list
    # would leave earlier FileHandlers holding their files open, which leaks
    # one file handle every time this is re-run in a live kernel.
    for old_handler in list(LOGGER.handlers):
        LOGGER.removeHandler(old_handler)
        old_handler.close()

    # The logger itself must let through the more verbose of the two levels;
    # each handler then filters down to its own threshold.
    LOGGER.setLevel(min(stderr_level, file_level))

    if stderr:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(FORMATTER)
        handler.setLevel(stderr_level)
        LOGGER.addHandler(handler)

    if out_file is not None:
        handler = logging.FileHandler(out_file)
        handler.setFormatter(FORMATTER)
        handler.setLevel(file_level)
        LOGGER.addHandler(handler)

    LOGGER.info("logger set up")
    return LOGGER

@contextmanager
def timer(name):
    """Context manager that logs how long the wrapped block took.

    On normal exit it emits ``[<name>] done in <seconds> s`` at INFO level on
    the module-level ``LOGGER``. There is no try/finally around the yield, so
    nothing is logged when the wrapped block raises.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s')

In [7]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every value of the ``id`` column to the ``point_of_interest`` on the same row.

    When an id appears on several rows, the value from the last such row wins.
    """
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {row_id: poi for row_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every ``point_of_interest`` value to the set of ``id`` values that carry it."""
    ids_grouped_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_grouped_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame):
    """Mean intersection-over-union between predicted and true match sets.

    For each row, the whitespace-separated ids in ``matches`` are compared with
    the ids that share the row's point of interest. The ground truth comes from
    the module-level lookups ``id2poi`` and ``poi2ids``, which must be populated
    before calling this function.

    Returns:
        The mean of the per-row IoU values.
    """
    def _row_iou(row_id, match_str):
        truth = poi2ids[id2poi[row_id]]
        predicted = set(match_str.split())
        return len(truth & predicted) / len(truth | predicted)

    row_ids = input_df['id'].to_numpy()
    match_strings = input_df['matches'].to_numpy()
    per_row = np.array([_row_iou(rid, ms) for rid, ms in zip(row_ids, match_strings)])
    return per_row.mean()

def join(df):
    """Return the items yielded by iterating ``df``, each passed through ``str``, joined by single spaces."""
    return " ".join(map(str, df))

def id_sort(id_near_id):
    """Return ``id_near_id`` with its ``-``-separated parts in ascending string order.

    Two keys built from the same parts in a different order therefore map to
    the same string.
    """
    parts = id_near_id.split("-")
    parts.sort()
    return "-".join(parts)

In [8]:
# Root logger (getLogger() called without a name). setup_logger and timer read
# this module-level name, so it has to exist before either is called.
LOGGER = logging.getLogger()
# Record layout applied by setup_logger to every handler it attaches.
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# Attach the stderr handler plus a file handler writing to LOGGER_PATH. As the
# last expression of the cell, the returned logger is echoed as the cell output.
setup_logger(out_file=LOGGER_PATH)

2022-07-07 20:48:36,197 - INFO - logger set up


<RootLogger root (DEBUG)>

In [9]:
# ============================
# Main
# ============================
# Training table, read from TRAIN_PATH.
train = pd.read_csv(TRAIN_PATH)
# Prediction table saved by experiment ex062 (presumably one row per id/near_id
# pair — verify against the file), left-joined with the country of each `id`.
train_fold0 = (
    pd.read_csv("../output/exp/ex062/ex062_pred.csv")
    .merge(train[["id", "country"]], how="left", on="id")
)

In [10]:
# Prediction table saved by experiment ex087; its columns are renamed
# positionally, so the file must contain exactly three columns in this order.
cat = pd.read_csv("../output/exp/ex087/ex087_pred.csv")
cat.columns = ["id","near_id","cat_pred"]
# Left join keeps every row of train_fold0; pairs absent from `cat` get NaN.
# NOTE(review): not idempotent — re-running this cell without reloading
# train_fold0 merges a second cat_pred, and pandas then suffixes the clashing
# columns (cat_pred_x / cat_pred_y), breaking the later "cat_pred" lookup.
train_fold0 = train_fold0.merge(cat,how="left",on=["id","near_id"])

In [14]:
# Saved prediction arrays from two other experiments (presumably out-of-fold
# scores, going by the file name; which model produced each is not shown here).
# Both are flattened and added element-wise to train_fold0 columns further down,
# so each must hold one value per train_fold0 row, in the same row order.
deberta = np.load("../output/exp/ex_charm_05/oof.npy")
roberta = np.load("../output/exp/ex100/oof.npy")

In [15]:
# Fresh copy of the training table that will carry the fold assignment.
train_raw = pd.read_csv(TRAIN_PATH)
kf = GroupKFold(n_splits=2)
# The TARGET column is passed as the grouping key, so rows sharing a TARGET
# value are never split across the two folds. "set" records, for each row, the
# index of the split in which that row falls in the validation part.
for i, (trn_idx, val_idx) in enumerate(
    kf.split(X=train_raw, y=train_raw[TARGET], groups=train_raw[TARGET])
):
    train_raw.loc[val_idx, "set"] = i

In [18]:
# Blend weights for the four scores combined into train_fold0["ensemble"] in the
# next cell; together they sum to ~1.0.
# NOTE(review): how these values were obtained is not shown in this notebook —
# presumably tuned offline; record the source.
w1 = 0.013225176172449034  # weight of train_fold0["pred"]
w2 = 0.28751060139700985  # weight of the `roberta` array
w3 = 0.3813813593361608  # weight of the `deberta` array
w4 = 0.31788286309438024  # weight of train_fold0["cat_pred"]

In [19]:
# Weighted blend of the four scores; the two numpy arrays are flattened and
# combined with the DataFrame columns position by position.
train_fold0["ensemble"] = (
    train_fold0["pred"] * w1
    + roberta.reshape(-1) * w2
    + deberta.reshape(-1) * w3
    + train_fold0["cat_pred"] * w4
)
# Pairs whose blended score reaches 0.5 are kept as matches.
val_tr_ = train_fold0.loc[train_fold0["ensemble"] >= 0.5].reset_index(drop=True)
# Ids assigned to set 0, each paired with itself so that every such id appears
# in its own match list.
val_id = train_raw.loc[train_raw["set"] == 0, "id"].unique()
val_id_match = pd.DataFrame({"id": val_id, "near_id": val_id})
val_all = pd.concat([val_id_match, val_tr_[["id", "near_id"]]]).reset_index(drop=True)
# Add the mirrored (near_id, id) pairs so the relation is symmetric, then drop
# exact repeats.
val_all_ = val_all.rename(columns={"id": "near_id", "near_id": "id"})
val_all = (
    pd.concat([val_all, val_all_])
    .drop_duplicates(subset=["id", "near_id"])
    .reset_index(drop=True)
)
del val_all_
gc.collect()
# Attach the point_of_interest of each `id` from train_raw.
val_all = (
    val_all
    .merge(train_raw[["id", "point_of_interest"]], how="left", on="id")
    .reset_index(drop=True)
)

2022-07-07 20:49:44,620 - INFO - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-07-07 20:49:44,621 - INFO - NumExpr defaulting to 8 threads.


In [20]:
# add new pair
# Table of additional pairs, read from this experiment's output directory.
# NOTE(review): this path hard-codes "ex113" while the next line derives its
# path from `exp`; the two only agree while exp == "113".
train_fold0_pp = pd.read_csv("../output/exp/ex113/ex113_pp_new_pair_fold0.csv")
# Prediction array for those pairs — presumably one value per CSV row, in the
# same order; TODO confirm against the code that wrote oof.npy.
pp_pred = np.load(f"../output/exp/ex{exp}/oof.npy")
# Flatten to 1-D before storing it as a column.
train_fold0_pp["pp_pred"] = pp_pred.reshape(-1)

In [22]:
# Order-independent pair key: id_sort sorts the "-"-separated parts of each
# id_near_id value, so a pair and its mirror get the same key (assumes
# id_near_id holds the two ids joined by "-" — verify against the CSV).
train_fold0_pp["id_near_id_sort"] = train_fold0_pp["id_near_id"].map(id_sort)

In [25]:
# Keep the first row for each order-independent pair key and renumber the rows
# from 0.
train_fold0_pp = train_fold0_pp.drop_duplicates(
    subset="id_near_id_sort",
    ignore_index=True,
)

In [26]:
# Threshold adjustment
# For each candidate cut-off, add the extra pairs scoring above it to the
# baseline matches in val_all and print the resulting mean IoU.
for i in (0.01, 0.02, 0.03):
    # Extra pairs whose pp_pred is strictly above the cut-off.
    train_fold0_pp_ = train_fold0_pp.loc[train_fold0_pp["pp_pred"] > i, ["id", "near_id"]]
    # Mirror each pair, drop exact repeats, then attach the point_of_interest
    # of every `id`.
    train_fold0_pp__ = train_fold0_pp_.rename(columns={"id": "near_id", "near_id": "id"})
    train_fold0_pp_ = (
        pd.concat([train_fold0_pp_, train_fold0_pp__])
        .drop_duplicates(subset=["id", "near_id"])
        .reset_index(drop=True)
        .merge(train_raw[["id", "point_of_interest"]], how="left", on="id")
    )
    val_all_ = pd.concat([val_all, train_fold0_pp_[["id", "near_id", "point_of_interest"]]])
    # get_score reads these two module-level lookups, so they must be rebuilt
    # from the current pair table before every call.
    id2poi = get_id2poi(val_all_)
    poi2ids = get_poi2ids(val_all_)
    # One row per id, with all of its near_id values joined by spaces.
    docs = (
        val_all_.groupby("id")["near_id"]
        .apply(join)
        .rename("matches")
        .reset_index()
    )
    score = get_score(docs)
    print(i, score)

0.01 0.9166754754547339
0.02 0.9166534470005328
0.03 0.9166095356061639
