In [1]:
# =========================
# Library
# =========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import warnings
import seaborn as sns
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib
import Levenshtein
import difflib
from contextlib import contextmanager
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import log_loss
import logging
warnings.filterwarnings('ignore')
import sys
sys.path.append("../src/")
from logger import setup_logger, LOGGER
from trainer import train_lgbm,train_cat_classifier
from util_tool import reduce_mem_usage
pd.set_option('display.max_columns', 300)

In [2]:
# =========================
# Constant
# =========================
# Raw training CSV; read with pd.read_csv in the CV-check cells further down.
TRAIN_PATH = "../data/train.csv"
# Column name passed as both y and groups to GroupKFold.split when the folds are rebuilt.
TARGET = "point_of_interest"

In [3]:
# =========================
# Settings
# =========================
# Experiment id; used to build every output path of this notebook.
exp = "087"
# Rows whose "set" column equals this value are held out for validation.
val_fold = 0

LOGGER_PATH = f"../output/exp/ex{exp}/ex_{exp}.txt"
MODEL_PATH = f"../output/exp/ex{exp}/model"

# makedirs creates the intermediate experiment directory as well.  exist_ok=True
# makes the call idempotent and, unlike the previous "parent exists?" guard,
# still creates model/ when the experiment directory exists without it.
os.makedirs(MODEL_PATH, exist_ok=True)

SEED = 0
# Passed as cat_params to train_cat_classifier in the training cell.
CAT_PARAMS = {
    'iterations': 4000,
    'learning_rate': 0.1,
    'loss_function': 'Logloss',
    'random_seed': SEED,
    'metric_period': 200,
    'od_wait': 200,
    'depth': 9,
}

# Feature pickles; the first is loaded as the base frame and the rest are
# concatenated onto it column-wise.
fe_list = ["../output/fe/fe066.pkl",
           "../output/fe/fe067.pkl",
           "../output/fe/fe068.pkl",
           "../output/fe/fe069.pkl"]

In [4]:
# =========================
# Functions
# =========================

def calc_loss(y_true, y_pred):
    """Return sklearn's ``log_loss`` for the given labels and predictions.

    Thin wrapper so the metric can be handed to ``train_cat_classifier`` as
    ``loss_func`` and reused for the fold's CV score.
    """
    return  log_loss(y_true, y_pred)


@contextmanager
def timer(name):
    """Log how long the wrapped ``with`` block took.

    After the block finishes normally, an INFO record of the form
    ``[<name>] done in <seconds> s`` is sent to the module-level ``LOGGER``.
    The ``yield`` is not guarded by ``try``/``finally``, so nothing is logged
    when the block raises.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s')
    


# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map each value of the ``id`` column to its ``point_of_interest``.

    Rows are paired positionally; when an id occurs more than once the last
    occurrence wins.
    """
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {row_id: poi for row_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map each ``point_of_interest`` to the set of ``id`` values sharing it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame, id2poi=None, poi2ids=None):
    """Return the mean per-row IoU between predicted and true match sets.

    For every row, the true set is ``poi2ids[id2poi[id]]`` and the predicted
    set is the whitespace-split ``matches`` string; the row score is
    ``|true & pred| / |true | pred|``.

    Parameters
    ----------
    input_df : pd.DataFrame
        Must contain an ``id`` column and a ``matches`` column holding
        space-separated ids.
    id2poi : dict, optional
        Mapping id -> point of interest.  When omitted, the module-level
        ``id2poi`` is used (the original behaviour).
    poi2ids : dict, optional
        Mapping point of interest -> set of ids.  When omitted, the
        module-level ``poi2ids`` is used (the original behaviour).

    Returns
    -------
    float
        Mean of the per-row scores (``nan`` for an empty frame).
    """
    # Fall back to the notebook globals so existing get_score(df) calls keep
    # working; passing the mappings explicitly avoids the hidden-state dependency.
    if id2poi is None:
        id2poi = globals()["id2poi"]
    if poi2ids is None:
        poi2ids = globals()["poi2ids"]

    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        scores.append(len(targets & preds) / len(targets | preds))
    return np.array(scores).mean()

def join(df):
    """Return the elements of ``df`` as one space-separated string.

    Every element is converted with ``str`` first, so non-string values are
    accepted.
    """
    return " ".join(map(str, df))


In [5]:
# Rebind LOGGER to the root logger; this replaces the LOGGER name imported
# from the project-local logger module in the import cell.
LOGGER = logging.getLogger()
# NOTE(review): FORMATTER is not referenced again in the visible cells —
# confirm whether anything still relies on it.
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# setup_logger is defined in the project-local logger module (not shown);
# presumably it attaches handlers that write to LOGGER_PATH — TODO confirm.
setup_logger(out_file=LOGGER_PATH)

2022-06-16 10:36:52,356 - INFO - logger set up


<RootLogger root (DEBUG)>

In [6]:
# Base frame: the first feature pickle. The remaining files in fe_list are
# concatenated onto it column-wise in the next cell.
train = pd.read_pickle(fe_list[0]) 

In [7]:
# Append the columns of every remaining feature pickle to the base frame,
# releasing each intermediate frame before the next one is loaded.
for fe_path in fe_list[1:]:
    extra_features = pd.read_pickle(fe_path)
    train = pd.concat([train, extra_features], axis=1)
    del extra_features
    gc.collect()
# train["target"] = train[TARGET] == train["near_target"]
# train["target"] = train["target"].astype(int)

In [8]:
# Split on the "set" column: rows of the validation fold go to val_tr,
# everything else to train_tr. The combined frame is dropped afterwards.
in_val_fold = train["set"] == val_fold
in_train_folds = train["set"] != val_fold
train_tr = train[in_train_folds].reset_index(drop=True)
val_tr = train[in_val_fold].reset_index(drop=True)
del train, in_val_fold, in_train_folds
gc.collect()

23

In [9]:
# Columns that are never fed to the model: identifiers, raw text fields,
# bookkeeping columns and the label itself.
drop_cols = [
    'id', 'name', 'match', 'oof', 'oof_rank',
    'address', 'city', 'state', 'zip', 'country', 'url', 'phone',
    'categories', 'point_of_interest', 'set',
    'near_target', 'near_id', 'near_name', 'near_address', 'near_city',
    'near_state', 'near_zip', 'near_country', 'near_url', 'near_phone',
    'near_categories', 'target',
]
# Every remaining column, in frame order, is used as a model feature.
_excluded = set(drop_cols)
features = [col for col in train_tr.columns if col not in _excluded]

In [11]:
# Train one CatBoost classifier on the non-validation rows and evaluate it on
# the held-out fold. Both timers log their duration through LOGGER.
with timer("catboost"):
    drop_cols = []
    feature_importances = pd.DataFrame()
    categorical_features = []
    fold = 0
    with timer(f"fold {fold}"):
        # Dense float32 matrices in the column order given by `features`.
        x_train = train_tr[features].values.astype(np.float32)
        x_val = val_tr[features].values.astype(np.float32)
        y_train = train_tr["target"].values
        y_val = val_tr["target"].values
        print("train:",len(x_train))
        # train_cat_classifier is project-local (not shown). The None argument
        # is presumably the test feature matrix — TODO confirm.
        y_pred_valid, y_pred_test, valid_loss, best_iter, model = train_cat_classifier(
                    x_train, y_train, x_val, y_val,None,
                    categorical_features=categorical_features,
                    feature_name=features,
                    cat_params=CAT_PARAMS,
                    loss_func=calc_loss,
                )

        # No trailing comma: the previous version bound `score` to a 1-tuple,
        # which was logged as "CV=(0.0328...,)" instead of the plain float.
        score = calc_loss(y_val, y_pred_valid)
        LOGGER.info(f'Fold{fold}:CV={score}')
        #feature_importances = pd.concat([feature_importances, importances], axis=0, sort=False)

train: 21745841




0:	learn: 0.4666030	test: 0.4663499	best: 0.4663499 (0)	total: 2.66s	remaining: 2h 57m 7s
200:	learn: 0.0361301	test: 0.0393333	best: 0.0393333 (200)	total: 7m 27s	remaining: 2h 20m 55s
400:	learn: 0.0324240	test: 0.0370057	best: 0.0370057 (400)	total: 14m 30s	remaining: 2h 10m 9s
600:	learn: 0.0302278	test: 0.0359036	best: 0.0359036 (600)	total: 21m 30s	remaining: 2h 1m 39s
800:	learn: 0.0286564	test: 0.0351624	best: 0.0351624 (800)	total: 28m 31s	remaining: 1h 53m 54s
1000:	learn: 0.0273320	test: 0.0346587	best: 0.0346587 (1000)	total: 35m 30s	remaining: 1h 46m 24s
1200:	learn: 0.0262562	test: 0.0342804	best: 0.0342804 (1200)	total: 42m 33s	remaining: 1h 39m 9s
1400:	learn: 0.0253196	test: 0.0339998	best: 0.0339998 (1400)	total: 49m 31s	remaining: 1h 31m 52s
1600:	learn: 0.0245358	test: 0.0338083	best: 0.0338081 (1599)	total: 56m 16s	remaining: 1h 24m 19s
1800:	learn: 0.0237986	test: 0.0336174	best: 0.0336174 (1800)	total: 1h 3m 10s	remaining: 1h 17m 8s
2000:	learn: 0.0231338	test: 0

2022-06-16 13:05:55,831 - INFO - Fold0:CV=(0.03283276317346528,)
2022-06-16 13:05:55,833 - INFO - [fold 0] done in 8271 s
2022-06-16 13:05:55,835 - INFO - [catboost] done in 8271 s


In [12]:
# Persist the validation-fold predictions next to the other experiment outputs.
oof_path = f"../output/exp/ex{exp}/oof.npy"
np.save(oof_path, y_pred_valid)

In [13]:
# Store the trained model inside the experiment's model directory.
model_file = f"../output/exp/ex{exp}/model/model"
model.save_model(model_file)

In [14]:
# Check the CV score

In [15]:
# Rebuild the 2-fold assignment on the raw data. The point-of-interest column
# serves as both y and the grouping key, so all rows of one POI share a fold.
train_raw = pd.read_csv(TRAIN_PATH)
poi_labels = train_raw[TARGET]
kf = GroupKFold(n_splits=2)
for fold_no, (_, holdout_idx) in enumerate(kf.split(train_raw, y=poi_labels, groups=poi_labels)):
    train_raw.loc[holdout_idx, "set"] = fold_no

In [16]:
# Column 1 of the validation predictions — presumably the probability of the
# positive class; TODO confirm against train_cat_classifier.
val_tr["pred"] = y_pred_valid[:, 1]

# The path must be an f-string: without the prefix the file was written to a
# literal "ex{exp}" directory instead of this experiment's output directory.
val_tr[["id", "near_id", "pred"]].to_csv(f"../output/exp/ex{exp}/ex{exp}_pred.csv", index=False)

# Keep only the candidate pairs scored at or above 0.5.
val_tr_ = val_tr[val_tr["pred"] >= 0.5].reset_index(drop=True)
gc.collect()

# Every validation id is paired with itself, so each id has at least one row
# even when no candidate pair passes the threshold.
val_id = train_raw[train_raw["set"] == val_fold]["id"].unique()
val_id_match = pd.DataFrame({"id": val_id, "near_id": val_id})
val_all = pd.concat([val_id_match, val_tr_[["id", "near_id"]]]).reset_index(drop=True)

In [17]:
#val_all = val_all[["id","near_id"]].reset_index(drop=True)
# Add the mirrored (near_id, id) row for every (id, near_id) pair so that
# matches are symmetric, then drop pairs that now occur more than once.
mirrored_pairs = val_all.rename(columns={"id": "near_id", "near_id": "id"})
val_all = pd.concat([val_all, mirrored_pairs]).reset_index(drop=True)
val_all = val_all.drop_duplicates(subset=["id","near_id"]).reset_index(drop=True)
del mirrored_pairs
gc.collect()

23

In [18]:
# Attach the point_of_interest of each `id` (left join keeps every pair).
train_raw = pd.read_csv(TRAIN_PATH)
poi_lookup = train_raw[["id","point_of_interest"]]
val_all = pd.merge(val_all, poi_lookup, how="left", on="id").reset_index(drop=True)

In [19]:
# get_score looks up id2poi and poi2ids as module-level names, so both must
# be bound before it is called.
id2poi = get_id2poi(val_all)
poi2ids = get_poi2ids(val_all)

In [20]:
# For each id, join all of its near_id values into one space-separated string
# (get_score later splits this string on whitespace).
docs = val_all.groupby("id")["near_id"].apply(join)

In [21]:
# Turn the grouped series into a two-column frame (id, matches) and report the
# mean IoU over the validation ids.
docs = docs.reset_index().set_axis(["id", "matches"], axis=1)
score = get_score(docs)
print(score)

0.878005066292219
