In [1]:
# =========================
# Library
# =========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import warnings
import seaborn as sns
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib
import Levenshtein
import difflib
from contextlib import contextmanager
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import log_loss
import logging
warnings.filterwarnings('ignore')
import sys
sys.path.append("../src/")
from logger import setup_logger, LOGGER
from trainer import train_lgbm
from util_tool import reduce_mem_usage
pd.set_option('display.max_columns', 300)

In [2]:
# =========================
# Constant
# =========================
# Raw training CSV; read with pd.read_csv in the CV section further down.
TRAIN_PATH = "../data/train.csv"
# Ground-truth label column; also passed as the grouping key to GroupKFold.
TARGET = "point_of_interest"

In [3]:
# =========================
# Settings
# =========================
# Experiment id (used in every output path) and the fold held out for validation.
exp = "062"
val_fold = 0

LOGGER_PATH = f"../output/exp/ex{exp}/ex_{exp}.txt"
MODEL_PATH = f"../output/exp/ex{exp}/model"

# Creating the nested model directory also creates the experiment directory.
# exist_ok=True keeps the cell re-runnable and repairs a half-created layout
# (experiment directory present, model directory missing).
os.makedirs(MODEL_PATH, exist_ok=True)

# Seed forwarded to LightGBM's bagging sampler below.
SEED = 0

# Booster parameters handed to train_lgbm as `lgb_params`.
LGBM_PARAMS = {
    'num_leaves': 32,
    'min_data_in_leaf': 64,
    'objective': 'binary',
    'max_depth': -1,
    'learning_rate': 0.1,
    'boosting': 'gbdt',
    'bagging_freq': 1,
    'bagging_fraction': 0.8,
    'bagging_seed': SEED,
    'verbosity': -1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.3,
    'colsample_bytree': 0.7,
    'metric': 'binary_logloss',
    'num_threads': 56,
}

# Training-loop controls handed to train_lgbm as `fit_params`.
# In the recorded run the model used all 3000 rounds without triggering early stopping.
LGBM_FIT_PARAMS = {
    'num_boost_round': 3000,
    'early_stopping_rounds': 200,
    'verbose_eval': 1000,
}

# Pickled feature frames; the first is loaded as the base and the rest are
# concatenated to it column-wise.
fe_list = [
    "../output/fe/fe066.pkl",
    "../output/fe/fe067.pkl",
    "../output/fe/fe068.pkl",
    "../output/fe/fe069.pkl",
]

In [4]:
# =========================
# Functions
# =========================

def calc_loss(y_true, y_pred):
    """Return the log loss of `y_pred` against `y_true` (thin wrapper around `log_loss`)."""
    loss = log_loss(y_true, y_pred)
    return loss


@contextmanager
def timer(name):
    """Context manager that logs the wall-clock duration of the wrapped block.

    Args:
        name: Label placed in square brackets at the start of the log line.

    The duration is written through LOGGER.info when the block exits. The
    try/finally ensures the line is also emitted when the block raises, so a
    failed long-running step still reports how long it ran; the exception
    itself propagates unchanged.
    """
    t0 = time.time()
    try:
        yield
    finally:
        LOGGER.info(f'[{name}] done in {time.time() - t0:.0f} s')
    


# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map each value of the `id` column to its `point_of_interest` value.

    When an id appears on several rows, the last row's value is kept.
    """
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {record_id: poi for record_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map each `point_of_interest` value to the set of `id` values that share it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    return {poi: set(member_ids) for poi, member_ids in ids_by_poi}

def get_score(input_df: pd.DataFrame, id2poi: dict = None, poi2ids: dict = None):
    """Mean intersection-over-union between predicted and true match sets.

    Args:
        input_df: Frame with an `id` column and a `matches` column holding
            whitespace-separated predicted ids for that row.
        id2poi: Optional mapping id -> point of interest. When omitted, the
            notebook-level global `id2poi` is used (the original behaviour).
        poi2ids: Optional mapping point of interest -> set of ids. When
            omitted, the notebook-level global `poi2ids` is used.

    Returns:
        The mean over rows of |targets & preds| / |targets | preds|.
        An empty `input_df` yields NaN (mean of an empty array).

    Raises:
        KeyError: if a lookup table is neither passed in nor defined as a
            global, or if an id / POI is missing from the tables.
    """
    # Fall back to the module-level tables so existing get_score(df) calls keep working.
    if id2poi is None:
        id2poi = globals()["id2poi"]
    if poi2ids is None:
        poi2ids = globals()["poi2ids"]

    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        scores.append(len(targets & preds) / len(targets | preds))
    return np.array(scores).mean()


def join(df):
    """Return the elements of `df` (any iterable) as one space-separated string."""
    return " ".join(map(str, df))

In [5]:
# Rebinds LOGGER (previously imported from `logger`) to the root logger;
# timer() and the training cell log through this name.
LOGGER = logging.getLogger()
# NOTE(review): FORMATTER is not referenced again in this notebook — presumably
# setup_logger configures its own format; confirm before relying on it.
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# Project helper from ../src/logger; called with the experiment log file path.
# Its return value is the cell's displayed output.
setup_logger(out_file=LOGGER_PATH)

2022-06-02 12:30:25,119 - INFO - logger set up


<RootLogger root (DEBUG)>

In [6]:
# =============================
# Main
# =============================
# Load the first feature file as the base frame; the remaining fe_list entries
# are concatenated onto it column-wise in the next cell.
train = pd.read_pickle(fe_list[0]) 

In [7]:
# Append the remaining feature files to `train` column-wise, one file at a time.
for fe_path in fe_list[1:]:
    extra_features = pd.read_pickle(fe_path)
    train = pd.concat([train, extra_features], axis=1)
    # Drop the just-merged frame before loading the next file.
    del extra_features
    gc.collect()
# Disabled label derivation, kept for reference. The training cell reads
# train["target"], so the column presumably already exists in the feature
# files — TODO confirm.
# train["target"] = train[TARGET] == train["near_target"]
# train["target"] = train["target"].astype(int)

In [8]:
# Split rows on the "set" column: the val_fold rows become the validation
# frame, everything else the training frame.
in_val_fold = train["set"] == val_fold
train_tr = train[~in_val_fold].reset_index(drop=True)
val_tr = train[in_val_fold].reset_index(drop=True)
# The combined frame is no longer needed once both halves are materialised.
del train, in_val_fold
gc.collect()

23

In [9]:
# Columns excluded from the model input: identifiers, raw text fields for
# both sides of the pair, fold and out-of-fold bookkeeping, and label columns.
drop_cols = [
    'id', 'name', 'match', 'oof', 'oof_rank',
    'address', 'city', 'state', 'zip', 'country', 'url', 'phone',
    'categories', 'point_of_interest', 'set',
    'near_target', 'near_id', 'near_name', 'near_address', 'near_city',
    'near_state', 'near_zip', 'near_country', 'near_url', 'near_phone',
    'near_categories', 'target',
]
# Every remaining column, in frame order, is a model feature.
features = [col for col in train_tr.columns if col not in drop_cols]

In [11]:
# Train a single LightGBM model on the non-validation rows and evaluate it on
# the held-out fold. Leaves `y_pred_valid`, `model` and `feature_importances`
# in the namespace for the cells below.
with timer("lightgbm"):
    # NOTE(review): this resets the drop_cols list built earlier; it is not read again in this cell.
    drop_cols = []
    feature_importances = pd.DataFrame()
    categorical_features = []
    # Follow the configured validation fold so the log line and fold_id agree
    # with the split above and with the saved model's file name.
    fold = val_fold
    with timer(f"fold {fold}"):
        x_train = train_tr[features].values.astype(np.float32)
        x_val = val_tr[features].values.astype(np.float32)
        y_train = train_tr["target"].values
        y_val = val_tr["target"].values
        print("train:",len(x_train))
        # None is passed in the fifth position (no test set), so y_pred_test is unused here.
        y_pred_valid, y_pred_test, valid_loss, importances, best_iter, model = train_lgbm(
                    x_train, y_train, x_val, y_val,None,
                    categorical_features=categorical_features,
                    feature_name=features,
                    fold_id=fold,
                    lgb_params=LGBM_PARAMS,
                    fit_params=LGBM_FIT_PARAMS,
                    loss_func=calc_loss,
                    calc_importances=True
                )

        # No trailing comma: score must be a float, not a 1-tuple, so the log reads CV=0.0331... .
        score = calc_loss(y_val, y_pred_valid)
        LOGGER.info(f'Fold{fold}:CV={score}')
        feature_importances = pd.concat([feature_importances, importances], axis=0, sort=False)

train: 21745841
Training until validation scores don't improve for 200 rounds
[1000]	training's binary_logloss: 0.026989	valid_1's binary_logloss: 0.034573
[2000]	training's binary_logloss: 0.0230933	valid_1's binary_logloss: 0.0335746
[3000]	training's binary_logloss: 0.0205002	valid_1's binary_logloss: 0.0331257
Did not meet early stopping. Best iteration is:
[3000]	training's binary_logloss: 0.0205002	valid_1's binary_logloss: 0.0331257


2022-06-02 13:02:21,950 - INFO - Fold0:CV=(0.033125739157239556,)
2022-06-02 13:02:21,952 - INFO - [fold 0] done in 1724 s
2022-06-02 13:02:21,952 - INFO - [lightgbm] done in 1724 s


In [12]:
# Persist the validation-fold predictions for this experiment.
np.save(f"../output/exp/ex{exp}/oof.npy",y_pred_valid)

In [13]:
# Save the trained booster twice: as a pickle and via its own save_model text format.
# `pickle` is already imported in the first cell.
save_path = f"{MODEL_PATH}/lgb_fold{val_fold}.pkl"
# The context manager closes (and flushes) the file handle deterministically;
# a bare open() passed to pickle.dump is never closed explicitly.
with open(save_path, 'wb') as model_file:
    pickle.dump(model, model_file)
model.save_model(f"{MODEL_PATH}/lgb_fold{val_fold}.txt")

<lightgbm.basic.Booster at 0x7f83131e0350>

In [None]:
# Compute the CV score

In [14]:
# Load the raw rows and tag each with a fold number in a "set" column, using a
# 2-way GroupKFold keyed on the POI label so a POI never spans both folds.
# NOTE(review): presumably this must reproduce the "set" values stored in the
# feature files — confirm against the feature-engineering code.
train_raw = pd.read_csv(TRAIN_PATH)
poi_labels = train_raw[TARGET]
kf = GroupKFold(n_splits=2)
for fold_no, (_, holdout_rows) in enumerate(kf.split(train_raw, poi_labels, poi_labels)):
    train_raw.loc[holdout_rows, "set"] = fold_no

In [15]:
# Attach the model's probabilities and keep only pairs scored at 0.5 or above.
val_tr["pred"] = y_pred_valid
val_tr_ = val_tr[val_tr["pred"] >= 0.5].reset_index(drop=True)
gc.collect()
# Every id in the validation fold matches itself, so seed the pair list with (id, id) rows.
val_id = train_raw.loc[train_raw["set"] == val_fold, "id"].unique()
gc.collect()
val_id_match = pd.DataFrame({"id": val_id, "near_id": val_id})
# Self-matches first, then the predicted (id, near_id) pairs.
predicted_pairs = val_tr_[["id", "near_id"]]
val_all = pd.concat([val_id_match, predicted_pairs]).reset_index(drop=True)

In [16]:
# Make the match relation symmetric: append every pair with its two columns
# swapped, then remove duplicate (id, near_id) rows.
swapped_pairs = val_all.rename(columns={"id": "near_id", "near_id": "id"})
val_all = (
    pd.concat([val_all, swapped_pairs])
    .reset_index(drop=True)
    .drop_duplicates(subset=["id", "near_id"])
    .reset_index(drop=True)
)
del swapped_pairs
gc.collect()

23

In [17]:
# Attach the ground-truth POI of each `id`.
# train_raw is reused from the fold-assignment cell rather than re-read from
# disk: re-reading overwrote it and dropped its "set" column, which broke
# re-running the cell that filters on "set" (and cost a redundant CSV load).
# Selecting ["id", "near_id"] first keeps this cell idempotent — a second run
# does not produce point_of_interest_x / point_of_interest_y columns.
val_all = (
    val_all[["id", "near_id"]]
    .merge(train_raw[["id", "point_of_interest"]], how="left", on="id")
    .reset_index(drop=True)
)

In [18]:
# Lookup tables that get_score() reads as notebook-level globals:
# id -> point of interest, and point of interest -> set of ids.
id2poi = get_id2poi(val_all)
poi2ids = get_poi2ids(val_all)

In [19]:
# One entry per id: all of its near_id values joined into a single
# space-separated string, which get_score() later splits on whitespace.
docs = val_all.groupby("id")["near_id"].apply(join)

In [20]:
# Turn the grouped Series into the (id, matches) frame get_score() expects,
# then report the mean IoU for the validation fold.
docs = docs.reset_index().rename(columns={"near_id": "matches"})
score = get_score(docs)
print(score)

0.8751669446620102
