In [1]:
# =========================
# Library
# =========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import warnings
import seaborn as sns
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib
import Levenshtein
import difflib
from contextlib import contextmanager
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import log_loss
import pickle
import logging
warnings.filterwarnings('ignore')
import sys
sys.path.append("../src/")
from logger import setup_logger, LOGGER
from trainer import train_lgbm
from util_tool import reduce_mem_usage
pd.set_option('display.max_columns', 300)

In [2]:
# =========================
# Constant
# =========================
# Location of the raw training CSV.
# NOTE(review): not referenced by any other visible cell — the notebook loads
# a pre-built feature pickle instead; confirm whether this is still needed.
TRAIN_PATH = "../data/train.csv"
# Name of the ground-truth column; it is compared against "near_target" to
# build the binary label.
TARGET = "point_of_interest"

In [3]:
# =========================
# Settings
# =========================
# Experiment identifier; used to build every output path below.
exp = "061"

# Root folder holding all artifacts of this experiment.
EXP_DIR = f"../output/exp/ex{exp}"
LOGGER_PATH = f"{EXP_DIR}/ex_{exp}.txt"
MODEL_PATH = f"{EXP_DIR}/model"

# Create the experiment folder and its model sub-folder in one call.
# exist_ok=True makes the cell idempotent and also repairs a partially created
# tree (experiment folder present but "model" missing), which the previous
# `if not os.path.exists(...)` guard silently skipped.
os.makedirs(MODEL_PATH, exist_ok=True)

# Seed forwarded to LightGBM's bagging sampler.
SEED = 0

# Booster parameters passed to train_lgbm as `lgb_params`.
LGBM_PARAMS = {
    # tree shape
    "num_leaves": 32,
    "min_data_in_leaf": 64,
    "objective": "binary",
    "max_depth": -1,
    "learning_rate": 0.1,
    "boosting": "gbdt",
    # row subsampling
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": SEED,
    "verbosity": -1,
    # regularisation / column subsampling
    "reg_alpha": 0.1,
    "reg_lambda": 0.3,
    "colsample_bytree": 0.7,
    "metric": "binary_logloss",
    # NOTE(review): thread count is hard-coded — presumably sized for the
    # original training machine; adjust when running elsewhere.
    "num_threads": 56,
}

# Training-loop options passed to train_lgbm as `fit_params`.
LGBM_FIT_PARAMS = {
    "num_boost_round": 1000,
    "early_stopping_rounds": 300,
    "verbose_eval": 500,
}

# Feature pickles to load; only the first entry is read by this notebook.
fe_list = [
    "../output/fe/fe065.pkl",
]

In [4]:
# =========================
# Functions
# =========================

def calc_loss(y_true, y_pred):
    """Return the log loss of ``y_pred`` measured against ``y_true``.

    Delegates to ``sklearn.metrics.log_loss`` with its default arguments;
    both inputs are forwarded unchanged.
    """
    loss = log_loss(y_true, y_pred)
    return loss


@contextmanager
def timer(name):
    """Context manager that logs the wall-clock duration of its body.

    Args:
        name: Label placed in square brackets at the start of the log line.

    The duration is written through ``LOGGER.info`` in whole seconds once the
    body finishes. If the body raises, nothing is logged, because the code
    after ``yield`` is not protected by ``try``/``finally``.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s')

In [5]:
# Rebind LOGGER to the root logger. This replaces the LOGGER object imported
# from the project-local `logger` module in the import cell.
LOGGER = logging.getLogger()
# NOTE(review): FORMATTER is not referenced by any other visible cell —
# presumably setup_logger installs its own formatter; confirm before removing.
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# Project helper, called with the experiment log file path. Kept as the last
# expression so the notebook displays its return value.
setup_logger(out_file=LOGGER_PATH)

2022-06-02 04:49:28,505 - INFO - logger set up


<RootLogger root (DEBUG)>

In [6]:
# =============================
# Main
# =============================
# Load the pre-built feature table (first entry of fe_list).
# NOTE: unpickling can execute arbitrary code — only load files produced by
# this project's own feature-engineering step.
train = pd.read_pickle(fe_list[0]) 

In [7]:
# Binary label: 1 where the row's TARGET value equals its "near_target"
# value, 0 otherwise (missing values never compare equal, so they yield 0).
train["target"] = (train[TARGET] == train["near_target"]).astype(int)

In [8]:
# Label vector as a NumPy array, row-aligned with `train`.
y = train["target"].values

In [9]:
# Fold assignment per row, taken from the "set" column and cast to int.
# NOTE(review): the training loop iterates over folds 0 and 1 only — assumes
# "set" contains exactly those two values; confirm against the feature step.
fold_array = train["set"].values.astype(int)

In [10]:
# Columns that must not be fed to the model; every other column of `train`
# becomes a feature.
drop_cols = [
    "id", "name", "name_num",
    "address", "city", "state", "zip", "country", "url", "phone",
    "categories", "point_of_interest", "set",
    "near_target", "near_id", "near_name", "near_address", "near_city",
    "near_state", "near_zip", "near_country", "near_url", "near_phone",
    "near_categories", "target", "match", "id_near_id",
]
features = [col for col in train.columns if col not in drop_cols]
print(features)

['latitude', 'longitude', 'rank', 'd_near', 'near_latitude', 'near_longitude', 'name_jaro', 'distance']


In [11]:
# Two-fold cross-validated LightGBM training.
# For each fold: train on the other fold, predict the held-out rows into
# `y_oof`, log the validation loss, and save the model as both a pickle and
# LightGBM's text format. The out-of-fold predictions are saved at the end.
with timer("lightgbm"):
    # Per-fold importance frames; concatenated once after the loop instead of
    # growing a DataFrame inside it.
    fold_importances = []
    categorical_features = []
    # NaN-initialised (not np.empty) so a row that belongs to no validation
    # fold is detectable instead of carrying uninitialised memory.
    y_oof = np.full(len(train), np.nan)
    for fold in range(2):
        with timer(f"fold {fold}"):
            is_val = fold_array == fold
            x_train = train[~is_val][features].values.astype(np.float32)
            y_train = y[~is_val]
            x_val = train[is_val][features].values.astype(np.float32)
            y_val = y[is_val]
            # The fifth positional argument is None — presumably the test
            # matrix, which this notebook does not use; confirm against
            # trainer.train_lgbm.
            y_pred_valid, y_pred_test, valid_loss, importances, best_iter, model = train_lgbm(
                x_train, y_train, x_val, y_val, None,
                categorical_features=categorical_features,
                feature_name=features,
                fold_id=fold,
                lgb_params=LGBM_PARAMS,
                fit_params=LGBM_FIT_PARAMS,
                loss_func=calc_loss,
                calc_importances=True
            )
            y_oof[is_val] = y_pred_valid
            # No trailing comma here: the score must be a scalar, not a
            # 1-tuple, so the log reads "CV=0.019..." rather than "CV=(0.019...,)".
            score = calc_loss(y_val, y_pred_valid)
            LOGGER.info(f'Fold{fold}:CV={score}')
            fold_importances.append(importances)
            save_path = f"{MODEL_PATH}/lgb_fold{fold}.pkl"
            # Context manager guarantees the file is flushed and closed.
            with open(save_path, 'wb') as model_file:
                pickle.dump(model, model_file)
            model.save_model(f"{MODEL_PATH}/lgb_fold{fold}.txt")
    feature_importances = pd.concat(fold_importances, axis=0, sort=False)
    n_missing = int(np.isnan(y_oof).sum())
    if n_missing:
        LOGGER.warning(f"{n_missing} rows have no out-of-fold prediction (NaN)")
    np.save(f"../output/exp/ex{exp}/oof.npy",y_oof)

Training until validation scores don't improve for 300 rounds
[500]	training's binary_logloss: 0.0141064	valid_1's binary_logloss: 0.0192384
[1000]	training's binary_logloss: 0.0130175	valid_1's binary_logloss: 0.019026
Did not meet early stopping. Best iteration is:
[1000]	training's binary_logloss: 0.0130175	valid_1's binary_logloss: 0.019026


2022-06-02 05:09:12,588 - INFO - Fold0:CV=(0.01902599166541387,)
2022-06-02 05:09:12,650 - INFO - [fold 0] done in 1022 s


Training until validation scores don't improve for 300 rounds
[500]	training's binary_logloss: 0.0148591	valid_1's binary_logloss: 0.0200669
[1000]	training's binary_logloss: 0.0137433	valid_1's binary_logloss: 0.0199221
Did not meet early stopping. Best iteration is:
[1000]	training's binary_logloss: 0.0137433	valid_1's binary_logloss: 0.0199221


2022-06-02 05:36:41,174 - INFO - Fold1:CV=(0.0199221076673244,)
2022-06-02 05:36:41,237 - INFO - [fold 1] done in 1649 s
2022-06-02 05:36:42,951 - INFO - [lightgbm] done in 2672 s


In [12]:
# Attach the out-of-fold predictions to the frame; y_oof was allocated with
# len(train) entries and filled by boolean masks over the rows of `train`.
train["oof"] = y_oof

In [13]:
# Convert "id" to a categorical dtype before grouping on it.
# NOTE(review): presumably done to reduce memory / speed up the groupby in
# the next cell — confirm.
train["id"] = train["id"].astype('category')

In [14]:
# Rank rows within each "id" group by OOF score, highest score first
# (ascending=False). No tie method is passed, so pandas' default ("average")
# applies and tied rows can receive fractional ranks.
train["oof_rank"] = train.groupby(by="id")["oof"].rank(ascending=False)

In [15]:
# Keep only rows whose "oof_rank" is at most 30 and renumber the index.
top_mask = train["oof_rank"] <= 30
train_ = train[top_mask].reset_index(drop=True)

In [16]:
# Persist the filtered frame — presumably consumed by a later stage.
# NOTE(review): this file is written directly under ../output/exp/ with an
# "exp" prefix, unlike the "ex{exp}" experiment folder used for the log,
# models and oof.npy — confirm downstream readers expect this exact path.
train_.to_pickle(f"../output/exp/exp{exp}_first_stage30.pkl")