In [1]:
# =========================
# Library
# =========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import warnings
import seaborn as sns
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib
from joblib import Parallel, delayed
import Levenshtein
import difflib
from contextlib import contextmanager
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import log_loss
import unicodedata
warnings.filterwarnings('ignore')
import sys
sys.path.append("../src/")
from logger import setup_logger, LOGGER
from trainer import train_lgbm
from util_tool import reduce_mem_usage
pd.set_option('display.max_columns', 300)

In [2]:
# =========================
# Constant
# =========================
# CSV that is later read with pd.read_csv to obtain the raw per-id attribute columns.
TRAIN_PATH = "../data/train.csv"
# Name of the label column; it is also one of the columns carried into train_concat.
TARGET = "point_of_interest"

In [3]:
# =========================
# Settings
# =========================
# Tag embedded in the output file name (../output/fe/fe<fe>.pkl).
fe = "066"
# First-stage candidate tables that this notebook loads with pd.read_pickle.
train_neighbor_path_place = "../output/exp/exp037_first_stage30.pkl"
train_neighbor_path_name = "../output/exp/exp061_first_stage30.pkl"

In [4]:
# =========================
# Functions
# =========================
def calc_distance_2nd_stage(c1,c2):
    """Return three string-similarity features for one pair of values.

    Both values are canonicalised with ``text_preprocess`` and then compared.

    Parameters
    ----------
    c1, c2 : scalar
        The two values to compare (text, or a missing marker such as
        ``None`` / ``NaN``).

    Returns
    -------
    numpy.ndarray of shape (1, 3)
        ``[SequenceMatcher ratio, Levenshtein distance, Jaro-Winkler]``.
        All three entries are ``NaN`` when either value is missing.
    """
    missing = np.array([np.nan, np.nan, np.nan]).reshape(1, -1)

    # Detect missing values on the raw inputs: once stringified, None becomes
    # "none" and pd.NA becomes "<na>", which would be scored as real text
    # (two missing values would even look like a perfect match).
    if pd.isna(c1) or pd.isna(c2):
        return missing

    s1 = text_preprocess(c1)
    s2 = text_preprocess(c2)
    # Text that canonicalises to "nan" is still treated as missing, exactly as
    # before, so existing feature values do not change.
    if s1 == "nan" or s2 == "nan":
        return missing

    return np.array([difflib.SequenceMatcher(None, s1, s2).ratio(),
                     Levenshtein.distance(s1, s2),
                     Levenshtein.jaro_winkler(s1, s2)]).reshape(1, -1)
    
def text_preprocess(text):
    """Canonicalise a value into a compact, case-insensitive comparison key.

    The value is converted to ``str``, NFKC-normalised, stripped of spaces
    and lower-cased.

    Normalisation runs first on purpose: NFKC turns ideographic / no-break
    spaces into an ASCII space and expands compatibility characters into
    plain letters (e.g. the telephone sign becomes "TEL"). Removing spaces
    and lower-casing afterwards guarantees those products are handled too.

    Parameters
    ----------
    text : Any
        Value to canonicalise; non-strings are stringified (``NaN`` -> "nan").

    Returns
    -------
    str
        The canonical form.
    """
    text = unicodedata.normalize("NFKC", str(text))
    text = text.replace(" ", "")
    return text.lower()

In [5]:
# Load the two first-stage candidate tables configured in the Settings cell.
# NOTE: unpickling can execute arbitrary code, so only load files produced by
# trusted pipeline steps.
train_place = pd.read_pickle(train_neighbor_path_place)
train_name = pd.read_pickle(train_neighbor_path_name)

In [6]:
# Composite "<id>-<near_id>" key for each candidate pair.
# NOTE(review): this column is not among name_cols / place_cols, so it is not
# carried into train_concat by the visible cells - confirm it is still needed.
train_place["id_near_id"] = (
    train_place["id"].astype(str).str.cat(train_place["near_id"].astype(str), sep="-")
)
train_name["id_near_id"] = (
    train_name["id"].astype(str).str.cat(train_name["near_id"].astype(str), sep="-")
)

In [9]:
# Keep only the candidate pairs whose `oof_rank` is 20 or lower.
train_name = train_name.loc[train_name["oof_rank"] <= 20].reset_index(drop=True)
train_place = train_place.loc[train_place["oof_rank"] <= 20].reset_index(drop=True)

# Columns carried forward from the name-based candidates.
name_cols = [
    'id', 'name', 'latitude', 'longitude', 'point_of_interest',
    'set', 'rank', 'd_near', 'near_target', 'near_id',
    'near_name', 'near_latitude', 'near_longitude',
    'target',
]
# The place-based candidates contribute the same columns except 'rank' and
# 'd_near'; those two are therefore NaN for rows that come only from train_place.
place_cols = [col for col in name_cols if col not in ('rank', 'd_near')]

# Stack both candidate sources, name-based rows first.
train_concat = pd.concat(
    [train_name[name_cols], train_place[place_cols]],
    ignore_index=True,
)
# A pair proposed by both sources is kept once; the first occurrence wins,
# i.e. the name-based row.
train_concat = (
    train_concat
    .drop_duplicates(subset=["id", "near_id"])
    .reset_index(drop=True)
)

In [10]:
# Both source tables have been folded into train_concat; drop the references
# and force a collection pass to release their memory.
del train_place
del train_name
gc.collect()

43

In [13]:
# Raw attribute columns that will be attached to both sides of every pair.
use_cols = ["id","address","city","state","zip","country","url","phone",'categories']
# Parse only the needed columns instead of loading the whole file and
# discarding the rest; the trailing selection restores the use_cols order
# (read_csv returns columns in file order regardless of the usecols order).
train_raw = pd.read_csv(TRAIN_PATH, usecols=use_cols)[use_cols]

In [14]:
# Store the id key as a categorical column (presumably to reduce memory - confirm).
# NOTE(review): train_concat["id"] is not converted in the visible cells, so the
# later merge on "id" joins a categorical key with a non-categorical one.
train_raw["id"] = train_raw["id"].astype("category")

In [15]:
# Store the neighbour key as a categorical column; it is the join key used
# when the near_* attributes are merged in.
train_concat["near_id"] = train_concat["near_id"].astype("category")

In [16]:
# Inspection only: show which columns train_concat holds before the merges.
train_concat.columns

Index(['id', 'name', 'latitude', 'longitude', 'point_of_interest', 'set',
       'rank', 'd_near', 'near_target', 'near_id', 'near_name',
       'near_latitude', 'near_longitude', 'target'],
      dtype='object')

In [17]:
# Attach the raw attributes of the anchor record (matched on "id").
train_concat = train_concat.merge(train_raw, on="id", how="left")
# Re-label the lookup table in place so every column (including the key)
# carries a "near_" prefix, ready for the join on "near_id".
# NOTE: this mutates train_raw, so this cell cannot be re-run on its own.
train_raw.columns = train_raw.columns.map("near_{}".format)



In [18]:
# Attach the raw attributes of the neighbour record; train_raw's columns were
# renamed with a "near_" prefix beforehand, so the join key is "near_id".
train_concat = train_concat.merge(train_raw,how="left",on="near_id")

In [22]:
# Text attributes for which pairwise similarity features are computed; each
# one is compared against its "near_<column>" counterpart.
columns = ['name', 'address', 'city', 'state',
           'zip', 'url', 'phone', 'categories']

# Feature frames are collected here and attached with a single concat after
# the loop: concatenating inside the loop would re-copy the whole (multi-GB)
# train_concat once per column.
distance_frames = []
for c in columns:
    pairs = zip(train_concat[c].values, train_concat[f"near_{c}"].values)
    # One task per pair; each task returns a (1, 3) array.
    distance = Parallel(n_jobs=48, verbose = 1, backend = 'multiprocessing')(
        delayed(calc_distance_2nd_stage)(c1, c2) for (c1, c2) in pairs
    )
    # Stack the per-pair rows into an (n_pairs, 3) table in the original row
    # order, so its default RangeIndex lines up with train_concat.
    distance = pd.DataFrame(
        np.concatenate(distance),
        columns=[f"{c}_gesh", f"{c}_leven", f"{c}_jaro"],
    )
    distance_frames.append(reduce_mem_usage(distance))
    del distance
    gc.collect()

train_concat = pd.concat([train_concat] + distance_frames, axis=1)
del distance_frames

[Parallel(n_jobs=48)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 115 tasks      | elapsed:    0.2s
[Parallel(n_jobs=48)]: Done 1236 tasks      | elapsed:    0.3s
[Parallel(n_jobs=48)]: Done 16416 tasks      | elapsed:    0.7s
[Parallel(n_jobs=48)]: Done 197536 tasks      | elapsed:    4.3s
[Parallel(n_jobs=48)]: Done 479136 tasks      | elapsed:    9.5s
[Parallel(n_jobs=48)]: Done 811936 tasks      | elapsed:   15.7s
[Parallel(n_jobs=48)]: Done 1195936 tasks      | elapsed:   23.0s
[Parallel(n_jobs=48)]: Done 1631136 tasks      | elapsed:   31.0s
[Parallel(n_jobs=48)]: Done 2117536 tasks      | elapsed:   39.9s
[Parallel(n_jobs=48)]: Done 2655136 tasks      | elapsed:   49.5s
[Parallel(n_jobs=48)]: Done 3243936 tasks      | elapsed:  1.0min
[Parallel(n_jobs=48)]: Done 3883936 tasks      | elapsed:  1.2min
[Parallel(n_jobs=48)]: Done 4575136 tasks      | elapsed:  1.4min
[Parallel(n_jobs=48)]: Done 5317536 tasks      | elapsed:  1.6min


Memory usage of dataframe is 995.50 MB
column =  3
0
Memory usage after optimization is: 497.75 MB
Decreased by 50.0%


[Parallel(n_jobs=48)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:    4.8s
[Parallel(n_jobs=48)]: Done 519 tasks      | elapsed:    4.8s
[Parallel(n_jobs=48)]: Done 5760 tasks      | elapsed:    5.0s
[Parallel(n_jobs=48)]: Done 123952 tasks      | elapsed:    7.3s
[Parallel(n_jobs=48)]: Done 405552 tasks      | elapsed:   12.4s
[Parallel(n_jobs=48)]: Done 738352 tasks      | elapsed:   18.2s
[Parallel(n_jobs=48)]: Done 1122352 tasks      | elapsed:   24.9s
[Parallel(n_jobs=48)]: Done 1557552 tasks      | elapsed:   32.5s
[Parallel(n_jobs=48)]: Done 2043952 tasks      | elapsed:   41.2s
[Parallel(n_jobs=48)]: Done 2581552 tasks      | elapsed:   51.4s
[Parallel(n_jobs=48)]: Done 3170352 tasks      | elapsed:  1.0min
[Parallel(n_jobs=48)]: Done 3810352 tasks      | elapsed:  1.2min
[Parallel(n_jobs=48)]: Done 4501552 tasks      | elapsed:  1.5min
[Parallel(n_jobs=48)]: Done 5243952 tasks      | elapsed:  1.7min
[P

Memory usage of dataframe is 995.50 MB
column =  3
0
Memory usage after optimization is: 497.75 MB
Decreased by 50.0%


[Parallel(n_jobs=48)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:    4.7s
[Parallel(n_jobs=48)]: Done 510 tasks      | elapsed:    4.8s
[Parallel(n_jobs=48)]: Done 5808 tasks      | elapsed:    5.0s
[Parallel(n_jobs=48)]: Done 123952 tasks      | elapsed:    7.4s
[Parallel(n_jobs=48)]: Done 405552 tasks      | elapsed:   12.5s
[Parallel(n_jobs=48)]: Done 738352 tasks      | elapsed:   18.5s
[Parallel(n_jobs=48)]: Done 1122352 tasks      | elapsed:   25.5s
[Parallel(n_jobs=48)]: Done 1557552 tasks      | elapsed:   33.2s
[Parallel(n_jobs=48)]: Done 2043952 tasks      | elapsed:   42.1s
[Parallel(n_jobs=48)]: Done 2581552 tasks      | elapsed:   52.2s
[Parallel(n_jobs=48)]: Done 3170352 tasks      | elapsed:  1.1min
[Parallel(n_jobs=48)]: Done 3810352 tasks      | elapsed:  1.3min
[Parallel(n_jobs=48)]: Done 4501552 tasks      | elapsed:  1.5min
[Parallel(n_jobs=48)]: Done 5243952 tasks      | elapsed:  1.7min
[P

Memory usage of dataframe is 995.50 MB
column =  3
0
Memory usage after optimization is: 497.75 MB
Decreased by 50.0%


[Parallel(n_jobs=48)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:    6.0s
[Parallel(n_jobs=48)]: Done 516 tasks      | elapsed:    6.1s
[Parallel(n_jobs=48)]: Done 5712 tasks      | elapsed:    6.3s
[Parallel(n_jobs=48)]: Done 92208 tasks      | elapsed:    9.4s
[Parallel(n_jobs=48)]: Done 233008 tasks      | elapsed:   13.5s
[Parallel(n_jobs=48)]: Done 397488 tasks      | elapsed:   19.8s
[Parallel(n_jobs=48)]: Done 585776 tasks      | elapsed:   31.7s
[Parallel(n_jobs=48)]: Done 803376 tasks      | elapsed:   38.4s
[Parallel(n_jobs=48)]: Done 1046576 tasks      | elapsed:   42.8s
[Parallel(n_jobs=48)]: Done 1315376 tasks      | elapsed:   47.7s
[Parallel(n_jobs=48)]: Done 1609776 tasks      | elapsed:   53.0s
[Parallel(n_jobs=48)]: Done 1929776 tasks      | elapsed:   58.8s
[Parallel(n_jobs=48)]: Done 2275376 tasks      | elapsed:  1.1min
[Parallel(n_jobs=48)]: Done 2646576 tasks      | elapsed:  1.2min
[Para

Memory usage of dataframe is 995.50 MB
column =  3
0
Memory usage after optimization is: 497.75 MB
Decreased by 50.0%


[Parallel(n_jobs=48)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:    4.9s
[Parallel(n_jobs=48)]: Done 518 tasks      | elapsed:    4.9s
[Parallel(n_jobs=48)]: Done 5712 tasks      | elapsed:    5.1s
[Parallel(n_jobs=48)]: Done 123952 tasks      | elapsed:    7.4s
[Parallel(n_jobs=48)]: Done 405552 tasks      | elapsed:   12.3s
[Parallel(n_jobs=48)]: Done 738352 tasks      | elapsed:   18.1s
[Parallel(n_jobs=48)]: Done 1122352 tasks      | elapsed:   24.8s
[Parallel(n_jobs=48)]: Done 1557552 tasks      | elapsed:   32.5s
[Parallel(n_jobs=48)]: Done 2043952 tasks      | elapsed:   40.9s
[Parallel(n_jobs=48)]: Done 2581552 tasks      | elapsed:   50.2s
[Parallel(n_jobs=48)]: Done 3170352 tasks      | elapsed:  1.0min
[Parallel(n_jobs=48)]: Done 3810352 tasks      | elapsed:  1.2min
[Parallel(n_jobs=48)]: Done 4501552 tasks      | elapsed:  1.4min
[Parallel(n_jobs=48)]: Done 5243952 tasks      | elapsed:  1.6min
[P

Memory usage of dataframe is 995.50 MB
column =  3
0
Memory usage after optimization is: 497.75 MB
Decreased by 50.0%


[Parallel(n_jobs=48)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:    4.7s
[Parallel(n_jobs=48)]: Done 525 tasks      | elapsed:    4.7s
[Parallel(n_jobs=48)]: Done 5744 tasks      | elapsed:    5.0s
[Parallel(n_jobs=48)]: Done 86576 tasks      | elapsed:    6.5s
[Parallel(n_jobs=48)]: Done 227376 tasks      | elapsed:    9.1s
[Parallel(n_jobs=48)]: Done 393776 tasks      | elapsed:   12.1s
[Parallel(n_jobs=48)]: Done 585776 tasks      | elapsed:   15.6s
[Parallel(n_jobs=48)]: Done 803376 tasks      | elapsed:   19.4s
[Parallel(n_jobs=48)]: Done 1046576 tasks      | elapsed:   23.8s
[Parallel(n_jobs=48)]: Done 1315376 tasks      | elapsed:   28.6s
[Parallel(n_jobs=48)]: Done 1609776 tasks      | elapsed:   33.9s
[Parallel(n_jobs=48)]: Done 1929776 tasks      | elapsed:   39.7s
[Parallel(n_jobs=48)]: Done 2275376 tasks      | elapsed:   46.0s
[Parallel(n_jobs=48)]: Done 2646576 tasks      | elapsed:   52.7s
[Para

Memory usage of dataframe is 995.50 MB
column =  3
0
Memory usage after optimization is: 497.75 MB
Decreased by 50.0%


[Parallel(n_jobs=48)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:    4.9s
[Parallel(n_jobs=48)]: Done 518 tasks      | elapsed:    5.0s
[Parallel(n_jobs=48)]: Done 5680 tasks      | elapsed:    5.2s
[Parallel(n_jobs=48)]: Done 22432 tasks      | elapsed:   14.3s
[Parallel(n_jobs=48)]: Done 135600 tasks      | elapsed:   17.0s
[Parallel(n_jobs=48)]: Done 467632 tasks      | elapsed:   22.9s
[Parallel(n_jobs=48)]: Done 851632 tasks      | elapsed:   33.0s
[Parallel(n_jobs=48)]: Done 920016 tasks      | elapsed:   37.5s
[Parallel(n_jobs=48)]: Done 1026416 tasks      | elapsed:   39.6s
[Parallel(n_jobs=48)]: Done 1176496 tasks      | elapsed:   42.4s
[Parallel(n_jobs=48)]: Done 1434096 tasks      | elapsed:   47.1s
[Parallel(n_jobs=48)]: Done 1714096 tasks      | elapsed:   52.1s
[Parallel(n_jobs=48)]: Done 2016496 tasks      | elapsed:   57.6s
[Parallel(n_jobs=48)]: Done 2341296 tasks      | elapsed:  1.1min
[Para

Memory usage of dataframe is 995.50 MB
column =  3
0
Memory usage after optimization is: 497.75 MB
Decreased by 50.0%


[Parallel(n_jobs=48)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:    4.8s
[Parallel(n_jobs=48)]: Done 424 tasks      | elapsed:    4.9s
[Parallel(n_jobs=48)]: Done 4208 tasks      | elapsed:    5.1s
[Parallel(n_jobs=48)]: Done 99424 tasks      | elapsed:    7.1s
[Parallel(n_jobs=48)]: Done 381024 tasks      | elapsed:   12.7s
[Parallel(n_jobs=48)]: Done 713824 tasks      | elapsed:   19.3s
[Parallel(n_jobs=48)]: Done 1097824 tasks      | elapsed:   26.7s
[Parallel(n_jobs=48)]: Done 1533024 tasks      | elapsed:   34.9s
[Parallel(n_jobs=48)]: Done 2019424 tasks      | elapsed:   44.2s
[Parallel(n_jobs=48)]: Done 2557024 tasks      | elapsed:   54.6s
[Parallel(n_jobs=48)]: Done 3145824 tasks      | elapsed:  1.1min
[Parallel(n_jobs=48)]: Done 3785824 tasks      | elapsed:  1.3min
[Parallel(n_jobs=48)]: Done 4477024 tasks      | elapsed:  1.5min
[Parallel(n_jobs=48)]: Done 5219424 tasks      | elapsed:  1.8min
[Pa

Memory usage of dataframe is 995.50 MB
column =  3
0
Memory usage after optimization is: 497.75 MB
Decreased by 50.0%


In [23]:
# Squared difference of the raw latitude/longitude values (no square root and
# no spherical correction is applied).
train_concat["distance"] = (
    train_concat["latitude"].sub(train_concat["near_latitude"]).pow(2)
    + train_concat["longitude"].sub(train_concat["near_longitude"]).pow(2)
)
# Rank of each candidate by that value within its anchor id (smallest first).
train_concat["distance_rank"] = train_concat.groupby("id")["distance"].rank()

In [24]:
# Pass the finished feature table through the project's reduce_mem_usage helper,
# then persist it under the experiment tag `fe` from the Settings cell.
train_concat = reduce_mem_usage(train_concat)
train_concat.to_pickle(f"../output/fe/fe{fe}.pkl")

Memory usage of dataframe is 13771.12 MB
column =  56
0
50
Memory usage after optimization is: 13148.94 MB
Decreased by 4.5%
