In [1]:
# =========================
# Library
# =========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import warnings
import seaborn as sns
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib
from joblib import Parallel, delayed
import Levenshtein
import difflib
from contextlib import contextmanager
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import log_loss
from strsimpy import SIFT4
from strsimpy.ngram import NGram
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')
import sys
sys.path.append("../src/")
from logger import setup_logger, LOGGER
from trainer import train_lgbm
from util_tool import reduce_mem_usage
pd.set_option('display.max_columns', 300)

In [2]:
# =========================
# Constant
# =========================
# Location of train.csv relative to the notebook; not read by any cell shown here.
TRAIN_PATH = "../data/train.csv"
# Column name kept as the target identifier — presumably the label column of
# the training data (TODO confirm); unused in the cells shown here.
TARGET = "point_of_interest"

In [3]:
# =========================
# Settings
# =========================
# Identifier of this feature-engineering step; it is interpolated into the
# output pickle file name (fe{fe}.pkl) when the new columns are saved.
fe = "067"

In [10]:
# ==============================
# Main
# ==============================
# Input feature table — presumably the output of feature step 066 (TODO confirm).
# It is a pickle, so only load files produced by this project.
prev_fe_path = "../output/fe/fe066.pkl"
train = pd.read_pickle(prev_fe_path)

In [13]:
# Store both key columns with the pandas "category" dtype before they are
# used as group-by / lookup keys.
for key_col in ("id", "near_id"):
    train[key_col] = train[key_col].astype("category")

In [15]:
def add_id_group_features(df, col, extreme):
    """Add per-`id` aggregate features of `col` to `df`, in place.

    The values of `col` are grouped by the "id" column and two statistics are
    taken per group: the mean and `extreme`. Eight columns are then written,
    in this order:

        {col}_mean,            {col}_{extreme}            (looked up via "id")
        near_{col}_mean,       near_{col}_{extreme}       (looked up via "near_id")
        {col}_mean_rate,       {col}_{extreme}_rate       (col / id statistic)
        near_{col}_mean_rate,  near_{col}_{extreme}_rate  (col / near_id statistic)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding "id", "near_id" and `col`; it is mutated in place.
    col : str
        Name of the numeric column to aggregate.
    extreme : str
        Name of the second group-by reduction, "min" or "max".

    Returns
    -------
    None

    Notes
    -----
    A "near_id" value that never occurs as an "id" has no entry in the lookup,
    so its near_* statistic (and the derived rate) is NaN. The rates are plain
    divisions; a zero statistic yields inf/NaN rather than raising.
    NOTE(review): when the key column is categorical, Series.map may return a
    categorical result if the mapping happens to be one-to-one — confirm the
    statistic columns come back numeric on the real data.
    """
    stats = ("mean", extreme)
    # Build the group-by once and reuse it for both reductions.
    grouped = df.groupby(by="id")[col]
    lookups = {stat: getattr(grouped, stat)().to_dict() for stat in stats}

    # Statistic columns: first keyed by the row's own id, then by its near_id.
    for prefix, key_col in (("", "id"), ("near_", "near_id")):
        for stat in stats:
            df[f"{prefix}{col}_{stat}"] = df[key_col].map(lookups[stat])

    # Ratio of the row's own value to each of the four statistics above.
    for prefix in ("", "near_"):
        for stat in stats:
            df[f"{prefix}{col}_{stat}_rate"] = df[col] / df[f"{prefix}{col}_{stat}"]


# String-similarity features: the "leven" columns pair the mean with the
# group minimum, the other measures with the group maximum (presumably because
# Levenshtein is a distance while the others are similarities — TODO confirm).
for c in tqdm(["name", "categories"]):
    for d in ["gesh", "leven", "jaro"]:
        add_id_group_features(train, f"{c}_{d}", "min" if d == "leven" else "max")

# Distance-like features: mean and minimum per id.
for c in ["d_near", "distance"]:
    add_id_group_features(train, c, "min")

  0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# Names of the 64 feature columns created in this notebook, in creation order.
# Each base column contributes eight names: the mean and one extreme statistic,
# each with and without the "near_" prefix, followed by the same four with a
# "_rate" suffix.
new_cols = [
    f"{prefix}{base}_{stat}{suffix}"
    for base, extreme in (
        ("name_gesh", "max"),
        ("name_leven", "min"),
        ("name_jaro", "max"),
        ("categories_gesh", "max"),
        ("categories_leven", "min"),
        ("categories_jaro", "max"),
        ("d_near", "min"),
        ("distance", "min"),
    )
    for suffix in ("", "_rate")
    for prefix in ("", "near_")
    for stat in ("mean", extreme)
]

In [17]:
# Run the project's memory-reduction helper over the whole frame, then persist
# only the columns listed in new_cols under this step's identifier.
train = reduce_mem_usage(train)
fe_output_path = f"../output/fe/fe{fe}.pkl"
train[new_cols].to_pickle(fe_output_path)

Memory usage of dataframe is 34136.37 MB
column =  120
0
50
100
Memory usage after optimization is: 23517.68 MB
Decreased by 31.1%
