**Feature engineering for level 2 models**

In [None]:
# Enable the autoreload extension in mode 2, so imported modules are reloaded
# automatically before code is executed (edits to the project's .py files are
# picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src

# **Libraries**

In [None]:
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pandarallel import pandarallel

from matching import load_cleaned_data
# from ressources import *
from fe import FE2
from dtypes import DTYPES_1, reduce_mem_usage
from fe_theo import feature_engineering_theo_2
from params import DEBUG, OUT_PATH, IS_TEST, RESSOURCES_PATH

# Start pandarallel with 12 workers and no progress bar.
pandarallel.initialize(nb_workers=12, progress_bar=False)

# Silence every warning from this point on.
warnings.simplefilter(action="ignore")

# Let pandas display up to 500 columns when rendering a frame.
pd.set_option("display.max_columns", 500)

## Load Data

In [None]:
# Pick the cleaned dataset that matches the current mode. The frame is named
# `train` in both modes, including when the test file is loaded.
cleaned_file = "cleaned_data_test.csv" if IS_TEST else "cleaned_data_train.csv"
train = load_cleaned_data(OUT_PATH + cleaned_file)

In [None]:
# Store each row's position (0 .. n_rows - 1) in an explicit "idx" column.
train["idx"] = np.arange(train.shape[0])

In [None]:
# Threshold value embedded in file names: it selects which filtered level-1
# feature file is read below, and it is written into the name of the level-2
# feature file saved at the end of the notebook.
THRESHOLD = 0.0075

In [None]:
# Read the filtered level-1 pair features for the current mode (test vs. train),
# using the dtypes declared in DTYPES_1.
features_1_file = (
    f"features_test_1_filtered_{THRESHOLD}.csv"
    if IS_TEST
    else f"features_train_1_filtered_{THRESHOLD}.csv"
)
df_p = pd.read_csv(OUT_PATH + features_1_file, dtype=DTYPES_1)

# Same value in both modes; passed to FE2 as `size_ratio` further down.
SIZE_RATIO = 1

In [None]:
# In debug mode, work on the first 10000 pairs only.
if DEBUG:
    df_p = df_p.iloc[:10000].copy()

### Youri & Vincent

In [None]:
# Attach train's "Nb_multiPoi" value for each side of every pair.
multi_poi = train[["id", "Nb_multiPoi"]]

# First side: look up id_1, then discard the helper "id" key brought in by the merge.
with_side_1 = df_p.merge(multi_poi, left_on="id_1", right_on="id")
with_side_1 = with_side_1.drop(columns="id")

# Second side: look up id_2. The two clashing "Nb_multiPoi" columns get the
# "_1" / "_2" suffixes, giving "Nb_multiPoi_1" and "Nb_multiPoi_2".
with_both_sides = with_side_1.merge(
    multi_poi, left_on="id_2", right_on="id", suffixes=("_1", "_2")
)
df_p = with_both_sides.drop(columns="id")

In [None]:
# One single-column frame per side of the pairs, each exposing its key as "id".
p1 = df_p[["id_1"]].rename(columns={"id_1": "id"})
p2 = df_p[["id_2"]].rename(columns={"id_2": "id"})

In [None]:
%%time
# Run FE2 (imported from fe) on a copy of the pair table. p1 / p2 hold the "id" of
# each side of every pair; the full `train` frame, the resources path and
# SIZE_RATIO are passed along. The features FE2 computes are defined in fe.py,
# which is not shown here.
df = FE2(df_p.copy(), p1, p2, train, RESSOURCES_PATH, size_ratio=SIZE_RATIO)

### Théo

In [None]:
# Columns of `train` that are carried onto both sides of every pair.
cols = [
    "id", "name", "address", "country", "url",
    "phone", "city", "state", "zip", "idx",
]

# Replace empty strings with NaN in every listed column except "id".
# Note: this modifies `train` itself.
for col in cols[1:]:
    is_blank = train[col] == ""
    train.loc[is_blank, col] = np.nan

# Look up the listed columns for each side of the pairs (left join on "id").
p1 = pd.merge(p1[["id"]], train[cols], on="id", how="left")
p2 = pd.merge(p2[["id"]], train[cols], on="id", how="left")

# Side-by-side table: side 1 columns suffixed "_1", side 2 columns suffixed "_2".
pairs = pd.concat([p1[cols].add_suffix("_1"), p2[cols].add_suffix("_2")], axis=1)

In [None]:
%%time

# Run feature_engineering_theo_2 (imported from fe_theo) on copies of `train` and
# `pairs`, with cuda disabled. It returns two values: df_theo is merged with `df`
# in the next section; fts_theo is not used anywhere else in this notebook
# (presumably the list of generated feature names — confirm in fe_theo.py).
df_theo, fts_theo = feature_engineering_theo_2(train.copy(), pairs.copy(), cuda=False)

## Merge & Save

In [None]:
# Combine both feature sets, joined on the pair key (id_1, id_2).
df_merged = pd.merge(df, df_theo, on=["id_1", "id_2"])

In [None]:
# Columns that should sit at the far right of the table, in this order.
# Only those actually present in df_merged are moved.
cols_to_end = [
    c
    for c in (
        "point_of_interest_1",
        "fold_1",
        "point_of_interest_2",
        "fold_2",
        "match",
    )
    if c in df_merged.columns
]

if cols_to_end:
    # Take the selected columns out and append them back after all the others.
    trailing = df_merged[cols_to_end]
    df_merged = pd.concat([df_merged.drop(columns=cols_to_end), trailing], axis=1)

In [None]:
if DEBUG:
    # Sanity check: show, for each feature, the other features whose correlation
    # with it exceeds 0.99 (near-duplicates).
    #
    # Only numeric / boolean columns are correlated. df_merged may hold non-numeric
    # columns (e.g. the id_1 / id_2 keys — confirm their dtype), and since pandas 2.0
    # DataFrame.corr() raises on such columns instead of silently skipping them.
    # select_dtypes works on every pandas version, unlike the numeric_only argument.
    corrs = df_merged.select_dtypes(include=["number", "bool"]).corr()
    for col in corrs.columns:
        close = corrs.loc[corrs[col] > 0.99, [col]]
        # A single row is just the column's correlation with itself.
        if len(close) > 1:
            # Hide the self-correlation (exactly 1) and show the remaining rows.
            display(close[close[col] < 1])

In [None]:
# Pass the merged table through reduce_mem_usage (imported from dtypes) before
# saving. Presumably it downcasts column dtypes to shrink memory — confirm in
# dtypes.py.
df_merged = reduce_mem_usage(df_merged)

In [None]:
# Write the level-2 features to disk. Nothing is saved in debug mode.
if not DEBUG:
    features_2_file = (
        f"features_test_2_{THRESHOLD}.csv"
        if IS_TEST
        else f"features_train_2_{THRESHOLD}.csv"
    )
    df_merged.to_csv(OUT_PATH + features_2_file, index=False)

In [None]:
# Preview the first rows of the final feature table.
df_merged.head()

Done!