**TODO:**
- Try pandarallel to speed things up?

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ../src

/home/theo/kaggle/foursquare/src


In [3]:
%load_ext nb_black

<IPython.core.display.Javascript object>

# **Libraries**

In [4]:
import os

# Expose only the GPU with index 1 to this process. The variable is set
# before torch is imported so that it is in place when CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch

# Display the name of the first visible device as a sanity check
# (presumably the GPU selected above, re-indexed as device 0).
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2080 Ti'

<IPython.core.display.Javascript object>

In [5]:
import gc
import cudf
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pandarallel import pandarallel

from params import DEBUG, OUT_PATH, IS_TEST
from data.preparation import reduce_mem_usage
from ressources import *
from matching import *

# Seed the stdlib RNG only (numpy / torch are not seeded here).
random.seed(13)
# Silence every warning for the rest of the notebook.
warnings.simplefilter("ignore")
# Allow wide feature frames to be displayed without column truncation.
pd.options.display.max_columns = 500
# Start the pandarallel worker pool (12 workers, no progress bar).
pandarallel.initialize(nb_workers=12, progress_bar=False)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


<IPython.core.display.Javascript object>

## Load Data

In [None]:
# Both branches of the original load the same three files and differ only by
# the "test" / "train" suffix, so the suffix is resolved once.
dataset_split = "test" if IS_TEST else "train"

# Cleaned records, plus the two sides (p1, p2) of the candidate pairs.
train = load_cleaned_data(OUT_PATH + f"cleaned_data_{dataset_split}.csv")
p1 = pd.read_csv(OUT_PATH + f"p1_yv_{dataset_split}.csv")
p2 = pd.read_csv(OUT_PATH + f"p2_yv_{dataset_split}.csv")

# Same value for both splits.
SIZE_RATIO = 1

In [None]:
# Summary of the candidate pairs. NOTE(review): 713788 is a hard-coded
# reference count whose meaning is not visible in this notebook - confirm
# against `print_infos` and consider naming it as a constant.
print_infos(p1, p2, 713788)

In [None]:
# Pair label: 1 when both sides of the pair share the same point of interest.
pair_labels = np.array(p1["point_of_interest"] == p2["point_of_interest"]).astype(
    np.int8
)
# The labels are passed as both the third and fourth arguments, as before.
# A copy is passed for the fourth so the two arguments remain distinct arrays.
get_CV(p1, p2, pair_labels, pair_labels.copy(), train)

In [None]:
if DEBUG:
    # Debug runs keep only the first 100k pairs on each side.
    p1, p2 = (pair_side.head(100_000).copy() for pair_side in (p1, p2))

### Prepare

In [None]:
# Positional row number of every record; carried into p1/p2 via `cols` below.
train["idx"] = np.arange(train.shape[0])

In [None]:
# Columns copied from `train` onto each side of the pairs - needed for FE.
# The order is kept as-is because it defines the column order of p1 / p2.
cols = [
    "id", "name",
    "latitude", "longitude",
    "address", "country", "url", "phone", "city",
    "categories", "category_simpl", "categories_split", "cat2",
    "idx",
    "state", "zip",
]

# Keep only the pair ids, then left-join the record attributes onto them.
train_attributes = train[cols]
p1 = p1[["id"]].merge(train_attributes, how="left", on="id")
p2 = p2[["id"]].merge(train_attributes, how="left", on="id")

In [None]:
# Check for a flipped sign on longitude within a pair: the two points are
# close in latitude, very far apart overall, their longitudes nearly cancel
# out, and they belong to the same country.
# This may help test data a lot; test it? Open idea: move this code up so
# that it applies to "train" directly.
dist = distance(
    np.array(p1["latitude"]),
    np.array(p1["longitude"]),
    np.array(p2["latitude"]),
    np.array(p2["longitude"]),
)
df = pd.DataFrame(dist)
df.columns = ["dist"]
df["dist"] = df["dist"].astype("int32")
# Latitude-only component: absolute latitude gap times a per-degree constant
# (presumably metres per degree of latitude - TODO confirm the unit returned
# by `distance`).
df["dist1"] = (111173.444444444 * np.abs(p1["latitude"] - p2["latitude"])).astype(
    "int32"
)
# Remaining component, recovered as sqrt(dist^2 - dist1^2); the maximum with 0
# guards against a negative value under the square root after int rounding.
df["dist2"] = np.sqrt(np.maximum(0, (1.0 * df["dist"]) ** 2 - df["dist1"] ** 2)).astype(
    "int32"
)
idx = (
    (df["dist1"] < 10000)
    & (df["dist2"] > 1000000)
    & (np.abs(p1["longitude"] + p2["longitude"]) < 0.1)
) & (p1["country"] == p2["country"])
# This selects only 3 cases in train data, but possibly more in test, so keep
# it because it is basically free.
print("flipped sign of longitude for", idx.sum(), "points")
# Flip (correct) the sign on the p1 side. The assignment goes through a single
# .loc call on the frame itself: the chained form p1["longitude"].loc[idx]
# writes to an intermediate Series, which is not guaranteed to propagate to p1
# and never does under pandas Copy-on-Write.
p1.loc[idx, "longitude"] *= -1
del df, idx, dist
gc.collect()

## Batched PP

In [None]:
# N_FOLDS = 2
# path = f"../output/folds_{N_FOLDS}.csv"

# if os.path.exists(path):
#     df_split = pd.read_csv(path)
# else:
#     from sklearn.model_selection import GroupKFold

#     gkf = GroupKFold(n_splits=N_FOLDS)
#     splits = list(gkf.split(train["id"], groups=train["point_of_interest"]))

#     df_split = train[["id", "point_of_interest"]].copy()
#     df_split["batch"] = -1

#     for i, (_, val_idx) in enumerate(splits):
#         df_split.loc[val_idx, "batch"] = i

#     df_split.to_csv(path, index=False)

In [None]:
# train = train.merge(df_split, how="left")

In [None]:
# df_merged = feature_engineering_1(p1, p2, train, ressources_path=RESSOURCES_PATH)

### Youri & Vincent

In [None]:
from fe import FE1

In [None]:
%%time
# First feature set, built by `FE1` (imported from `fe`) from the two sides of
# the pairs. The result is used as a DataFrame below (id columns are inserted
# into it and it is merged on them).
df = FE1(p1, p2)

In [None]:
# Put the pair identifiers in the first two columns; they are the join keys
# used when the feature sets are merged.
# NOTE: `insert` refuses to add a column that already exists, so this cell
# cannot be re-run without recomputing `df` first.
for position, (id_column, pair_side) in enumerate((("id_1", p1), ("id_2", p2))):
    df.insert(position, id_column, pair_side["id"].values)

In [None]:
# Visual sanity check of the first feature rows.
df.head()

### Théo

In [None]:
from fe_theo import feature_engineering_theo

In [None]:
# Record attributes handed to Théo's feature engineering for each pair side.
cols = [
    "id", "name",
    "latitude", "longitude",
    "address", "country", "url", "phone", "city", "state", "zip",
    "categories",
    "idx",
]

# Side-by-side table of the pairs: all p1 columns first, then all p2 columns,
# renamed with a "_1" / "_2" suffix respectively.
pairs = pd.concat([p1[cols], p2[cols]], axis=1)
pairs.columns = [f"{c}_{suffix}" for suffix in (1, 2) for c in cols]

In [None]:
%%time

# Second feature set. A copy of `train` is passed, so whatever the helper does
# to its first argument cannot alter the notebook's `train`. The second return
# value is presumably the list of feature names - TODO confirm in `fe_theo`.
df_theo, fts_theo = feature_engineering_theo(train.copy(), pairs)

## Merge & Save

In [None]:
# Join the two feature sets on the pair key. No `how` is given, so this is
# pandas' default inner join: only pairs present in both frames are kept.
# NOTE(review): if (id_1, id_2) is not unique in either frame, rows are
# multiplied - confirm that the pairs are unique.
df_merged = df.merge(df_theo, on=["id_1", "id_2"])

In [None]:
# Shrink the merged frame with the project helper from `data.preparation`
# (presumably by downcasting column dtypes - TODO confirm).
df_merged = reduce_mem_usage(df_merged)

In [None]:
if DEBUG:
    # Redundancy check: for every feature, show the other features whose
    # correlation with it exceeds 0.99.
    # Only numeric / boolean columns are correlated. The merged frame also
    # holds the pair-id columns, and DataFrame.corr() raises on non-numeric
    # columns in pandas >= 2.0 instead of silently dropping them.
    corrs = df_merged.select_dtypes(include=["number", "bool"]).corr()
    for col in corrs.columns:
        # Exclude the feature itself by label, not by value (< 1): an exact
        # duplicate feature has a correlation of 1.0 and must still be shown.
        is_other_feature = corrs.index != col
        close = corrs.loc[(corrs[col] > 0.99) & is_other_feature, [col]]
        if len(close) > 0:
            display(close)

In [None]:
if not DEBUG:
    # Only full runs are persisted; the file name depends on the split.
    features_file = "features_test_1.csv" if IS_TEST else "features_train_1.csv"
    df_merged.to_csv(OUT_PATH + features_file, index=False)