**TODO:**
- Use pandarallel to speed up the feature engineering?

In [1]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
# Move into the project source directory so the local modules imported below resolve.
cd ../src

/home/theo/kaggle/foursquare/src


In [3]:
# nb_black extension: formats each code cell with black when it is run.
%load_ext nb_black

<IPython.core.display.Javascript object>

# **Libraries**

In [4]:
import os

# Expose only the GPU with index 1 to CUDA. This is set before torch is imported
# so the restriction is already in place when CUDA gets initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch

# Sanity check: the single visible GPU is addressed as device 0 from here on.
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2080 Ti'

<IPython.core.display.Javascript object>

In [5]:
import gc
import cudf
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pandarallel import pandarallel

from params import DEBUG, OUT_PATH, IS_TEST
from ressources import *
from matching import *

# Notebook-wide configuration.
# NOTE(review): only Python's `random` module is seeded in this cell.
random.seed(13)
warnings.simplefilter(action="ignore")
pd.set_option("display.max_columns", 500)
pandarallel.initialize(progress_bar=False, nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


<IPython.core.display.Javascript object>

## Load Data

In [6]:
# Select the file set for the requested split; the loading steps are the same
# for both splits.
if IS_TEST:
    cleaned_file, p1_file, p2_file = (
        "cleaned_data_test.csv",
        "p1_yv_test.csv",
        "p2_yv_test.csv",
    )
else:
    cleaned_file, p1_file, p2_file = (
        "cleaned_data_train.csv",
        "p1_yv_train.csv",
        "p2_yv_train.csv",
    )

# `train` holds the cleaned records; p1 / p2 hold the two sides of each candidate pair.
train = load_cleaned_data(OUT_PATH + cleaned_file)
p1 = pd.read_csv(OUT_PATH + p1_file)
p2 = pd.read_csv(OUT_PATH + p2_file)

SIZE_RATIO = 1

<IPython.core.display.Javascript object>

In [7]:
# Print summary statistics about the candidate pairs.
# NOTE(review): 713788 is a hard-coded count — presumably the total number of
# true matches used for the "found matches" ratio; confirm and derive it from the data.
print_infos(p1, p2, 713788)


Number of candidates : 28.3M
Proportion of positive candidates: 2.50%
Proportion of found matches: 97.85%



<IPython.core.display.Javascript object>

In [8]:
# Pair label: 1 when both sides of the candidate share the same point_of_interest.
# The same labels are handed to get_CV for both of its label arguments
# (presumably predictions and targets — confirm against get_CV), which yields
# the best score reachable with this candidate set.
_pair_is_match = np.array(p1["point_of_interest"] == p2["point_of_interest"]).astype(
    np.int8
)
get_CV(p1, p2, _pair_is_match, _pair_is_match.copy(), train)

- Highest reachable IoU : 0.9898


<IPython.core.display.Javascript object>

In [9]:
# Overrides the DEBUG flag imported from params. With DEBUG on, only a subset
# of the pairs is processed below and no feature file is written at the end.
DEBUG = True

<IPython.core.display.Javascript object>

In [10]:
if DEBUG:
    # Debug runs keep only the first million candidate pairs on each side.
    p1, p2 = (side.head(1_000_000).copy() for side in (p1, p2))

<IPython.core.display.Javascript object>

### Prepare

In [11]:
# Positional row number of every record in `train`.
train["idx"] = np.arange(train.shape[0])

<IPython.core.display.Javascript object>

In [12]:
# Columns of `train` required by the feature engineering steps.
cols = [
    "id",
    "name",
    "latitude",
    "longitude",
    "address",
    "country",
    "url",
    "phone",
    "city",
    "categories",
    "category_simpl",
    "categories_split",
    "cat2",
    "idx",
    "state",
    "zip",
]

# Reduce each side of the pairs to its id, then pull the attributes from `train`.
p1 = pd.merge(p1[["id"]], train[cols], how="left", on="id")
p2 = pd.merge(p2[["id"]], train[cols], how="left", on="id")

<IPython.core.display.Javascript object>

In [13]:
# Check for a flipped sign on longitude - this may help test data a lot; test it?
# Move this code up to apply to "train".
#
# A pair is flagged when the two points are close in latitude, far apart once
# the latitude part of the distance is removed, have longitudes that nearly
# cancel each other out, and belong to the same country.
dist = distance(
    np.array(p1["latitude"]),
    np.array(p1["longitude"]),
    np.array(p2["latitude"]),
    np.array(p2["longitude"]),
)
df = pd.DataFrame(dist)
df.columns = ["dist"]
df["dist"] = df["dist"].astype("int32")
# Latitude-only component of the distance (degrees scaled by a fixed factor).
df["dist1"] = (111173.444444444 * np.abs(p1["latitude"] - p2["latitude"])).astype(
    "int32"
)
# Remaining component once the latitude part is removed; clipped at 0 before the sqrt.
df["dist2"] = np.sqrt(np.maximum(0, (1.0 * df["dist"]) ** 2 - df["dist1"] ** 2)).astype(
    "int32"
)
idx = (
    (df["dist1"] < 10000)
    & (df["dist2"] > 1000000)
    & (np.abs(p1["longitude"] + p2["longitude"]) < 0.1)
) & (p1["country"] == p2["country"])
# This selects only 3 cases in train data, but possibly more in test, so keep it
# because it is basically free.
print("flipped sign of longitude for", idx.sum(), "points")
# Write through a single .loc call: the chained form p1["longitude"].loc[idx]
# may operate on a temporary copy and leave p1 unchanged.
p1.loc[idx, "longitude"] *= -1  # flip (correct) sign
del df, idx, dist
gc.collect()

flipped sign of longitude for 0 points


593

<IPython.core.display.Javascript object>

## Batched PP

In [21]:
# Assign every record of `train` to one of N_FOLDS batches so that all records
# of a point_of_interest land in the same batch. The assignment is cached as a
# CSV and re-used when the file already exists.
N_FOLDS = 2
path = f"../output/folds_{N_FOLDS}.csv"

if os.path.exists(path):
    df_split = pd.read_csv(path)
else:
    from sklearn.model_selection import GroupKFold

    gkf = GroupKFold(n_splits=N_FOLDS)
    splits = list(gkf.split(train["id"], groups=train["point_of_interest"]))

    # GroupKFold returns *positional* row indices, so fill a plain array by
    # position instead of going through label-based .loc, which would hit the
    # wrong rows whenever `train` does not carry a default 0..n-1 index.
    batch = np.full(len(train), -1, dtype=np.int64)
    for i, (_, val_idx) in enumerate(splits):
        batch[val_idx] = i

    df_split = train[["id", "point_of_interest"]].copy()
    df_split["batch"] = batch

    df_split.to_csv(path, index=False)

<IPython.core.display.Javascript object>

In [24]:
# Attach the batch assignment to `train`. No `on=` is given, so pandas joins on
# every column name the two frames share.
train = train.merge(df_split, how="left")

<IPython.core.display.Javascript object>

In [19]:
# Visual check of `train` after the merge.
train

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest,lang,m_true,category_simpl,categories_split,name_initial,name_initial_decode,freq_pairing_with_other_groupedcat,cat_solo_score,Nb_multiPoi,mean,q25,q50,q75,q90,q99,nameC,lat2,lon2,name2,city_group,state_group,cat2,idx,fold
0,E_000001272c6c5d,cafestadoudenaarde,50.859982,3.634200,abdijstraat,nederename,oostvlaanderen,9700,11,,,bars,299028,2,E_000001272c6c5d E_da7fa3963561f8,19,[bars],café stad oudenaarde,cafe stad oudenaarde,0.316487,0.817467,24757.0,6.289269,0.015224,0.052231,0.527868,3.680882,92.168289,cso,51.0,4.0,cafesta,nederename,oostvlaanderen,1,0,1
1,E_000002eae2a589,cariocamanero,-22.907221,-43.178242,,,,,8,,,brazilianrestaurants,625009,1,E_000002eae2a589 E_e80db432029aea,0,[brazilianrestaurants],carioca manero,carioca manero,0.229345,0.694136,1310.0,2.960897,0.015873,0.046639,0.235518,1.240139,85.362183,cm,-23.0,-43.0,carioca,,,1,1,0
2,E_000007f24ebc95,raantadphmkaaraaekd,13.780810,100.484901,,,,,5,,,salonsbarbershops,511639,3,E_000007f24ebc95,22,[salonsbarbershops],ร้านตัดผมการาเกด,raantadphmkaaraaekd,0.146376,0.609556,13664.0,7.254750,0.019912,0.068463,0.362358,1.446149,30.401031,r,14.0,100.0,raantad,,,2,2,0
3,E_000008a8ba4f48,turkcell,37.844509,27.844200,adnanmenderesbulvari,,,,3,,,mobilephoneshops,517083,1,E_000008a8ba4f48,27,[mobilephoneshops],turkcell,turkcell,0.094117,0.747120,11671.0,6.907260,0.016782,0.073669,0.453414,2.079563,22.541948,t,38.0,28.0,turkcel,,,3,3,0
4,E_00001d92066153,restaurantecasacofino,43.338200,-4.326820,,caviedes,cantabria,,17,,,spanishrestaurants,371830,2,E_00001d92066153 E_7e0d8e9138dd56,0,[spanishrestaurants],restaurante casa cofiño,restaurante casa cofino,0.229345,0.720403,804.0,5.243670,0.014504,0.035658,0.201226,1.051524,11.854078,rcc,43.0,-4.0,restaur,caviedes,cantabria,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138807,E_ffffb80854f713,aogaeru,35.659019,139.700775,,shibuyaku,toukyouto,1500043,4,,,,360941,3,E_008558a7c44ed5 E_a05914fd828a1c E_f5d055893e...,-1,[],aogaeru,aogaeru,0.604150,0.701242,309016.0,228.585693,0.392952,3.211279,14.121393,374.942139,7377.770996,a,36.0,140.0,aogaeru,shibuyaward,toukyouto,0,1138807,1
1138808,E_ffffbf9a83e0ba,deshonplace,40.872120,-79.945343,325newcastlerd,butler,pa,16001,1,,,housingdevelopments,633273,1,E_37cbd58e31092a E_ffffbf9a83e0ba,0,[housingdevelopments],deshon place,deshon place,0.229345,0.619592,12745.0,3.996108,0.067265,0.221548,0.770412,3.244933,48.482964,dp,41.0,-80.0,deshonp,butler,pa,4,1138808,1
1138809,E_ffffc572b4d35b,izmiradnanmenderesairport,38.423729,27.142820,,izmir,,,3,,,airportservices,504544,3,E_00911dfe3e73c9 E_04074a25e0158f E_140d67717e...,1,[airportservices],i̇zmir adnan menderes havaalanı,izmir adnan menderes havaalani,0.386618,0.708057,10673.0,156.338333,0.816665,3.600699,27.473101,394.335480,2556.392578,iamh,38.0,27.0,izmirad,izmir,,15,1138809,0
1138810,E_ffffca745329ed,yakinikuwaie,35.710709,139.774994,ueno6136,taito,toukyouto,1100005,4,,338362989,bbqjoints,58950,3,E_04988888cfff60 E_ffffca745329ed,0,[bbqjoints],yakiniku waie,yakiniku waie,0.229345,0.837532,8588.0,4.759137,0.017242,0.065981,0.422791,2.072064,24.417580,yw,36.0,140.0,yakinik,taitodistrict,toukyouto,1,1138810,0


<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): this cell has no execution count in the saved notebook, and
# `df_merged` is reassigned in the "Merge & Save" section below — confirm
# whether this call is still needed.
df_merged = feature_engineering_1(p1, p2, train, ressources_path=RESSOURCES_PATH)

### Youri & Vincent

In [12]:
from fe import FE1

<IPython.core.display.Javascript object>

In [13]:
%%time
# First feature set, computed from the two sides of the candidate pairs.
df = FE1(p1, p2)

- Distances
- Features for column : name
- Features for column : categories
- Features for column : address
- Nan features
- Matching
- Category match
- Ratios
- Count encodings
CPU times: user 13min 59s, sys: 14.9 s, total: 14min 14s
Wall time: 14min 12s


<IPython.core.display.Javascript object>

In [14]:
# Put the pair identifiers in front of the features: id_1 from p1, id_2 from p2.
for position, (id_column, side) in enumerate((("id_1", p1), ("id_2", p2))):
    df.insert(position, id_column, side["id"].values)

<IPython.core.display.Javascript object>

In [15]:
# Preview of the first feature set with the pair ids attached.
df.head()

Unnamed: 0,id_1,id_2,dist,dist1,dist2,country,cat2a,cat2b,name_pi1,name_lcs2,name_lcs,name_pi1_r1,name_lcs2_r1,name_lcs2_r2,name_lcs_r1,name_lcs_r2,name_r3,name_lcs_r4,categories_pi1,categories_lcs2,categories_lcs,categories_pi1_r1,categories_lcs2_r1,categories_lcs2_r2,categories_lcs_r1,categories_lcs_r2,categories_r3,categories_lcs_r4,address_pi1,address_lcs2,address_lcs,address_pi1_r1,address_lcs2_r1,address_lcs2_r2,address_lcs_r1,address_lcs_r2,address_r3,address_lcs_r4,city_NA,address_NA,phone_m10,url_m5,same_cat_simpl,dist_r1,dist_r2,id_cc_min,id_cc_max,name_cc_min,name_cc_max
0,E_000001272c6c5d,E_00f3e03e4a63da,36320,4910,36320,11,1,1,4,4,8,0.31,0.31,0.22,0.62,0.44,0.72,0.5,4,4,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,2,3,0.09,0.18,0.12,0.27,0.18,0.65,0.67,0,0,0,0,1,600.845032,800.0,54,80,80,109
1,E_02814090c4be3c,E_000001272c6c5d,18030,12090,13360,11,1,1,4,4,8,0.27,0.27,0.22,0.53,0.44,0.83,0.5,4,4,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.09,0.0,0,1,0,0,1,297.867401,444.857758,73,80,73,80
2,E_000001272c6c5d,E_051d16ff51c824,32860,2700,32860,11,1,1,0,1,2,0.0,0.1,0.06,0.2,0.11,0.56,0.5,0,1,3,0.0,0.25,0.04,0.75,0.12,0.16,0.33,11,11,11,1.0,1.0,0.5,1.0,0.5,0.5,1.0,0,0,0,0,0,543.571899,243.691925,36,80,66,80
3,E_0d580c719e7726,E_000001272c6c5d,18030,5430,18030,11,1,1,4,4,11,0.29,0.29,0.22,0.79,0.61,0.78,0.36,4,4,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.09,0.0,1,1,0,0,1,297.867401,444.857758,80,109,80,109
4,E_0e7fa023e1b41f,E_000001272c6c5d,18030,4020,18030,11,1,1,4,4,5,0.44,0.44,0.22,0.56,0.28,0.5,0.8,4,4,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,1,3,0.0,0.09,0.06,0.27,0.18,0.65,0.33,0,0,0,0,1,297.867401,444.857758,66,80,80,133


<IPython.core.display.Javascript object>

### Théo

In [16]:
from fe_theo import feature_engineering_theo

<IPython.core.display.Javascript object>

In [17]:
# Raw attributes handed to the second feature engineering step.
cols = [
    "id",
    "name",
    "latitude",
    "longitude",
    "address",
    "country",
    "url",
    "phone",
    "city",
    "state",
    "zip",
    "categories",
    "idx",
]

# One row per candidate pair: p1 attributes suffixed with _1, p2 attributes with _2.
pairs = pd.concat(
    [p1[cols].add_suffix("_1"), p2[cols].add_suffix("_2")],
    axis=1,
)

<IPython.core.display.Javascript object>

In [21]:
%%time

# Second feature set; a copy of `train` is passed so the original frame is not modified.
df_theo, fts_theo = feature_engineering_theo(train.copy(), pairs)

- Computing position distances
- Computing feature same_state
- Computing feature same_zip
- Computing feature same_city
- Column : name  -  Function : levenshtein
- Column : address  -  Function : levenshtein
- Column : url  -  Function : levenshtein
CPU times: user 1min 29s, sys: 29 s, total: 1min 58s
Wall time: 3min 53s


<IPython.core.display.Javascript object>

## Merge & Save

In [22]:
# Combine both feature sets on the pair identifiers (default inner join).
# NOTE(review): assumes each (id_1, id_2) pair occurs once in both frames;
# duplicated pairs would multiply rows — confirm.
df_merged = df.merge(df_theo, on=["id_1", "id_2"])

<IPython.core.display.Javascript object>

In [23]:
# Preview of the combined feature table.
df_merged.head()

Unnamed: 0,id_1,id_2,dist,dist1,dist2,country,cat2a,cat2b,name_pi1,name_lcs2,name_lcs,name_pi1_r1,name_lcs2_r1,name_lcs2_r2,name_lcs_r1,name_lcs_r2,name_r3,name_lcs_r4,categories_pi1,categories_lcs2,categories_lcs,categories_pi1_r1,categories_lcs2_r1,categories_lcs2_r2,categories_lcs_r1,categories_lcs_r2,categories_r3,categories_lcs_r4,address_pi1,address_lcs2,address_lcs,address_pi1_r1,address_lcs2_r1,address_lcs2_r2,address_lcs_r1,address_lcs_r2,address_r3,address_lcs_r4,city_NA,address_NA,phone_m10,url_m5,same_cat_simpl,dist_r1,dist_r2,id_cc_min,id_cc_max,name_cc_min,name_cc_max,angular_distance_min,angular_distance_l2_min,same_state,same_zip,same_city,name_len_diff,name_levenshtein,address_len_diff,address_levenshtein,url_len_diff,url_levenshtein
0,E_000001272c6c5d,E_00f3e03e4a63da,36320,4910,36320,11,1,1,4,4,8,0.31,0.31,0.22,0.62,0.44,0.72,0.5,4,4,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,2,3,0.09,0.18,0.12,0.27,0.18,0.65,0.67,0,0,0,0,1,600.845032,800.0,54,80,80,109,0.935697,0.865765,,,0.0,5,0.611111,6,0.882353,0,
1,E_02814090c4be3c,E_000001272c6c5d,18030,12090,13360,11,1,1,4,4,8,0.27,0.27,0.22,0.53,0.44,0.83,0.5,4,4,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.09,0.0,0,1,0,0,1,297.867401,444.857758,73,80,73,80,0.499047,0.36399,,0.0,0.0,3,0.611111,11,,0,
2,E_000001272c6c5d,E_051d16ff51c824,32860,2700,32860,11,1,1,0,1,2,0.0,0.1,0.06,0.2,0.11,0.56,0.5,0,1,3,0.0,0.25,0.04,0.75,0.12,0.16,0.33,11,11,11,1.0,1.0,0.5,1.0,0.5,0.5,1.0,0,0,0,0,0,543.571899,243.691925,36,80,66,80,0.77617,0.736614,,0.0,0.0,8,0.888889,11,0.5,0,
3,E_0d580c719e7726,E_000001272c6c5d,18030,5430,18030,11,1,1,4,4,11,0.29,0.29,0.22,0.79,0.61,0.78,0.36,4,4,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.09,0.0,1,1,0,0,1,297.867401,444.857758,80,109,80,109,0.483397,0.412613,,,,4,0.5,11,,0,
4,E_0e7fa023e1b41f,E_000001272c6c5d,18030,4020,18030,11,1,1,4,4,5,0.44,0.44,0.22,0.56,0.28,0.5,0.8,4,4,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,1,3,0.0,0.09,0.06,0.27,0.18,0.65,0.33,0,0,0,0,1,297.867401,444.857758,66,80,80,133,0.48391,0.427897,0.0,0.0,0.0,9,0.722222,6,0.823529,0,


<IPython.core.display.Javascript object>

In [24]:
# Disabled visual check: scatter of angular_distance_min against dist_r1.
# import matplotlib.pyplot as plt

# plt.scatter(df_merged["angular_distance_min"], df_merged["dist_r1"])

<IPython.core.display.Javascript object>

In [25]:
if DEBUG:
    # Look for near-duplicate features: for every column, show the other
    # columns whose correlation with it exceeds 0.99.
    # Only numeric/bool columns are correlated: the id_1 / id_2 string columns
    # cannot be, and DataFrame.corr() raises on them with pandas >= 2.0.
    corrs = df_merged.select_dtypes(include=["number", "bool"]).corr()
    for col in corrs.columns:
        close = pd.DataFrame(corrs.loc[corrs[col] > 0.99][col])
        # The column itself always qualifies, so more than one row is needed
        # before there is anything to report.
        if len(close) > 1:
            # Drop the entries equal to 1 (the column's correlation with itself).
            display(close[close[col] < 1])

<IPython.core.display.Javascript object>

In [26]:
# Persist the combined features; skipped in debug runs, which only hold a subset of the pairs.
if not DEBUG:
    features_file = "features_test_1.csv" if IS_TEST else "features_train_1.csv"
    df_merged.to_csv(OUT_PATH + features_file, index=False)

<IPython.core.display.Javascript object>