#### Code to train classification models
**TODO**:
- Missing values ?
- Switch to fp32 / fp16 if memory issues
- Categorical features
- Use it to filter out obvious non-matches

In [None]:
# Reload imported modules automatically before each cell runs (autoreload mode 2).
%load_ext autoreload
%autoreload 2

In [None]:
# Move to the source directory; presumably so the project-local imports below resolve - TODO confirm.
cd ../src

## Imports

In [None]:
import os
import torch

# Expose only GPU 0 to CUDA, then query its name (also acts as a check that a GPU is reachable).
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
torch.cuda.get_device_name(0)

In [None]:
import os
import gc
import ast
import glob
import json
import cudf
import pylcs
import torch
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize

# Set up pandarallel (used below through DataFrame.parallel_apply), without progress bars
# and without the memory file system.
pandarallel.initialize(progress_bar=False, use_memory_fs=False)
# Display up to 100 columns when rendering a DataFrame.
pd.options.display.max_columns = 100

In [None]:
from params import *

from data.preparation import prepare_train_data, prepare_triplet_data

from utils.logger import prepare_log_folder, create_logger, save_config
from utils.metrics import *
from inference.knn import *

### Params

In [None]:
# Folder of the first-level experiment whose artifacts are read below
# (df_pairs_*.csv, fts_val_*.npy, df_p_*.csv).
EXP_FOLDER = LOG_PATH + "2022-05-19/4/"  # 1 ep, d=256, large

# Only used here to build file names (df_pairs_{N}.csv, df_p_{N}.csv);
# presumably the number of neighbours retrieved per id - TODO confirm.
N_NEIGHBORS = 50

# Fold of folds_2.csv whose rows are kept, and index of the fts_val_{FOLD}.npy file.
FOLD = 0

# When True, feature computation is skipped and the cached df_p_{N_NEIGHBORS}.csv is loaded.
PRECOMPUTED = True

## Data

In [None]:
def reduce_mem_usage(df, verbose=True):
    """
    Downcast numeric columns to the smallest dtype whose range contains their values.

    Only columns whose dtype is int16/32/64 or float16/32/64 are considered.
    Bounds are compared strictly, so a column touching the limit of a dtype is
    stored in the next larger one. Integer columns fitting none of the
    candidates (e.g. empty columns) are left untouched; float columns fall
    back to float64. Note that float16 keeps few significant digits.

    Args:
        df (pandas DataFrame): Dataframe to compress. It is modified in place.
        verbose (bool, optional): Whether to print the memory report. Defaults to True.

    Returns:
        pandas DataFrame: The same dataframe object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()

        if str(col_type)[:3] == 'int':
            target = next(
                (t for t in int_types if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max),
                None,
            )
        else:
            target = next(
                (t for t in float_types if c_min > np.finfo(t).min and c_max < np.finfo(t).max),
                np.float64,
            )

        if target is not None:
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem > 0:  # Avoid dividing by zero on a dataframe that uses no memory
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Rebuild the pair table from scratch; skipped when the cached features are used.
if not PRECOMPUTED:
    df = prepare_train_data(root=DATA_PATH)
#     df = cudf.from_pandas(df)

    # Attach the fold of each id and index the rows by id.
    folds = pd.read_csv(DATA_PATH + "folds_2.csv")[['id', 'fold']]
    df = df.merge(folds, how="left", on="id").set_index("id")

    # Keep only the rows of the selected fold and number them 0..n-1.
    # 'idx' is later used to index rows of the tf-idf matrices and of nn_preds.
    df = df[df['fold'] == FOLD]
    df['idx'] = np.array(range(len(df)))
    
    df = cudf.from_pandas(df)
    
    # Candidate pairs produced by the first-level experiment.
    df_p = cudf.read_csv(EXP_FOLDER + f'df_pairs_{N_NEIGHBORS}.csv')

    # Bring the columns of both sides of each pair; the second merge suffixes
    # overlapping columns with _1 / _2.
    df_p = df_p.merge(df, how="left", left_on="id_1", right_on="id")
    df_p = df_p.merge(df, how="left", left_on="id_2", right_on="id", suffixes=("_1", "_2"))
    
    # Saved features for the fold, cast to half precision.
    # Presumably one row per row of df, in 'idx' order - TODO confirm.
    nn_preds = np.load(EXP_FOLDER + f"fts_val_{FOLD}.npy").astype(np.float16)

In [None]:
# Ground-truth matches, indexed by id in the cells below.
# The context manager closes the file once it has been parsed.
with open(DATA_PATH + "gt.json", 'r') as f:
    gt_matches = json.load(f)

## Features
- Add features based on model preds
- Categories clusters using pois (leaky ?)

In [None]:
import pylcs
import difflib
import Levenshtein

def compute_lcs(a, b):
    """
    Thin wrapper returning pylcs.lcs(a, b) for two strings.
    """
    lcs_value = pylcs.lcs(a, b)
    return lcs_value

def compute_gesh(a, b):
    """
    Similarity ratio of two strings computed by difflib.SequenceMatcher, without junk filter.

    Returns:
        float: Ratio in [0, 1], 1 meaning identical sequences.
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def compute_levenshtein(a, b):
    """
    Thin wrapper returning Levenshtein.distance(a, b) for two strings.
    """
    edit_distance = Levenshtein.distance(a, b)
    return edit_distance

def compute_jaro(a, b):
    """
    Thin wrapper returning Levenshtein.jaro_winkler(a, b) for two strings.
    """
    jaro_winkler_value = Levenshtein.jaro_winkler(a, b)
    return jaro_winkler_value

def compute_levenshtein_n(a, b):
    """
    Levenshtein distance divided by the length of the longer string.

    Args:
        a (str): First string.
        b (str): Second string.

    Returns:
        float: Normalized distance. Two empty strings are identical, so 0.0 is
            returned for them instead of dividing by zero.
    """
    longest = max(len(a), len(b))
    if longest == 0:
        return 0.0
    return Levenshtein.distance(a, b) / longest

def compute_string_distance(fct, a, b):
    """
    Apply a string comparison function to the lowercased inputs.

    Args:
        fct (callable): Function taking two strings.
        a (str): First string.
        b (str): Second string.

    Returns:
        Result of fct(a.lower(), b.lower()), or NaN if either string is empty.
    """
    if a == "" or b == "":
        return np.nan

    lowered_a, lowered_b = a.lower(), b.lower()
    return fct(lowered_a, lowered_b)

In [None]:
def normalize(ft, a, b):
    """
    Divide a feature value by the length of the longer of two strings.

    Args:
        ft (number): Feature value computed from a and b.
        a (str): First string.
        b (str): Second string.

    Returns:
        ft / max(len(a), len(b)), or ft unchanged if either string is empty.
    """
    if a == "" or b == "":
        return ft

    longest = max(len(a), len(b))
    return ft / longest

In [None]:
def haversine_distance(lats1, lats2, longs1, longs2):
    """
    Haversine formula on coordinates given in degrees, scaled by 6371.

    Beware of the argument order: both latitudes come first, then both
    longitudes, unlike manhattan_distance and euclidian_distance.

    Args:
        lats1 (number or array): Latitudes of the first points.
        lats2 (number or array): Latitudes of the second points.
        longs1 (number or array): Longitudes of the first points.
        longs2 (number or array): Longitudes of the second points.

    Returns:
        Distance(s), in kilometres if 6371 is read as the Earth radius in km.
    """
    earth_radius = 6371

    half_dlat = np.radians(lats2 - lats1) / 2
    half_dlon = np.radians(longs2 - longs1) / 2
    cos_product = np.cos(np.radians(lats1)) * np.cos(np.radians(lats2))

    a = np.sin(half_dlat) ** 2 + cos_product * np.sin(half_dlon) ** 2
    angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return earth_radius * angle


def manhattan_distance(lat1, long1, lat2, long2):
    """
    Sum of the absolute latitude and longitude differences (L1 distance on raw coordinates).
    """
    lat_gap = np.abs(lat2 - lat1)
    long_gap = np.abs(long2 - long1)
    return lat_gap + long_gap


def euclidian_distance(lat1, long1, lat2, long2):
    """
    Square root of the summed squared latitude and longitude differences (L2 distance on raw coordinates).
    """
    lat_gap = lat2 - lat1
    long_gap = long2 - long1
    return np.sqrt(lat_gap ** 2 + long_gap ** 2)

In [None]:
def is_included(a, b):
    """
    Case-insensitive check that one string is a substring of the other.

    Returns:
        bool, or NaN if either string is empty.
    """
    if a != "" and b != "":
        lowered_a, lowered_b = a.lower(), b.lower()
        return lowered_a in lowered_b or lowered_b in lowered_a

    return np.nan

def is_equal(a, b):
    """
    Case-insensitive equality of two strings.

    Returns:
        bool, or NaN if either string is empty.
    """
    if a != "" and b != "":
        return a.lower() == b.lower()

    return np.nan

In [None]:
def tf_idf_similarity(pairs, matrix):
    """
    Row-wise dot product between the matrix rows selected by each pair.

    Args:
        pairs (dataframe): Pairs with 'idx_1' and 'idx_2' row indices into matrix.
        matrix: Matrix supporting row indexing and .multiply.
            Presumably a sparse tf-idf matrix living on GPU, as the result is fetched with .get() - TODO confirm.

    Returns:
        Result of .get() on the flattened per-pair sums.
    """
    left_rows = matrix[pairs['idx_1'].values.tolist()]
    right_rows = matrix[pairs['idx_2'].values.tolist()]

    # Element-wise product summed over the columns = dot product of the two rows
    row_sums = left_rows.multiply(right_rows).sum(axis=1)
    return row_sums.ravel().get()

In [None]:
def nn_distance(pairs, matrix):
    """
    Mean squared difference between the matrix rows selected by each pair.

    Args:
        pairs (dataframe): Pairs with 'idx_1' and 'idx_2' row indices into matrix.
        matrix (array): Array with one row per indexed item.

    Returns:
        array: One value per pair; the computation stays in the dtype of matrix.
    """
    left_rows = matrix[pairs['idx_1'].values.tolist()]
    right_rows = matrix[pairs['idx_2'].values.tolist()]

    squared_gap = (left_rows - right_rows) ** 2
    return squared_gap.mean(1)

## Compute

### Init

In [None]:
# String comparison functions, keyed by the suffix used in the feature names.
string_dist_fcts = {
    "lcs": compute_lcs,
    "gesh": compute_gesh,
    "levenshtein": compute_levenshtein,
    "jaro": compute_jaro,
}

# Functions for which an extra length-normalized "_n" feature is computed.
TO_NORMALIZE = ["lcs", "levenshtein"]

# Text columns used for tf-idf similarities and for string distances.
TF_IDF_COLS = ['name', 'categories', 'address', 'url']
STRING_DIST_COLS = ['name', "categories", 'address', 'url', ]

# (column, comparison) couples producing the boolean "same_{column}" features.
FEATURES_SAME = [
    ('country', is_equal),
    ('state', is_equal),
    ('zip', is_included),
    ('phone', is_included),
    ('city', is_included),
    ('categories', is_included),
]

# Names of the computed features, filled by the cells below.
FEATURES = []

### NN Features

In [None]:
# Distance between the nn_preds rows of the two sides of each pair.
if not PRECOMPUTED:
    df_p['nn_dist'] = nn_distance(df_p, nn_preds)
    FEATURES.append('nn_dist')

### Tf-idf

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer

# Fit one tf-idf vectorizer per text column on df and keep the transformed matrices.
if not PRECOMPUTED:
    tf_idf_mats = {}

    for col in TF_IDF_COLS:
        tf_idf = TfidfVectorizer()
        # Missing values are replaced by the placeholder 'noname' before vectorizing.
        tf_idf_mat = tf_idf.fit_transform(df[col].fillna('noname'))
        tf_idf_mats[col] = tf_idf_mat

In [None]:
# One "{col}_tf_idf_sim" feature per text column, computed from the matrices fitted above.
if not PRECOMPUTED:
    for col in TF_IDF_COLS:
        print(f'Computing feature {col}_tf_idf_sim')
        df_p[f"{col}_tf_idf_sim"] = tf_idf_similarity(df_p, tf_idf_mats[col])
        FEATURES.append(f"{col}_tf_idf_sim")

### Inclusion / equality features

In [None]:
# Move the pairs to pandas (the next cells use parallel_apply) and downcast numeric columns.
if not PRECOMPUTED:
    try:
        df_p = df_p.to_pandas()
        df_p = reduce_mem_usage(df_p)
    except AttributeError:
        # Raised when df_p has no to_pandas method, e.g. if this cell is run twice.
        print('df_p already in pandas')

In [None]:
# Boolean "same_{col}" features, stored as floats so that NaN marks empty inputs.
if not PRECOMPUTED:
    for col, fct in FEATURES_SAME:
        print(f'Computing feature same_{col}')
        # Rows are Series labelled "{col}_1" / "{col}_2": access them by position with .iloc,
        # since integer keys on a label-indexed Series are deprecated positional access.
        df_p[f"same_{col}"] = df_p[[f"{col}_1", f"{col}_2"]].parallel_apply(
            lambda x: fct(x.iloc[0], x.iloc[1]), axis=1
        ).astype(float)

        FEATURES.append(f"same_{col}")

### String dists

In [None]:
# One feature per (text column, string function) couple, plus a length-normalized
# "_n" variant for the functions listed in TO_NORMALIZE.
if not PRECOMPUTED:
    for col in tqdm(STRING_DIST_COLS):
        for fct_name in string_dist_fcts:
            print(f"Column : {col}  -  Function : {fct_name}")
            # Rows are label-indexed Series: access them by position with .iloc,
            # since integer keys on such a Series are deprecated positional access.
            df_p[col + "_" + fct_name] = df_p[[col + "_1", col + "_2"]].parallel_apply(
                lambda x: compute_string_distance(string_dist_fcts[fct_name], x.iloc[0], x.iloc[1]), axis=1
            )
            FEATURES.append(col + "_" + fct_name)

            if fct_name in TO_NORMALIZE:
                # Row layout: (feature value, left string, right string)
                df_p[col + "_" + fct_name + "_n"] = df_p[
                    [col + "_" + fct_name, col + "_1", col + "_2"]
                ].parallel_apply(
                    lambda x: normalize(x.iloc[0], x.iloc[1], x.iloc[2]), axis=1
                )
                FEATURES.append(col + "_" + fct_name + "_n")

### Position features

In [None]:
# Coordinate-based features of each pair.
if not PRECOMPUTED:
    # Split the four coordinate columns into four column vectors
    lats_1, longs_1, lats_2, longs_2 = np.hsplit(
        df_p[['latitude_1', 'longitude_1', 'latitude_2', 'longitude_2']].values, 4
    )

    df_p['longitude_diff'] = np.abs(longs_2 - longs_1)
    df_p['latitude_diff'] = np.abs(lats_2 - lats_1)

    # haversine_distance expects (lats1, lats2, longs1, longs2), unlike the two functions
    # below; keywords prevent latitudes and longitudes from being mixed up.
    # NOTE: this changes the feature values compared to tables cached with the former
    # positional call, so cached features and models trained on them must be regenerated.
    df_p['haversine_distance'] = haversine_distance(
        lats1=lats_1, lats2=lats_2, longs1=longs_1, longs2=longs_2
    )
    df_p['manhattan_distance'] = manhattan_distance(lats_1, longs_1, lats_2, longs_2)

    df_p['euclidian_distance'] = euclidian_distance(lats_1, longs_1, lats_2, longs_2)
    df_p['euclidian_distance'] = np.clip(df_p['euclidian_distance'], 0, 10000)

    FEATURES += [
        'longitude_diff', 'latitude_diff', 'haversine_distance', 'manhattan_distance', 'euclidian_distance'
    ]

### Load

In [None]:
# Identifier / label columns kept next to the features.
# FEATURES is evaluated here, so it is empty at this point when PRECOMPUTED is True.
TO_KEEP = ['id_1', 'id_2', 'point_of_interest_1', 'point_of_interest_2', 'match'] + FEATURES

if not PRECOMPUTED:
    # Drop the raw columns that are neither identifiers nor features.
    df_p.drop([c for c in df_p.columns if c not in TO_KEEP], axis=1, inplace=True)
    # df_p.to_csv(EXP_FOLDER + f'df_p_{N_NEIGHBORS}.csv', index=False)
else:
    df_p = cudf.read_csv(EXP_FOLDER + f'df_p_{N_NEIGHBORS}.csv')
    # Features are all columns after the first five, fold columns excluded.
    # Assumes the cached CSV starts with the five identifier / label columns - TODO confirm.
    FEATURES = [col for col in df_p.columns[5:] if "fold" not in col]
    
df_p['match'] = df_p['match'].astype(int)

## Folds

TODO : non-leaky splits by considering pairs: 
- GroupKFold on POI, where the validation set considers both the left and right POIs: for a pair (x1, x2) with x1 in the validation split of fold 1, x2 can be seen during training, but never paired with an element of the same POI as x1. This should not be leaky?
- Split before looking for pairs? No, because it is important to have a 600k set to look for pairs in.

In [None]:
from sklearn.model_selection import GroupKFold

# Number of folds of the second-level split.
N_SPLITS = 5

# Attach fold_1 / fold_2 (fold of each side of the pair) unless the table already has them.
if "fold_1" not in df_p.columns:
    # Create the id -> fold file once, grouping by point of interest so that all ids
    # of a given point of interest end up in the same fold.
    # NOTE(review): df is only defined earlier in the `not PRECOMPUTED` branch, so creating
    # the file requires that path to have run.
    if not os.path.exists(DATA_PATH + f"folds_{FOLD}_{N_SPLITS}.csv"):
        gkf = GroupKFold(n_splits=N_SPLITS)
        splits = gkf.split(df, groups=df['point_of_interest'])

        df_split = df.reset_index()[['id', 'point_of_interest']]
        df_split['fold'] = -1

        # Only the validation indices of each split are used: they define the fold of a row.
        for i, (_, val_idx) in enumerate(splits):
            df_split.loc[val_idx, 'fold'] = i

        df_split.to_csv(DATA_PATH + f"folds_{FOLD}_{N_SPLITS}.csv", index=False)
        
    df_split = pd.read_csv(DATA_PATH + f"folds_{FOLD}_{N_SPLITS}.csv")

#     df_split = cudf.read_csv(DATA_PATH + f"folds_{FOLD}_{N_SPLITS}.csv")  
#     try:
#         df_p = cudf.from_pandas(df_p)
#     except TypeError:
#         print('df_p already in pandas')
    
    # Merge the fold of id_1, then the fold of id_2; the second merge suffixes the
    # two 'fold' columns with _1 / _2. The helper 'id' column is dropped each time.
    # NOTE(review): df_split is a pandas dataframe; if df_p is still a cudf dataframe here
    # (PRECOMPUTED path), check that this merge is supported.
    df_p = df_p.merge(df_split[['id', 'fold']], how="left", left_on="id_1", right_on="id")
    df_p.drop('id', axis=1, inplace=True)
    df_p = df_p.merge(df_split[['id', 'fold']], how="left", left_on="id_2", right_on="id", suffixes=("_1", "_2"))
    df_p.drop('id', axis=1, inplace=True)
    df_p.drop([c for c in df_p.columns if c not in TO_KEEP + ['fold_1', 'fold_2']], axis=1, inplace=True)

    # Overwrite the cached table so that it contains the fold columns;
    # the fallback handles a df_p without to_pandas (already a pandas dataframe).
    try:
        df_p.to_pandas().to_csv(EXP_FOLDER + f'df_p_{N_NEIGHBORS}.csv', index=False)
    except AttributeError:
        df_p.to_csv(EXP_FOLDER + f'df_p_{N_NEIGHBORS}.csv', index=False)

## Model


In [None]:
import optuna
from model_zoo.xgb import train_xgb, objective_xgb
from utils.plot import *

In [None]:
# Training function of each supported model, looked up by k_fold through config.model.
TRAIN_FCTS = {
#     "lgbm": train_lgbm,
    "xgb": train_xgb,
#     "xgb_rf": train_xgbrf,
}

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score


def k_fold(
    df,
    config,
    log_folder=None,
):
    """
    Trains one model per fold and gathers the out-of-fold predictions.

    For a given fold, validation pairs are those with either side in the fold and
    training pairs those with neither side in it. A pair can therefore be validated
    in two folds, in which case the prediction of the later fold is kept.
    Predictions are written at the positions given by the index of the validation
    rows, so df is expected to have a default 0..n-1 index.

    Args:
        df (pandas or cudf DataFrame): Pairs, with 'fold_1' and 'fold_2' columns.
        config (Config): Provides model, n_folds, features, target and params.
        log_folder (str or None, optional): Folder to save models and results to.
            If None, nothing is saved and the function returns after the first fold.
            Defaults to None.

    Returns:
        np array [len(df)]: Out-of-fold predictions.
        list: Trained models.
        pandas DataFrame or Series: Feature importances, averaged over folds when
            log_folder is given, of the first fold otherwise.
    """
    train_fct = TRAIN_FCTS[config.model]
    is_pandas = isinstance(df, pd.DataFrame)

    ft_imps, models = [], []
    pred_oof = np.zeros(len(df))

    for fold in range(config.n_folds):
        print(f"\n-------------   Fold {fold + 1} / {config.n_folds}  -------------\n")

        df_train = df[(df['fold_1'] != fold) & (df['fold_2'] != fold)].reset_index(drop=True)
        df_val = df[(df['fold_1'] == fold) | (df['fold_2'] == fold)]

        # Index of the validation rows, copied to host memory when df is not a pandas dataframe
        val_idx = df_val.index.values if is_pandas else df_val.index.values.get()

        pred_val, model = train_fct(
            df_train, df_val, None, config.features, config.target, params=config.params
        )

        pred_oof[val_idx] = pred_val
        ft_imp = pd.DataFrame(
            pd.Series(model.feature_importances_, index=config.features), columns=["importance"]
        )

        ft_imps.append(ft_imp)
        models.append(model)

        # NOTE(review): without a log folder only the first fold is trained.
        # This looks like a deliberate debug shortcut - confirm.
        if log_folder is None:
            return pred_oof, models, ft_imp

        # The context manager closes the file once the model is written
        with open(log_folder + f'{config.model}_{fold}.pkl', 'wb') as f:
            pickle.dump(model, f)

    # Same host-copy idiom as for the index above
    y = df[config.target].values if is_pandas else df[config.target].values.get()
    auc = roc_auc_score(y, pred_oof)
    print(f"\n Local CV is {auc:.4f}")

    ft_imp = pd.concat(ft_imps, axis=1).mean(1)
    ft_imp.to_csv(log_folder + 'ft_imp.csv')
    np.save(log_folder + "pred_oof.npy", pred_oof)

    return pred_oof, models, ft_imp

In [None]:
# Training runs on a pandas dataframe. df_p is already one when the features were
# recomputed above, in which case it has no to_pandas method and must not be converted.
if not isinstance(df_p, pd.DataFrame):
    df_p = df_p.to_pandas()
df_p['euclidian_distance'] = np.clip(df_p['euclidian_distance'], 0, 10000)

In [None]:
OPTIMIZE = False  # Run the optuna parameter search instead of using the hard-coded parameters
TRAIN = False  # Train the models; otherwise saved predictions are loaded further below
DEBUG = False  # When training, do not create a log folder

### Param tweaking

In [None]:
# Fold used as validation set during the parameter search.
OPT_FOLD = 2

if OPTIMIZE:
    # Same split rule as in k_fold: validation pairs have either side in OPT_FOLD,
    # training pairs have neither.
    df_train_opt = df_p[(df_p['fold_1'] != OPT_FOLD) & (df_p['fold_2'] != OPT_FOLD)].reset_index(drop=True)
    df_val_opt = df_p[
        (df_p['fold_1'] == OPT_FOLD) | (df_p['fold_2'] == OPT_FOLD)
    ].reset_index(drop=True)

    # Maximize the value returned by objective_xgb over 100 trials.
    study = optuna.create_study(direction="maximize")
    objective = lambda x: objective_xgb(x, df_train_opt, df_val_opt, FEATURES, "match")
    study.optimize(objective, n_trials=100)

    params = study.best_params
    print("Final params :\n", study.best_params)

else:
    # Hard-coded parameters passed to the training function when no search is run.
    params = {
        'max_depth': 10,
        'learning_rate': 0.05,
        'min_child_weight': 1,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        "colsample_bytree": 0.75,
        "subsample": 0.75,
    }

### Training

In [None]:
class Config:
    # Training configuration, read by k_fold and saved with save_config.
    # Values are captured from the notebook globals when the class is defined.

    # First-level experiment folder at definition time
    exp_folder = EXP_FOLDER

    fold = FOLD
    # Number of folds iterated by k_fold
    n_folds = 5
    n_neighbors = N_NEIGHBORS
    
    # Feature columns and target column
    features = FEATURES
    target = "match"

    # Key of the training function in TRAIN_FCTS, and its parameters
    model = "xgb"
    params = params

In [None]:
if TRAIN:
    # Without a log folder (DEBUG), k_fold saves nothing and stops after the first fold.
    log_folder = None
    if not DEBUG:
        log_folder = prepare_log_folder(LOG_PATH + "lvl_2/")
        print(f'Logging results to {log_folder}')
        save_config(Config, log_folder + 'config')
        create_logger(directory=log_folder, name="logs.txt")

    pred_oof, models, ft_imp = k_fold(df_p, Config, log_folder=log_folder)

### Retrieve

In [None]:
# From here on EXP_FOLDER points to the second-level experiment whose results are loaded below.
# Config.exp_folder keeps the value it had when the class was defined.
EXP_FOLDER = LOG_PATH + "lvl_2/" + "2022-05-25/6/"

In [None]:
# Load the saved out-of-fold predictions and feature importances when no training was run.
if not TRAIN:
    pred_oof = np.load(EXP_FOLDER + "pred_oof.npy")
    # The first column of the CSV is unnamed, hence the 'Unnamed: 0' column used as index.
    ft_imp = pd.read_csv(EXP_FOLDER + "ft_imp.csv").set_index('Unnamed: 0')

## Results

In [None]:
# try:
#     df_p = cudf.from_pandas(df_p)
# except:
#     print('df_p already on gpu')
#     pass

In [None]:
# Targets as a host array; the non-pandas branch uses the same .values.get() idiom
# as the rest of the notebook.
y = df_p[Config.target].values if isinstance(df_p, pd.DataFrame) else df_p[Config.target].values.get()

# Confusion matrix of the predictions thresholded at 0.5
plot_confusion_matrix(
    pred_oof > 0.5,
    y,
    display_labels=['No Match', 'Match'],
#     normalize="pred"
)

plt.title(f"AUC = {roc_auc_score(y, pred_oof) :.4f}")
plt.show()

In [None]:
def preds_to_matches(preds, df, threshold=0.5):
    """
    Converts pairwise scores to per-id lists of predicted matches.

    Pairs scored strictly above the threshold are kept, and every id appearing
    as 'id_1' is matched with itself with a score of 1. The input dataframe is
    left untouched.

    Args:
        preds (array): Scores, one per row of df.
        df (pandas or cudf DataFrame): Pairs, with 'id_1' and 'id_2' columns.
        threshold (float, optional): Minimum score (exclusive) to keep a pair. Defaults to 0.5.

    Returns:
        dict: Matched ids for each id_1.
        dict: Corresponding scores for each id_1, in the same order.
    """
    # Decide from the dataframe that was passed, not from a notebook global
    gpu = not isinstance(df, pd.DataFrame)

    identity = df[['id_1']].drop_duplicates(keep="first").copy()
    identity['id_2'] = identity['id_1']
    identity['pred'] = 1

    # Work on a copy so that the caller's dataframe does not gain a 'pred' column
    pairs = df[['id_1', 'id_2']].copy()
    pairs['pred'] = preds
    pairs = pairs[pairs['pred'] > threshold].reset_index(drop=True)

    if gpu:
        pairs = cudf.concat([pairs, identity])
        dfg = pairs.groupby('id_1').agg(list).to_pandas()
    else:
        pairs = pd.concat([pairs, identity])
        dfg = pairs.groupby('id_1').agg(list)

    # Make sure the aggregated values are plain lists
    dfg['id_2'] = dfg['id_2'].apply(list)
    dfg['pred'] = dfg['pred'].apply(list)
    dfg = dfg.to_dict()
    return dfg['id_2'], dfg['pred']

In [None]:
# Predicted matches and their scores per id, using a low threshold of 0.01.
preds, scores = preds_to_matches(pred_oof, df_p, threshold=0.01)

# preds, scores = preds_to_matches(df_p['match'].values.get(), df_p.copy(), threshold=0.5)  # Ref

In [None]:
# Score the predicted matches against the ground truth with compute_iou.
print(f"CV IoU : {compute_iou(preds, gt_matches) :.4f}")

In [None]:
# Proportion of ground-truth matches found, and the missed matches (inspected in the FNs section).
found_prop, missed = compute_found_prop(preds, gt_matches)

# Total number of predicted matches over all ids
n_matches = sum([len(preds[k]) for k in preds])

print(f"Found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.")

In [None]:
# Distribution of the number of matches per id, capped at 10 for readability.
plt.figure(figsize=(15, 5))

# Left: ground truth, restricted to the ids that have predictions
plt.subplot(1, 2, 1)
sns.countplot(x=[min(len(gt_matches[k]), 10) for k in gt_matches if k in preds])
plt.title('Number of gt matches per id')

# Right: predictions
plt.subplot(1, 2, 2)
sns.countplot(x=[min(len(preds[k]), 10) for k in preds])
plt.title('Number of pred matches per id')
plt.show()

### Feature importance

In [None]:
# Plot the feature importances computed during training or loaded above.
plot_importances(ft_imp)

### FNs

In [None]:
# Reload the training rows for the inspection loop below.
# NOTE(review): that loop looks rows up by id with df.loc; elsewhere in this notebook
# set_index("id") is applied after loading - confirm the index is the id here.
df = prepare_train_data(root=DATA_PATH)

In [None]:
# Display a few ids with missed matches: the query row, its ground-truth matches, and the missed ones.
# missed is indexed by the position of the id in preds.
for i, id_ in enumerate(preds):
    # Skip ids for which nothing was missed
    if not len(list(missed[i])):
        continue

    print('Query')
    display(df.loc[[id_]])

    print('Target')
    display(df.loc[[g for g in gt_matches[id_] if g != id_]])

    print('Missed')
    display(df.loc[list(missed[i])])

#     print('Preds')
#     display(df.loc[preds_matches[df.index[i]]].head(5))

#     break
    print('-' * 50)
    
    # Stop after the first displayed id located beyond position 1
    if i > 1:
        break

### Post-processing

In [None]:
import copy

def limit_numbers(preds, scores, n=2):
    """
    Keeps at most the n best-scored matches of each id.

    Args:
        preds (dict): Matched ids for each id.
        scores (dict): Scores of the matches, aligned with preds.
        n (int, optional): Maximum number of matches to keep per id. Defaults to 2.

    Returns:
        dict: Copy of preds where lists longer than n only hold their n
            highest-scored matches, sorted by decreasing score.
    """
    preds_pp = copy.deepcopy(preds)
    for k, candidates in preds.items():
        if len(candidates) > n:
            # Sort by decreasing score; a stable sort keeps the original order between ties
            order = np.argsort(-np.asarray(scores[k], dtype=float), kind="stable")[:n]
            preds_pp[k] = [candidates[i] for i in order]

    return preds_pp

In [None]:
# Cap the number of predicted matches per id at 20.
preds_pp = limit_numbers(preds, scores, 20)

In [None]:
# Score the capped predictions against the ground truth with compute_iou.
print(f"CV IoU : {compute_iou(preds_pp, gt_matches) :.4f}")

In [None]:
import copy

def post_process_matches(matches, mode="append"):
    """
    Makes the matches symmetric: m is in the matches of k iff k is in the matches of m.

    Args:
        matches (dict): Matched ids for each id. It is not modified.
        mode (str, optional): How to fix an asymmetric match k -> m.
            "append" adds k to the matches of m, "remove" removes m from the
            matches of k. Defaults to "append".

    Raises:
        NotImplementedError: If mode is neither "append" nor "remove".

    Returns:
        dict: Symmetric copy of matches. In "append" mode, a matched id with no
            entry of its own gets a new entry holding only the ids pointing to it.
    """
    # Validate upfront, so that a wrong mode is reported even if matches are already symmetric
    if mode not in ("append", "remove"):
        raise NotImplementedError

    new_matches = copy.deepcopy(matches)
    for k, candidates in matches.items():
        for m in candidates:
            # An id without entry is treated as having no matches
            if k in new_matches.get(m, ()):
                continue

            if mode == "remove":
                new_matches[k].remove(m)
            else:
                new_matches.setdefault(m, []).append(k)

    return new_matches

In [None]:
# Make the raw predictions symmetric by appending the missing reverse matches.
# This starts again from preds, not from the capped predictions computed above.
preds_pp = post_process_matches(preds, mode="append")

In [None]:
# Score the symmetric predictions against the ground truth with compute_iou.
print(f"CV IoU : {compute_iou(preds_pp, gt_matches) :.4f}")

In [None]:
# Same report as for the raw predictions, computed on the post-processed matches.
found_prop, missed = compute_found_prop(preds_pp, gt_matches)

# Total number of predicted matches over all ids
n_matches = sum([len(preds_pp[k]) for k in preds_pp])

print(f"Found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.")