#### Code to train models

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src

## Imports

In [None]:
import os
import ast
import glob
import json
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from tqdm.notebook import tqdm
from numerize.numerize import numerize

os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [None]:
from params import *

from data.preparation import prepare_train_data, prepare_triplet_data
from data.dataset import TripletDataset
from data.tokenization import get_tokenizer

from model_zoo.models import SingleTransformer

from utils.logger import Config
from utils.torch import load_model_weights
from utils.metrics import *

from inference.knn import *

## Data

In [None]:
# Fold-split parameter: selects the `folds_{K}.csv` file and bounds the
# per-fold loop (`range(K)`) — presumably the number of CV folds.
K = 2

In [None]:
# Load the training dataframe from DATA_PATH (loader imported from data.preparation).
df = prepare_train_data(root=DATA_PATH)

In [None]:
# Attach each row's fold assignment (left join on "id"), then index the frame by "id".
folds_file = DATA_PATH + f"folds_{K}.csv"
folds = pd.read_csv(folds_file)[['id', 'fold']]

df = df.merge(folds, how="left", on="id")
df = df.set_index("id")

In [None]:
# Ground-truth matches, looked up by sample id in the cells below.
# Use a context manager so the file handle is closed right after parsing.
with open(DATA_PATH + "gt.json", 'r') as f:
    gt_matches = json.load(f)

## Position matches

### Perfs

In [None]:
# Validation subset: rows assigned to fold FOLD_IDX.
FOLD_IDX = 0
df_val = df.loc[df['fold'] == FOLD_IDX]

In [None]:
# Sweep the neighbour count (single max_dist value) on the validation fold:
# report the share of ground-truth matches recovered and save each candidate set.
for max_dist in [0.5]:
    for n_neighbors in [20, 30, 40, 50]:
        print(f'\n -> n_neighbors={n_neighbors} - max_dist={max_dist}')

        dist_matches = get_nearest_neighbors(
            df_val, n_neighbors=n_neighbors, max_dist=max_dist
        )
        found_prop, missed = compute_found_prop(dist_matches, gt_matches)

        # Each candidate list is counted minus one entry (presumably the query itself).
        n_matches = sum(len(candidates) - 1 for candidates in dist_matches.values())
        print(f'Found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.')

        out_file = OUT_PATH + f"dist_matches_{n_neighbors}_{FOLD_IDX}.json"
        with open(out_file, "w") as f:
            json.dump(dist_matches, f)
            print("\n- Saved to ", out_file)

### FPs

In [None]:
# For every sample of every fold, keep up to `lim` neighbour candidates that
# are NOT ground-truth matches (false positives), stored as a space-joined string.
n_neighbors = 100
max_dist = 0.5
lim = 20  # Maximum number of false-positive ids kept per sample.

all_fps = {}
for fold_idx in tqdm(range(K)):
    df_val = df[df['fold'] == fold_idx]
    # max_dist is passed explicitly, as in the recall sweep; it used to be
    # defined in this cell but never forwarded.
    dist_matches = get_nearest_neighbors(
        df_val, n_neighbors=n_neighbors, max_dist=max_dist
    )

    fps = {}
    for id_, candidates in dist_matches.items():
        true_matches = set(gt_matches[id_])
        # Filter in the order returned by get_nearest_neighbors instead of via
        # a set difference: set iteration order for strings varies between
        # interpreter runs, which made the `[:lim]` truncation irreproducible.
        # dict.fromkeys removes duplicates while keeping first-seen order.
        kept = [c for c in dict.fromkeys(candidates) if c not in true_matches]
        fps[id_] = " ".join(kept[:lim])

    # Assumes folds do not share ids, so update() simply merges the per-fold dicts.
    all_fps.update(fps)


In [None]:
# Turn the id -> "fp fp ..." mapping into a two-column frame.
df_fps = pd.DataFrame.from_dict(all_fps, orient="index")
df_fps = df_fps.reset_index()
df_fps.columns = ["id", 'fp_ids']

# Replace the existing fp_ids column with the freshly computed one and save.
triplets = prepare_triplet_data(root=DATA_PATH).drop(columns='fp_ids')
triplets = triplets.merge(df_fps, how="left")
triplets.to_csv(DATA_PATH + 'triplets_v2.csv', index=False)

In [None]:
# Sanity check: per column, True if at least one value is missing.
triplets.isna().max()

## Phone number
- Count encoding for phone numbers that appear too many times?
- Same phone number found twice = match?

In [None]:
# Validation subset used for the phone-number analysis.
FOLD_IDX = 0
in_val_fold = df['fold'] == FOLD_IDX
df_val = df[in_val_fold]

In [None]:
# Histogram of phone-string lengths on the validation fold, with log-scaled counts.
phone_lengths = df_val['phone'].apply(len)

plt.figure(figsize=(15, 5))
sns.countplot(x=phone_lengths)
plt.yscale('log')

In [None]:
# Keep only rows whose phone string is 6 to 24 characters long.
phone_len = df_val['phone'].apply(len)
df_phone = df_val[(phone_len > 5) & (phone_len < 25)]

In [None]:
def find_phone_matches(id_, df):
    """Return the index labels of rows whose phone overlaps with that of `id_`.

    Two phones overlap when one string is contained in the other, so
    identical numbers and prefixed/suffixed variants both count.

    Args:
        id_: Index label of the query row in `df`.
        df: DataFrame with a string `phone` column.

    Returns:
        list: Matching index labels in `df` order, excluding `id_` itself.
    """
    phones = df['phone']
    target = phones[id_]

    matches = [
        label
        for label, phone in phones.items()
        if phone in target or target in phone
    ]
    # The query always matches itself; drop it from the result.
    matches.remove(id_)
    return matches

In [None]:
# Build id -> list of phone-matched ids. Comparisons stay within a country,
# and for "US" rows additionally within a state.
phone_matches = {}

for country, df_phone_c in tqdm(df_phone.groupby("country")):
    if country == "US":
        groups = (state_df for _, state_df in tqdm(df_phone_c.groupby("state")))
    else:
        groups = [df_phone_c]

    for group in groups:
        for id_ in group.index:
            found = find_phone_matches(id_, group)
            # Only samples with at least one match get an entry.
            if found:
                phone_matches[id_] = found

In [None]:
# Display ten randomly drawn samples together with their phone-based matches.
# The key list is built once; the former `keys()[i]` lookup was a dead
# assignment (overwritten immediately) that could also raise IndexError
# when fewer than ten samples had matches.
phone_match_ids = list(phone_matches.keys())
for _ in range(10):
    k = np.random.choice(phone_match_ids)
    display(df.loc[[k] + phone_matches[k]])

In [None]:
# Compare the recall of position-only candidates with position + phone candidates.
for n_neighbors in [20, 30, 40, 50]:
    print(f'\n -> n_neighbors={n_neighbors}\n')

    # Load the candidates saved for the current fold (file name matches the one
    # used when saving); the context manager closes the file after parsing.
    with open(OUT_PATH + f"dist_matches_{n_neighbors}_{FOLD_IDX}.json", 'r') as f:
        position_matches = json.load(f)

    found_prop, missed_pos = compute_found_prop(position_matches, gt_matches)
    n_matches = sum(len(position_matches[k]) - 1 for k in position_matches)
    print(f'Position :\t Found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.')

    # Union of position and phone candidates for every sample that has position
    # candidates (previously iterated over an undefined `preds_matches`).
    merged_matches = {
        k: list(set(candidates + phone_matches.get(k, [])))
        for k, candidates in position_matches.items()
    }
    found_prop, missed = compute_found_prop(merged_matches, gt_matches)
    n_matches = sum(len(merged_matches[k]) - 1 for k in merged_matches)
    print(f'Merged :\t found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.')

## Url
- Count encoding for URLs present too many times ?

In [None]:
# Validation subset used for the URL analysis.
FOLD_IDX = 0
fold_mask = df['fold'] == FOLD_IDX
df_val = df.loc[fold_mask]

In [None]:
# Histogram of URL-string lengths on the validation fold, with log-scaled counts.
url_lengths = df_val['url'].apply(len)

plt.figure(figsize=(15, 5))
sns.countplot(x=url_lengths)
plt.yscale('log')

In [None]:
# Keep only rows whose URL string is longer than 20 characters, then show the count.
url_len = df_val['url'].apply(len)
df_url = df_val[url_len > 20]
len(df_url)

In [None]:
def find_url_matches(id_, df):
    """Return the index labels of rows whose URL overlaps with that of `id_`.

    Two URLs overlap when one string is contained in the other, so
    identical URLs and sub-paths of each other both count.

    Args:
        id_: Index label of the query row in `df`.
        df: DataFrame with a string `url` column.

    Returns:
        list: Matching index labels in `df` order, excluding `id_` itself.
    """
    urls = df['url']
    target = urls[id_]

    matches = [
        label
        for label, candidate in urls.items()
        if candidate in target or target in candidate
    ]
    # The query always matches itself; drop it from the result.
    matches.remove(id_)
    return matches

In [None]:
# Build id -> list of URL-matched ids. Comparisons stay within a country,
# and for "US" rows additionally within a state.
url_matches = {}

for country, df_url_c in tqdm(df_url.groupby("country")):
    if country == "US":
        groups = (state_df for _, state_df in tqdm(df_url_c.groupby("state")))
    else:
        groups = [df_url_c]

    for group in groups:
        for id_ in group.index:
            found = find_url_matches(id_, group)
            # Only samples with at least one match get an entry.
            if found:
                url_matches[id_] = found

In [None]:
# Display one randomly drawn sample together with its URL-based matches.
# The former `keys()[i]` lookup was a dead assignment (overwritten immediately),
# so only the random draw is kept; the key list is built once.
url_match_ids = list(url_matches.keys())
for _ in range(1):
    k = np.random.choice(url_match_ids)
    display(df.loc[[k] + url_matches[k]])

In [None]:
# Compare the recall of position-only, position + phone, and position + phone + url
# candidates, then save the fully merged candidate set for each neighbour count.
for n_neighbors in [20, 30, 40, 50]:
    print(f'\n -> n_neighbors={n_neighbors}\n')

    # Load the candidates saved for the current fold (same fold index as the
    # output file below); the context manager closes the file after parsing.
    with open(OUT_PATH + f"dist_matches_{n_neighbors}_{FOLD_IDX}.json", 'r') as f:
        position_matches = json.load(f)

    found_prop, missed_pos = compute_found_prop(position_matches, gt_matches)
    n_matches = sum(len(position_matches[k]) - 1 for k in position_matches)
    print(f'Position :\t Found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.')

    # Both merges iterate over the samples that have position candidates
    # (previously an undefined `preds_matches`).
    merged_matches = {
        k: list(set(candidates + phone_matches.get(k, [])))
        for k, candidates in position_matches.items()
    }
    found_prop, missed = compute_found_prop(merged_matches, gt_matches)
    n_matches = sum(len(merged_matches[k]) - 1 for k in merged_matches)
    print(f'Pos+Phone :\t found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.')

    merged_matches = {
        k: list(set(candidates + url_matches.get(k, []) + phone_matches.get(k, [])))
        for k, candidates in position_matches.items()
    }
    found_prop, missed = compute_found_prop(merged_matches, gt_matches)
    n_matches = sum(len(merged_matches[k]) - 1 for k in merged_matches)
    print(f'Pos+Phone+Url :\t found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.')

    out_file = OUT_PATH + f"dist-phone-url_matches_{n_neighbors}_{FOLD_IDX}.json"
    with open(out_file, "w") as f:
        json.dump(merged_matches, f)
        print("\n- Saved to ", out_file)