In [None]:
# Debug switch: when True the notebook works on fold 0 of train.csv (ground truth is built) and
# prints validation metrics; when False it reads test.csv and no ground truth is available.
DEBUG = True

#### Code to perform inference

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src

## Imports

In [None]:
import os
# Only expose GPU 0 to this process; set before torch is imported.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

import torch
# Displayed as the cell output: name of CUDA device 0.
torch.cuda.get_device_name(0)

In [None]:
import os
import gc
import ast
import cudf
import glob
import json
import torch
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize
from sklearn.metrics import roc_auc_score
from cuml.feature_extraction.text import TfidfVectorizer

# Set up pandarallel (needed for the .parallel_apply calls further down), without progress bars.
pandarallel.initialize(progress_bar=False, use_memory_fs=False)
# Show up to 500 columns when displaying wide frames.
pd.options.display.max_columns = 500

In [None]:
from params import *

from inference.knn import *
from inference.predict import predict

from data.features import *
from data.preparation import *
from data.post_processing import *
from data.dataset import SingleDataset
from data.tokenization import get_tokenizer

from model_zoo.models import SingleTransformer

from utils.logger import Config
from utils.torch import load_model_weights
from utils.metrics import *

## Data & params

In [None]:
# Load the points to match, indexed by id.
if DEBUG:
    # df = cudf.read_csv(DATA_PATH + "test.csv").set_index('id')
    # Validation run: take the training data and attach the fold of each point.
    df = cudf.read_csv(DATA_PATH + "train.csv").set_index('id')
    folds = cudf.read_csv(DATA_PATH + "folds_2.csv")[['id', 'fold']]
    # NOTE(review): "id" is the index of df at this point, so this merge relies on joining on an
    # index level name — confirm this is supported by the cudf version in use.
    df = df.merge(folds, how="left", on="id").set_index("id")

    # Only fold 0 is kept.
    df = df[df['fold'] == 0]
    
    # Ground-truth matches, used by the DEBUG metric cells below.
    gt_matches = build_gt(df.reset_index().to_pandas(), save=False)
else:
    df = cudf.read_csv(DATA_PATH + "test.csv").set_index('id')
    gt_matches = None
    
# Sort by id so that the row order is deterministic.
df.sort_index(inplace=True)

In [None]:
# Candidate generation settings.
# MAX_DIST is forwarded to get_nearest_neighbors as max_dist (unit not visible here — TODO confirm).
# MAX_DIST = None
MAX_DIST = 0.5
# Number of neighbours, shared by the position search, the embedding search and create_pairs.
NEIGHBORS = 20

In [None]:
# When True, convert_japanese_alphabet is applied to df before feature engineering.
CONVERT_JAP = True

In [None]:
# Folder receiving every artefact written by this notebook (embeddings, pairs, predictions, submission).
OUT_PATH = "../output/"

# Matches

## Position matches

In [None]:
# Position-based candidates for each point; later iterated as a dict {id: list of candidate ids}.
dist_matches = get_nearest_neighbors(df, n_neighbors=NEIGHBORS, max_dist=MAX_DIST)

## Phone matches

In [None]:
# Length of each phone string (missing phones count as 0), stored on df as "phone_len".
phone_lengths = df[['phone']].to_pandas()['phone'].fillna('').apply(len)
df['phone_len'] = phone_lengths

# Keep the points whose phone has strictly between 5 and 25 characters, as a pandas frame.
has_usable_phone = (df['phone_len'] > 5) & (df['phone_len'] < 25)
df_phone = df[has_usable_phone].to_pandas()

In [None]:
# {id: phone-based matches}, only filled for the points that have at least one match.
phone_matches = {}

for country, country_df in tqdm(df_phone.groupby("country")):
    # US phones are only compared within a state, other countries are searched as a whole.
    if country == "US":
        search_groups = (state_df for _, state_df in tqdm(country_df.groupby("state")))
    else:
        search_groups = [country_df]

    for group_df in search_groups:
        for id_ in group_df.index:
            found = find_phone_matches(id_, group_df)
            if len(found):
                phone_matches[id_] = found

In [None]:
# Position candidates of each point, followed by the phone candidates not already listed.
naive_matches = {}
for poi_id, position_candidates in dist_matches.items():
    extra_phone_candidates = [
        cand for cand in phone_matches.get(poi_id, []) if cand not in position_candidates
    ]
    naive_matches[poi_id] = position_candidates + extra_phone_candidates

In [None]:
# Validation only: share of the ground-truth matches covered by the naive candidates.
if DEBUG:
    found_prop, missed_pos = compute_found_prop(naive_matches, gt_matches)
    n_matches = sum(len(candidates) for candidates in naive_matches.values())
    print(f'Found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.')

## Matching NNs

In [None]:
# (name, experiment folder) of the embedding models to run in the loop below. The name decides the
# output file (fts_{name}.npy) and its "+url" / "+noaddress" tags toggle the dataset inputs.
NN_FT_FOLDERS = [
#     ("xlm-large", LOG_PATH + "2022-05-19/4/"),            # 1 ep, d=256, large
#     ("roberta", LOG_PATH + "2022-05-20/1/"),              # roberta-large
    ("xlm-base+url", LOG_PATH + "2022-05-20/2/"),         # base + url
#     ("xlm-large+noaddress", LOG_PATH + "2022-05-20/3/"),  # large + no address
]

# Position of the checkpoint to load among the sorted "*.pt" files of each experiment folder.
FOLD = 0

In [None]:
# Input frame of the embedding models.
df_nn = prepare_nn_data(df)

# The embeddings are later matched to df by position, so both frames must share the same row order.
assert (df_nn.index == df.index.to_pandas()).all(), "Indexes do not match"

In [None]:
# NOTE(review): `config` is only assigned inside the embedding loop of the next cell; unless one of
# the star-imports above happens to provide it, this cell raises NameError on Restart & Run All.
config.max_len

In [None]:
# Compute the embeddings of every point with each configured model and save them to OUT_PATH.
# Embeddings are recomputed on every run: no on-disk cache check is performed.
for name, EXP_FOLDER in NN_FT_FOLDERS:
    # Read the training config of the experiment; the context manager closes the file handle.
    with open(EXP_FOLDER + "config.json", 'r') as f:
        config = Config(json.load(f))

    tokenizer = get_tokenizer(config.name)
    # The tags in the model name decide which text fields are fed to the model.
    dataset = SingleDataset(
        df_nn,
        tokenizer,
        config.max_len,
        use_url="+url" in name,
        use_address=not "+noaddress" in name,
    )

    model = SingleTransformer(
        config.name,
        nb_layers=config.nb_layers,
        no_dropout=config.no_dropout,
        embed_dim=config.embed_dim,
        nb_features=config.nb_features,
    ).cuda()
    model.zero_grad()

    # Load the checkpoint at position FOLD among the sorted weights of the experiment.
    weights = sorted(glob.glob(EXP_FOLDER + "*.pt"))
    model = load_model_weights(model, weights[FOLD])

    preds = predict(model, dataset, config.data_config)

    np.save(OUT_PATH + f"fts_{name}.npy", preds)
    print(f' -> Saved features to "{OUT_PATH}fts_{name}.npy"\n')

    # Release the model and its inputs before moving on to the next one.
    del preds, model, tokenizer, dataset
    gc.collect()
    torch.cuda.empty_cache()

### Matches

In [None]:
# Model whose embeddings are used to build the NN candidates.
# NOTE(review): with NN_FT_FOLDERS as configured above, the embedding loop only writes
# "fts_xlm-base+url.npy"; "fts_xlm-large.npy" has to already exist in OUT_PATH from an earlier run.
NAME = "xlm-large"
# NAME = "xlm-base+url"

preds = np.load(OUT_PATH + f'fts_{NAME}.npy')

In [None]:
# Embedding-based candidates for each point; later iterated as a dict {id: list of candidate ids}.
nn_matches = find_matches(preds, df_nn, NEIGHBORS)

In [None]:
# Validation only: coverage of the embedding candidates alone, then merged with the naive ones.
if DEBUG:
    found_prop, missed_pos = compute_found_prop(nn_matches, gt_matches)
    n_matches = sum(len(candidates) for candidates in nn_matches.values())
    print(f'Found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.')

    # Union of both candidate sources, without duplicates.
    merged_matches = {
        poi_id: list(set(naive_matches[poi_id] + nn_candidates))
        for poi_id, nn_candidates in nn_matches.items()
    }
    found_prop, missed_pos = compute_found_prop(merged_matches, gt_matches)
    n_matches = sum(len(candidates) for candidates in merged_matches.values())
    print(f'Found {found_prop * 100 :.2f}% of matches with {numerize(n_matches)} candidates.')

In [None]:
# Candidate pairs built from both sources (gt_matches is None outside of DEBUG).
df_pairs = create_pairs(nn_matches, naive_matches, NEIGHBORS, gt_matches=gt_matches)

# Saved to disk: the classification part below reads the pairs back from this file.
df_pairs.to_csv(OUT_PATH + "pairs.csv", index=False)

# Classification

In [None]:
def feature_engineering(df, df_p, idx=0, save=False):
    """
    Builds the pairwise features fed to the second-level classifier.

    Each candidate pair is joined with the rows of its two points (columns suffixed "_1" and
    "_2"), then rank, missing-value, embedding-distance, position, TF-IDF, equality and
    string-distance features are added.

    Relies on notebook-level settings: NAN_COLS, NN_FT_FOLDERS, OUT_PATH, TF_IDF_COLS,
    TF_IDF_PARAMS, FEATURES_SAME and STRING_DIST_COLS.

    Args:
        df (DataFrame): Points, joined to the pairs on "id".
        df_p (DataFrame): Candidate pairs. Must contain "id_1", "id_2", "rank", "rank_nn" and "match".
        idx (int, optional): Batch index, only used to name the saved file. Defaults to 0.
        save (bool, optional): Whether to save the features to OUT_PATH. Defaults to False.

    Returns:
        pandas DataFrame: Pairs restricted to the ids, targets and computed features.
        list of str: Names of the feature columns.
    """
    features = []

    # Attach the data of both points of each pair.
    df_p = df_p.merge(df, how="left", left_on="id_1", right_on="id")
    df_p = df_p.merge(df, how="left", left_on="id_2", right_on="id", suffixes=("_1", "_2"))

    # A rank of -1 is turned into a missing value.
    df_p.loc[df_p['rank'] == -1, 'rank'] = np.nan
    df_p.loc[df_p['rank_nn'] == -1, 'rank_nn'] = np.nan

    print('- Computing rank features')
    df_p['rank_nan'] = df_p["rank"].isna().astype(np.uint8)
    df_p['rank_nn_nan'] = df_p["rank_nn"].isna().astype(np.uint8)
    # min / max of the two 0-1 flags act as logical and / or.
    df_p["rank_both_nan"] = df_p[["rank_nan", "rank_nn_nan"]].min(axis=1)
    df_p["rank_any_nan"] = df_p[["rank_nan", "rank_nn_nan"]].max(axis=1)

    features += [
        "rank", "rank_nn",
        "rank_nan", "rank_nn_nan",
        "rank_both_nan", "rank_any_nan",
    ]

    print('- Computing nan features')
    features += compute_nan_features(df_p, NAN_COLS)

    for name, _ in NN_FT_FOLDERS:
        print(f'- Adding features using model {name}')
        # Embeddings are read back from disk and moved to the GPU in half precision.
        nn_preds = np.load(OUT_PATH + f"fts_{name}.npy").astype(np.float16)
        nn_preds = torch.from_numpy(nn_preds).cuda()

        features += compute_nn_distances(df_p, nn_preds, suffix="_" + name)

        # Free the GPU memory before loading the next model's embeddings.
        del nn_preds
        gc.collect()
        torch.cuda.empty_cache()

    print('- Computing position distances')
    features += compute_position_distances(df_p)

    for col in TF_IDF_COLS:
        for ngram_range, analyzer in TF_IDF_PARAMS:
            ft_name = f"{col}_tf_idf_{ngram_range[0]}{ngram_range[1]}_{analyzer}_sim"
            print(f'- Computing feature {ft_name}')

            # The vectorizer is fitted on all the points (df), not only on the current pairs.
            tf_idf = TfidfVectorizer(use_idf=False, ngram_range=ngram_range, analyzer=analyzer)
            tf_idf_mat = tf_idf.fit_transform(df[col].fillna('nan'))

            df_p[ft_name] = tf_idf_similarity(df_p, tf_idf_mat)
            features.append(ft_name)

    # The remaining features are computed with pandas.
    if not isinstance(df_p, pd.DataFrame):
        df_p = df_p.to_pandas()

    for col, fct in FEATURES_SAME:
        print(f'- Computing feature same_{col}')
        # .iloc is used because the rows are labelled "{col}_1" / "{col}_2": integer keys on such a
        # Series are a deprecated positional fallback in recent pandas versions.
        df_p[f"same_{col}"] = df_p[[f"{col}_1", f"{col}_2"]].fillna('').parallel_apply(
            lambda x: fct(x.iloc[0], x.iloc[1]), axis=1
        ).astype(float)

        features.append(f"same_{col}")

    features += compute_string_distances(df_p, STRING_DIST_COLS, verbose=1)

    # Only keep the identifiers, the targets and the features.
    to_keep = ['id_1', 'id_2', 'point_of_interest_1', 'point_of_interest_2', 'match'] + features

    df_p.drop([c for c in df_p.columns if c not in to_keep], axis=1, inplace=True)
    df_p['match'] = df_p['match'].astype(int)

    if save:
        print('\n -> Saving features to :', OUT_PATH + f"df_p_{idx}.csv")
        df_p.to_csv(OUT_PATH + f"df_p_{idx}.csv")

    return df_p, features

In [None]:
def inference_xgb(df_p, exp_folder, debug=False):
    """
    Predicts the match probability of each pair with the pickled models of an experiment.

    In debug mode, the model at position k only predicts the pairs for which "fold_1" or
    "fold_2" equals k, and its predictions are written at those rows. Otherwise, the
    predictions of all the models are averaged.

    Args:
        df_p (DataFrame): Pair features. Must contain "fold_1" and "fold_2" in debug mode.
        exp_folder (str): Folder containing "config.json" and the "*.pkl" models.
        debug (bool, optional): Whether to predict fold by fold. Defaults to False.

    Raises:
        FileNotFoundError: If exp_folder contains no "*.pkl" model.

    Returns:
        np array [len(df_p)]: Predicted probabilities.
    """
    model_paths = sorted(glob.glob(exp_folder + "*.pkl"))
    if not model_paths:
        # Without this check, an all-zero prediction would silently be returned.
        raise FileNotFoundError(f'No "*.pkl" model found in "{exp_folder}"')

    # The config is shared by all the models of the experiment, it is read only once.
    with open(exp_folder + "config.json", 'r') as f:
        config = Config(json.load(f))

    pred_test = np.zeros(len(df_p))

    for fold, model_path in enumerate(model_paths):
        print(f'- Model {model_path.split("/")[-1].split(".")[0]} ')

        # NOTE: unpickling can execute arbitrary code, only load models from a trusted folder.
        with open(model_path, 'rb') as f:
            model = pickle.load(f)

        if debug:
            df_val = df_p[(df_p["fold_1"] == fold) | (df_p["fold_2"] == fold)]

            # Index values are used as positions in pred_test (.get() moves cupy values to host).
            val_idx = (
                df_val.index.values if isinstance(df_val, pd.DataFrame) else df_val.index.values.get()
            )
            pred_test[val_idx] = model.predict_proba(df_val[config.features])[:, 1]
        else:
            pred_test += model.predict_proba(df_p[config.features])[:, 1] / len(model_paths)

    return pred_test

## Data

In [None]:
# Candidate pairs, read back from the file written at the end of the matching part.
pairs = cudf.read_csv(OUT_PATH + "pairs.csv")# .sort_values('id_1', 'id_2')

In [None]:
# Go through pandas for the preprocessing below, then back to cudf.
# The type is checked explicitly (instead of a bare try / except) so that real errors are not hidden.
if not isinstance(df, pd.DataFrame):
    df = df.to_pandas()
df = df.copy()

if CONVERT_JAP:
    df = convert_japanese_alphabet(df)

# df = reduce_mem_usage(df)
# Position of each point in df.
df['idx'] = np.arange(len(df))

df = cudf.from_pandas(df)

## Params
- **TODO:** fit the TF-IDF matrices only once; `feature_engineering` currently refits them for every batch.

In [None]:
# Settings read by feature_engineering.

# Columns passed to compute_nan_features.
NAN_COLS = ['address', 'city', 'state', 'zip', 'url', 'phone']

# Columns for which TF-IDF similarities are computed, one feature per entry of TF_IDF_PARAMS.
TF_IDF_COLS = ['name', 'categories', 'address', 'url']

# (ngram_range, analyzer) settings of the TF-IDF vectorizers.
TF_IDF_PARAMS = [
    ((1, 1), 'word'),  # word unigrams
    ((3, 3), 'char_wb'),  # char trigrams
]

# Columns passed to compute_string_distances.
STRING_DIST_COLS = ['name', "categories", 'address', 'url', 'phone']

# (column, comparison function) pairs: each one produces a "same_{column}" feature by applying
# the function to the values of the two points.
FEATURES_SAME = [
    ('country', is_equal),
    ('state', is_equal),
    ('zip', is_included),
    ('phone', is_included),
    ('city', is_included),
    ('categories', is_included),
]

In [None]:
# Models whose embeddings are used as features; this overrides the NN_FT_FOLDERS of the matching part.
# feature_engineering only uses the names, to load "fts_{name}.npy" from OUT_PATH.
# NOTE(review): neither of these two files is written by the embedding loop as configured above,
# they have to already exist in OUT_PATH.
NN_FT_FOLDERS = [
    ("xlm-large", LOG_PATH + "2022-05-19/4/"),            # 1 ep, d=256, large
#     ("roberta", LOG_PATH + "2022-05-20/1/"),              # roberta-large
#     ("xlm-base+url", LOG_PATH + "2022-05-20/2/"),         # base + url
    ("xlm-large+noaddress", LOG_PATH + "2022-05-20/3/"),  # large + no address
]

In [None]:
# Patterns of embedding-distance feature names.
# NOTE(review): not referenced anywhere else in the notebook as shown — confirm it is still needed.
FT_GROUPS = [
    "nn_dist_l1_*",
    "nn_dist_l2_*",
    "nn_cosine_sim_*",
]

In [None]:
# Number of pairs processed at once by the main loop.
# BATCH_SIZE = 3000000
BATCH_SIZE = 15000000
# BATCH_SIZE = 5000
# Batch boundaries: batch i covers the rows BATCHES[i] (included) to BATCHES[i + 1] (excluded).
BATCHES = list(np.arange(0, len(pairs), BATCH_SIZE)) + [len(pairs)]

In [None]:
# Folder of the second-level models passed to inference_xgb.
EXP_FOLDER = LOG_PATH + "lvl_2/" + "2022-06-21/2/"  # 0.8714 - xgb 20 neighbors fix fewer nn
# FOLD and N_SPLITS select the folds file read in DEBUG mode: "folds_{FOLD}_{N_SPLITS}.csv".
FOLD = 0
N_SPLITS = 5

## Main

In [None]:
# Process the pairs batch by batch: compute the features, predict, and save the predictions.
for i in tqdm(range(len(BATCHES) - 1)):
    print(f' -> Indices {BATCHES[i]} -> {BATCHES[i + 1]}\n')
    pairs_ = pairs.iloc[range(BATCHES[i], BATCHES[i + 1])]

    print('# Feature engineering \n')
    df_p, features = feature_engineering(df, pairs_, idx=i, save=False)
    
    print('\n# Inference \n')

    if DEBUG:  # retrieve folds
        # NOTE(review): this file does not depend on i but is read again for every batch.
        df_split = pd.read_csv(DATA_PATH + f"folds_{FOLD}_{N_SPLITS}.csv")

        # Attach the fold of each point of the pair, as "fold_1" and "fold_2".
        df_p = df_p.merge(df_split[['id', 'fold']], how="left", left_on="id_1", right_on="id")
        df_p.drop('id', axis=1, inplace=True)
        df_p = df_p.merge(
            df_split[['id', 'fold']], how="left", left_on="id_2", right_on="id", suffixes=("_1", "_2")
        )
        df_p.drop('id', axis=1, inplace=True)
    
    pred_test = inference_xgb(df_p, EXP_FOLDER, debug=DEBUG)

    if DEBUG:
        print(f'\nAUC : {roc_auc_score(df_p["match"], pred_test) :.4f}')

    # One prediction file per batch; the index is written too and read back as "Unnamed: 0".
    df_p['preds'] = pred_test
    df_p[["id_1", "id_2", "preds", "match"]].to_csv(OUT_PATH + f"df_preds_{i}.csv")
    
#     del df_p
#     gc.collect()

#     break

## Results


In [None]:
# Stack the per-batch prediction files written by the main loop back into a single frame.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it as a positional argument.
df_preds = pd.concat(
    [
        pd.read_csv(OUT_PATH + f"df_preds_{i}.csv").set_index("Unnamed: 0")
        for i in range(len(BATCHES) - 1)
    ],
    axis=0,
)

In [None]:
# AUC of the pair classifier over all the batches (cell output).
# NOTE(review): presumably only meaningful in DEBUG mode, where ground truth is available — confirm.
roc_auc_score(df_preds['match'], df_preds['preds'])

In [None]:
# Ids of all the points, in the row order of df.
ids = list(df.index.to_pandas().values)

In [None]:
# Threshold passed to preds_to_matches to turn the pair probabilities into matches.
THRESHOLD = 0.55

In [None]:
# Turn the pair predictions into matches per point.
# Note that this rebinds `preds`, which held the embeddings loaded in the matching part.
preds, scores = preds_to_matches(df_preds['preds'], df_preds, threshold=THRESHOLD, ids=ids)

In [None]:
# Score of the matches against the ground truth.
# NOTE(review): gt_matches is None when DEBUG is False and this cell is not guarded by DEBUG.
print(f"CV IoU : {compute_iou(preds, gt_matches) :.4f}")

In [None]:
# Post-process the matches in "append" mode, then score them again.
preds_pp = post_process_matches(preds, mode="append")

# NOTE(review): gt_matches is None when DEBUG is False and this cell is not guarded by DEBUG.
print(f"CV IoU : {compute_iou(preds_pp, gt_matches) :.4f}")

In [None]:
# One row per point: its id (index) and the space-separated ids of its matches.
sub = pd.DataFrame.from_dict({k : " ".join(v) for k, v in preds_pp.items()}, orient="index")
sub.columns = ["matches"]
sub.index.name = "id"

# The ids only live in the index, so it has to be written: with index=False the file contained
# nothing but the "matches" column and the rows could not be attributed to a point.
sub.to_csv(OUT_PATH + "submission.csv", index=True)