In [None]:
# Directory put on sys.path so `fsq_utils` can be imported; the cb_fold_* model
# files and fold_threshs.npy are also loaded from here further down.
UTIL_PATH = "../input/4sq-catboost-models"
# CSV that is read into `df` (the rows to find matches for).
DF_PATH = "../input/foursquare-location-matching/test.csv"
# Sample submission CSV that is read into `subm_df` and filled with the predictions.
SUB_PATH = "../input/foursquare-location-matching/sample_submission.csv"

In [None]:
import numpy as np
import pandas as pd

import sys
sys.path.insert(0, UTIL_PATH) 

from fsq_utils import *


from catboost import CatBoostClassifier
from tqdm import tqdm

## Extracting Nearest Neighbors

In [None]:
# Read the test rows. reset_index() on the freshly read frame keeps the old
# row number as an additional "index" column.
df = pd.read_csv(DF_PATH)
df = df.reset_index()

# Coordinate columns handed to the neighbour search.
coo_cols = ["latitude", "longitude"]

# N is 12 for frames with more than 30000 rows and 2 otherwise.
if len(df) > 30000:
    N = 12
else:
    N = 2

distances, indices = calc_dists_and_indices(df=df, N=N, cols=coo_cols)

## Extracting Essential Similarities

In [None]:
# Text columns handed to textcol_tfidf.
text_cols = ["address", "url", "phone", "name"]

V_textcols = textcol_tfidf(df=df, cols=text_cols)
V_cat = cat_tfidf(df=df)

# train_mode=False because `df` is the test set; what the flag changes
# exactly is defined in fsq_utils.
ids, match_ids, candidate_df = calc_essential_feats(
    df=df,
    indices=indices,
    distances=distances,
    textcol_tfidf=V_textcols,
    cat_tfidf=V_cat,
    train_mode=False,
)

## Extracting Word Similarity Stats
This step uses all available cores, which makes a big difference to execution speed! ⚡

In [None]:
# Columns passed to extract_features for the candidate rows.
PAIR_COLS = ['name', 'categories']
# Defined next to PAIR_COLS but not referenced in this cell.
STR_FE_COLS = ['name', 'categories']

candidate_df = extract_features(
    candidate_df,
    PAIR_COLS,
)

## CV Prediction

In [None]:
def pred_w_model(foldnum, df_x):
    """Predict with the stored CatBoost model of a single fold.

    Args:
        foldnum: Fold identifier; the model is loaded from
            ``UTIL_PATH + '/cb_fold_' + str(foldnum)``.
        df_x: Feature data forwarded to ``predict_proba``.

    Returns:
        Column 1 of the ``predict_proba`` output, i.e. one probability per
        row of ``df_x``.
    """
    classifier = CatBoostClassifier()
    model_file = UTIL_PATH + '/cb_fold_' + str(foldnum)
    classifier.load_model(model_file)
    second_class_proba = classifier.predict_proba(df_x)[:, 1]
    # Release the model before returning; gc_clear() comes from the star
    # import of fsq_utils (presumably triggers garbage collection - confirm).
    del classifier
    gc_clear()
    return second_class_proba

# One prediction array per fold model, for folds 0 through 6.
preds = [
    pred_w_model(fold_idx, candidate_df)
    for fold_idx in tqdm(range(7))
]
# Thresholds stored next to the models; indexed per fold in the voting step.
thresholds = np.load(UTIL_PATH + "/fold_threshs.npy")
thresholds

## Voting with Threshold-Tuning

In [None]:
# Each fold casts a 0/1 vote by comparing its probabilities with its own threshold.
fold_votes = [
    (fold_pred >= thresholds[fold_idx]).astype(int)
    for fold_idx, fold_pred in enumerate(preds)
]
# A candidate row is labelled 1 when at least half of the folds voted for it.
vote_share = np.mean(fold_votes, axis=0)
candidate_df["pred"] = (vote_share >= 0.5).astype(int)

# Attach both ids of every candidate row so matches can be grouped later.
candidate_df["ids"] = ids
candidate_df["match_id"] = match_ids

## Creating the submission

In [None]:
# Keep only the candidate rows that were voted a match.
pred_match_mask = candidate_df.pred == 1

# Map every id to the list of match ids predicted for it.
matches_list = {
    place_id: list(group.match_id.values)
    for place_id, group in candidate_df[pred_match_mask].groupby("ids")
}

In [None]:
# Load the submission template and blank out its "matches" column with NaN.
subm_df = pd.read_csv(SUB_PATH).assign(matches=np.nan)

In [None]:
def set_setter(selected_id):
    """Return the predicted matches of ``selected_id`` as one space-separated string.

    Args:
        selected_id: Key looked up in the module-level ``matches_list`` dict.

    Returns:
        The stored match ids joined with single spaces, or ``""`` when
        ``selected_id`` has no entry in ``matches_list``.

    Raises:
        TypeError: If a stored match id is not a string. The former bare
            ``except`` hid this (and any other error) behind an empty result.
    """
    # A missing key is the only expected "no matches" case, so it is handled
    # with an explicit default instead of a catch-all except that would also
    # mask real problems such as an undefined `matches_list`.
    return " ".join(matches_list.get(selected_id, ()))

# progress_apply only exists once tqdm.pandas() has registered it on pandas
# objects. No visible cell does that, so register it here instead of relying
# on a possible side effect of the fsq_utils star import.
tqdm.pandas()
subm_df['matches'] = subm_df['id'].progress_apply(set_setter)

### Adding the self match

In [None]:
# Put each row's own id in front of its predicted matches. rstrip() removes
# the trailing space that is otherwise left when a row has no other matches.
subm_df['matches'] = (subm_df['id'] + " " + subm_df['matches']).str.rstrip()

## The submission

In [None]:
# Display the finished submission frame as the cell output.
subm_df

In [None]:
# Write the submission to the working directory, without the DataFrame index.
subm_df.to_csv("submission.csv", index=False)