# About this notebook
Please save your upvotes for the original author, [ryotayoshinobu](https://www.kaggle.com/ryotayoshinobu): https://www.kaggle.com/code/ryotayoshinobu/foursquare-lightgbm-baseline<br>
and for [KhanhVD](https://www.kaggle.com/duykhanh99), who wrote the [memory-reduction helper function](https://www.kaggle.com/competitions/foursquare-location-matching/discussion/321520).

I tried to run LightGBM on GPU, but it does not work yet, so I implemented the model with XGBoost instead.<br>
You can run experiments on your own, have fun kaggling. 🤗🤗🤗

In [None]:
# Display the installed XGBoost version (the bare expression is the cell output).
import xgboost
xgboost.__version__

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import warnings
import seaborn as sns
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib

In [None]:
class CFG:
    """Global settings for the notebook, accessed as class attributes."""

    seed = 46                      # seed for the `random` / NumPy / hash RNGs
    target = "point_of_interest"   # name of the label column
    n_neighbors = 10               # nearest-neighbour candidates kept per row
    n_splits = 3                   # number of folds / saved fold models
    save_dir = './'                # where models and the submission are written
    # Directory the trained fold models are loaded from. An earlier location was
    # '/content/drive/MyDrive/Kaggle_Competitions/Foursquare_Location_Matching_2022/Runs/GBM_Baseline'
    load_train_test_dir = '../input/foursquaredemoxgboost/'
    # Experiment id: on Google Colab, the first session's name up to its first
    # dot (fetched from the local sessions endpoint); empty everywhere else.
    expID = (
        get("http://172.28.0.2:9000/api/sessions").json()[0]["name"].split(".")[0]
        if "google.colab" in sys.modules
        else ""
    )

# Make sure the output directory exists. os.makedirs(..., exist_ok=True) replaces
# the `!mkdir` shell escape, which reports an error when the directory already
# exists (always the case for the default './') and only works inside IPython.
os.makedirs(CFG.save_dir, exist_ok=True)

# Seed the RNGs used by the notebook.
random.seed(CFG.seed)
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects processes launched afterwards, not the running kernel.
os.environ["PYTHONHASHSEED"] = str(CFG.seed)
np.random.seed(CFG.seed)

# Plot font size, and silence all warnings for the rest of the notebook.
plt.rcParams["font.size"] = 13
warnings.filterwarnings('ignore')

In [None]:
# Load the competition data. The test rows get a placeholder label so that the
# target column (read by add_neighbor_features) exists in the test frame too.
train = pd.read_csv("../input/foursquare-location-matching/train.csv")
test = pd.read_csv("../input/foursquare-location-matching/test.csv")
test[CFG.target] = "TEST"

# train.head(1)

# Divide Train Data into about 600K×2

In [None]:
# kf = GroupKFold(n_splits=2)
# for i, (trn_idx, val_idx) in enumerate(kf.split(train, train[CFG.target], train[CFG.target])):
#     train.loc[val_idx, "set"] = i
# train["set"].value_counts()

# Search Candidates

In [None]:
from sklearn.neighbors import KNeighborsRegressor

def add_neighbor_features(df):
    """Attach the nearest-neighbour candidates of every row, searched per country.

    The text columns are lower-cased (after conversion to ``str``, so missing
    values become the string ``"nan"``), the rows are grouped by country and,
    inside each country, the ``CFG.n_neighbors`` closest rows by
    latitude/longitude are looked up. For every neighbour rank ``i`` the
    following columns are added: ``d_near_{i}`` (distance), ``near_target_{i}``
    (the neighbour's ``CFG.target`` value) and ``near_{c}_{i}`` for each of the
    id/text columns. Countries with fewer rows than ``CFG.n_neighbors`` get the
    missing ranks filled with NaN.

    Args:
        df: DataFrame with the id/text columns listed below, ``latitude``,
            ``longitude`` and the ``CFG.target`` column. It is not modified.

    Returns:
        A new DataFrame (rows regrouped by country, index reset) carrying the
        neighbour columns.
    """
    # Work on a copy so the lower-casing below does not leak into the caller's
    # frame (and no chained-assignment issue arises when a slice is passed in).
    df = df.copy()

    columns = ['id', 'name', 'address', 'city', 'state',
               'zip', 'country', 'url', 'phone', 'categories']
    for c in columns:
        if c != "id":
            df[c] = df[c].astype(str).str.lower()

    dfs = []
    # "country" is a string column at this point, so rows with a missing
    # country form their own "nan" group instead of being dropped by groupby.
    for country, country_df in tqdm(df.groupby("country")):
        country_df = country_df.reset_index(drop=True)
        # A country cannot supply more neighbours than it has rows.
        n_found = min(len(country_df), CFG.n_neighbors)
        coords = country_df[['latitude', 'longitude']]

        # NOTE(review): scikit-learn's haversine metric expects [lat, lon] in
        # radians, while the columns are passed as loaded (presumably degrees —
        # confirm). Left unchanged because the saved fold models were
        # presumably trained on features computed exactly this way.
        knn = KNeighborsRegressor(n_neighbors=n_found,
                                  metric='haversine', n_jobs=-1)
        knn.fit(coords, country_df.index)
        dists, nears = knn.kneighbors(coords, return_distance=True)

        targets = country_df[CFG.target].values
        for i in range(n_found):
            country_df[f"d_near_{i}"] = dists[:, i]
            country_df[f"near_target_{i}"] = targets[nears[:, i]]
            for c in columns:
                country_df[f"near_{c}_{i}"] = country_df[c].values[nears[:, i]]

        # Pad the ranks this country could not fill so every group ends up
        # with the same set of columns.
        for i in range(n_found, CFG.n_neighbors):
            country_df[f"d_near_{i}"] = np.nan
            country_df[f"near_target_{i}"] = np.nan
            for c in columns:
                country_df[f"near_{c}_{i}"] = np.nan

        dfs.append(country_df)
    df = pd.concat(dfs).reset_index(drop=True)
    return df

# train = pd.concat([
#     add_neighbor_features(train[train["set"]==0]), 
#     add_neighbor_features(train[train["set"]==1]), 
# ])
# Attach the per-country nearest-neighbour candidate columns to the test rows.
test = add_neighbor_features(test)

# train.head(1)

# Create Target

In [None]:
# for i in range(CFG.n_neighbors):
#     train.loc[train[CFG.target]==train[f"near_target_{i}"], "target"] = i
# train.head()

In [None]:
# plt.hist(train["target"], bins=sorted(train["target"].unique()))
# plt.grid()
# plt.xlabel("target")
# plt.show()

# Check Maximum Score

In [None]:
# # https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every `id` in *input_df* to its `point_of_interest` label."""
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {row_id: poi for row_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every `point_of_interest` label to the set of `id`s that carry it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    return {poi: set(ids) for poi, ids in ids_by_poi}

def get_score(input_df: pd.DataFrame, id2poi=None, poi2ids=None):
    """Return the mean Jaccard (intersection-over-union) score of the matches.

    For every row, the predicted ids (the whitespace-separated `matches`
    string) are compared with the true ids that share the row's point of
    interest; the per-row score is ``|targets & preds| / |targets | preds|``.

    Args:
        input_df: DataFrame with an `id` column and a `matches` column.
        id2poi: mapping id -> point of interest. When None, the notebook-level
            global of the same name is used (the original behaviour).
        poi2ids: mapping point of interest -> set of ids. When None, the
            notebook-level global of the same name is used.

    Returns:
        The mean of the per-row scores.

    Raises:
        NameError: if a lookup table is neither passed nor defined globally.
    """
    if id2poi is None or poi2ids is None:
        # Backward-compatible fallback to the module-level lookup tables.
        module_scope = globals()
        try:
            id2poi = module_scope["id2poi"] if id2poi is None else id2poi
            poi2ids = module_scope["poi2ids"] if poi2ids is None else poi2ids
        except KeyError as err:
            raise NameError(
                f"name {err.args[0]!r} is not defined; pass it explicitly"
            ) from None

    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        scores.append(len(targets & preds) / len(targets | preds))
    return np.array(scores).mean()

# id2poi = get_id2poi(train)
# poi2ids = get_poi2ids(train)

In [None]:
# scores = []

# train["matches"] = ""
# for i in tqdm(range(CFG.n_neighbors)):
#     idx = train[CFG.target]==train[f"near_target_{i}"]
#     train.loc[idx, "matches"] += " " + train.loc[idx, f"near_id_{i}"]
#     scores.append(get_score(train))
# train["mathces"] = None

In [None]:
# plt.subplots(figsize=(8, 3), facecolor="white")
# plt.plot(range(CFG.n_neighbors), scores, marker="o")
# plt.grid()
# plt.xlabel("# of candidates")
# plt.ylabel("Maximum Score")
# plt.ylim([0.6, 1.0])
# plt.show()

In [None]:
# del train
# Run a full garbage-collection pass before the feature engineering below.
gc.collect()

# Feature Engineering

In [None]:
# The Levenshtein package is installed here only when running on Google Colab.
if "google.colab" in sys.modules:
    !pip install Levenshtein

In [None]:
%load_ext Cython

In [None]:
%%cython
# Length of the longest common subsequence of S and T, computed with a
# dynamic-programming table (compiled through the %%cython cell magic).
def LCS(str S, str T):
    cdef int i, j
    # dp[a][b] holds the LCS length of the prefixes S[:a] and T[:b];
    # row 0 and column 0 stay 0 (empty prefix).
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            # Best of: the diagonal cell (plus one when the characters match),
            # the left cell and the upper cell. dp[i + 1][j + 1] itself is
            # still 0 at this point, so including it does not change the result.
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

In [None]:
import Levenshtein
import difflib

def _add_distance_features(args):
    """Add string-similarity features between each row and its candidates.

    Worker run by `add_distance_features` on one country group. For every
    neighbour rank ``i`` and text column ``c`` it compares ``near_{c}_0`` with
    ``near_{c}_{i}`` and stores:

    * ``near_{c}_{i}_gesh``  - `difflib.SequenceMatcher` ratio,
    * ``near_{c}_{i}_leven`` - Levenshtein distance,
    * ``near_{c}_{i}_jaro``  - Jaro-Winkler similarity,
    * ``near_{c}_{i}_lcs``   - longest-common-subsequence length,

    and, except for country/phone/zip, the string length plus the
    length-normalised variants ``_nleven``, ``_nlcsi`` and ``_nlcs0``.

    Args:
        args: ``(key, df)`` pair as yielded by ``DataFrame.groupby``; the key
            is ignored.

    Returns:
        The group frame with the feature columns added.
    """
    _, df = args

    columns = ['name', 'address', 'city', 'state',
               'zip', 'country', 'url', 'phone', 'categories']
    # Fields for which no length-normalised features are produced.
    unnormalised = ('country', 'phone', 'zip')

    for i in tqdm(range(CFG.n_neighbors)):
        for c in columns:
            geshs = []
            levens = []
            jaros = []
            lcss = []
            pairs = df[[f"near_{c}_0", f"near_{c}_{i}"]].values.astype(str)
            for str1, str2 in pairs:
                # NOTE: `.astype(str)` above turns NaN into the string 'nan',
                # so this self-equality (NaN) test is always true and the
                # fallback branch is not reached. The comparison is kept as-is
                # so the features stay identical to what the saved models were
                # presumably trained on.
                if str1 == str1 and str2 == str2:
                    geshs.append(difflib.SequenceMatcher(None, str1, str2).ratio())
                    levens.append(Levenshtein.distance(str1, str2))
                    jaros.append(Levenshtein.jaro_winkler(str1, str2))
                    lcss.append(LCS(str(str1), str(str2)))
                else:
                    geshs.append(-1)
                    levens.append(-1)
                    jaros.append(-1)
                    # Keep all four lists the same length; the original
                    # omitted this append, which would have made the column
                    # assignment below fail.
                    lcss.append(-1)
            df[f"near_{c}_{i}_gesh"] = geshs
            df[f"near_{c}_{i}_leven"] = levens
            df[f"near_{c}_{i}_jaro"] = jaros
            df[f"near_{c}_{i}_lcs"] = lcss

            if c not in unnormalised:
                # `near_{c}_0_len` is created on the first pass (i == 0) and
                # reused as the reference length for every later rank.
                df[f"near_{c}_{i}_len"] = df[f"near_{c}_{i}"].astype(str).map(len)
                df[f"near_{c}_{i}_nleven"] = df[f"near_{c}_{i}_leven"] / df[[f"near_{c}_{i}_len", f"near_{c}_0_len"]].max(axis=1)
                df[f"near_{c}_{i}_nlcsi"] = df[f"near_{c}_{i}_lcs"] / df[f"near_{c}_{i}_len"]
                df[f"near_{c}_{i}_nlcs0"] = df[f"near_{c}_{i}_lcs"] / df[f"near_{c}_0_len"]
    return df


def add_distance_features(df):
    """Compute the string-distance features in parallel, one task per country.

    Every country group of *df* is handed to `_add_distance_features` in a
    process pool sized to the CPU count, and the processed groups are
    concatenated in the order the workers finish.
    """
    n_workers = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=n_workers) as pool:
        # Drain the lazy result iterator (through a progress bar) while the
        # pool is still alive.
        finished = list(tqdm(pool.imap_unordered(_add_distance_features, df.groupby('country'))))
    return pd.concat(finished)

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that holds their range.

    Thanks KhanhVD for this helper function:
    https://www.kaggle.com/competitions/foursquare-location-matching/discussion/321520

    Integer columns are narrowed to int8/int16/int32/int64 and float columns to
    float16/float32 (else float64) when the column minimum and maximum lie
    strictly inside the target type's limits. Columns of any other dtype are
    left untouched. Note that narrowing floats reduces their precision.

    Args:
        df: DataFrame to shrink; its columns are replaced in place.
        verbose: when True, print the final memory usage and the reduction.

    Returns:
        The same DataFrame, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Smallest integer type whose limits strictly contain the data;
            # the column keeps its dtype when none qualifies.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for all-NaN columns, whose min/max compare False.
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the percentage against a zero-byte starting size (empty frame).
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

# train = add_distance_features(train)
# Add the string-similarity features for every candidate of every test row.
test = add_distance_features(test)

# Delete Unused Columns (just to avoid OOM)

In [None]:
# Text fields compared between a row and each of its neighbour candidates.
columns = ['name', 'address', 'city', 'state',
           'zip', 'country', 'url', 'phone', 'categories']

# Names of the model input columns, collected neighbour by neighbour.
features = []
for i in tqdm(range(CFG.n_neighbors)):
    features.append(f"d_near_{i}")
    for c in columns:
        features.extend((f"near_{c}_{i}_gesh", f"near_{c}_{i}_jaro", f"near_{c}_{i}_lcs"))
        if c not in ('country', "phone", "zip"):
            # Length-based features exist only for the remaining text fields.
            features.extend((f"near_{c}_{i}_len", f"near_{c}_{i}_nleven",
                             f"near_{c}_{i}_nlcsi", f"near_{c}_{i}_nlcs0"))
        else:
            features.append(f"near_{c}_{i}_leven")

# Any expected feature the test frame lacks is created as an all-NaN column.
for f in features:
#     assert f in train.columns
    if f in test.columns:
        continue
    test[f] = np.nan

# print(features)

In [None]:
# Keep only what the model and the submission need: the feature columns, the
# row id and the candidate ids of every neighbour rank.
# train = train[features + [CFG.target, "target", "id"] + [f"near_id_{i}" for i in range(CFG.n_neighbors)]]
test = test[features + ["id"] + [f"near_id_{i}" for i in range(CFG.n_neighbors)]]

# Store the features as half precision to cut memory usage.
# train[features] = train[features].astype(np.float16)
test[features] = test[features].astype(np.float16)

# train["target"] = train["target"].fillna(0)

# Replace the index with a fresh 0..n-1 range.
# train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

# Several collection passes in a row, to release the dropped columns.
for _ in range(5):
    gc.collect()

# train.info()

In [None]:
# import cudf
# if (CFG.load_train_test_dir!=""): # load precomputed data
#   train = pd.read_csv(f"{CFG.load_train_test_dir}/train.csv")
#   with open(f'{CFG.load_train_test_dir}/id2poi.obj', 'rb') as fp:
#     id2poi = pickle.load(fp)

#   with open(f'{CFG.load_train_test_dir}/poi2ids.obj', 'rb') as fp:
#     poi2ids = pickle.load(fp)

In [None]:
# reduce mems
# train = reduce_mem_usage(train)
# test = reduce_mem_usage(test)

In [None]:
# import pickle
# with open(f'{CFG.save_dir}/id2poi.obj', 'wb') as fp:
#   pickle.dump(id2poi, fp)

# with open(f'{CFG.save_dir}/poi2ids.obj', 'wb') as fp:
#   pickle.dump(poi2ids, fp)

# train.to_csv(f"{CFG.save_dir}/train.csv",encoding='utf-8')

# Split Folds

In [None]:
# kf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
# for i, (trn_idx, val_idx) in tqdm(enumerate(kf.split(train, train["target"], train["target"]))):
#     train.loc[val_idx, "fold"] = i

# Model Learning

In [None]:
import xgboost 

def fit_xgboost(X, y, params=None, es_rounds=20, seed=42, N_SPLITS=5,
                n_class=None, model_dir=None, folds=None):
    """Train (or reload) one XGBoost classifier per fold and collect OOF predictions.

    Args:
        X: feature DataFrame.
        y: integer class labels as a pandas Series aligned with ``X``.
        params: keyword arguments for ``xgboost.XGBClassifier``; ``None`` means
            library defaults.
        es_rounds: early-stopping patience passed to ``fit``.
        seed: unused, kept for backward compatibility (set ``random_state`` in
            ``params`` instead).
        N_SPLITS: fold ids ``0 .. N_SPLITS - 1`` are processed; ids that own no
            validation rows are skipped.
        n_class: number of classes (width of the OOF matrix); inferred as
            ``max(y) + 1`` when ``None``.
        model_dir: when given, no training happens and the fold models are
            loaded from ``{model_dir}/xgboost_fold{i}.json``.
        folds: array-like holding the validation fold id of every row.

    Returns:
        ``(oof, models)``: the out-of-fold class probabilities with shape
        ``(len(y), n_class)`` and the list of per-fold models. Every model is
        also saved to ``{CFG.save_dir}/xgboost_fold{i}.json``.

    Raises:
        ValueError: if ``folds`` is not provided.
    """
    if folds is None:
        raise ValueError("folds must give the validation fold id of every row")
    folds = np.asarray(folds)
    params = {} if params is None else params
    if n_class is None:
        n_class = int(np.max(y)) + 1

    models = []
    oof = np.zeros((len(y), n_class), dtype=np.float64)

    # Honour the N_SPLITS argument (the original looped over CFG.n_splits and
    # ignored it); empty folds are skipped so a too-large value is harmless.
    for i in tqdm(range(N_SPLITS)):
        val_idx = folds == i
        if not val_idx.any():
            continue
        trn_idx = ~val_idx

        print(f"== fold {i} ==")
        # Positional boolean masks for both splits.
        X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

        if model_dir is None:
            model = xgboost.XGBClassifier(**params)
            # NOTE(review): newer XGBoost releases expect early_stopping_rounds
            # and eval_metric on the constructor rather than on fit() — confirm
            # against the installed version before upgrading.
            model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=es_rounds,
                eval_metric='mlogloss',
                verbose=50)
        else:
            model = xgboost.XGBClassifier()
            # Load from the directory the caller asked for (the original read
            # CFG.load_train_test_dir here and used model_dir only as a flag).
            model.load_model(f'{model_dir}/xgboost_fold{i}.json')

        pred = model.predict_proba(X_valid)
        oof[val_idx] = pred
        models.append(model)

        file = f'{CFG.save_dir}/xgboost_fold{i}.json'
        model.save_model(file)
        print()

    cv = (oof.argmax(axis=-1) == y).mean()
    print(f"CV-accuracy: {cv}")

    return oof, models

def inference_xgboost(models, feat_df):
    """Average the class probabilities predicted by every model for *feat_df*."""
    per_model = []
    for model in models:
        per_model.append(model.predict_proba(feat_df))
    # Stack to (n_models, n_rows, n_classes) and average over the model axis.
    return np.array(per_model).mean(axis=0)

In [None]:
# Training hyper-parameters. In this notebook they are referenced only by the
# commented-out fit_xgboost call below; inference uses the saved models.
params = {
    'tree_method':'gpu_hist', # train on gpu
    'predictor' : 'gpu_predictor', # inference on gpu
    'learning_rate': 0.2,
    'random_state': 42,
}

# oof, models = fit_xgboost(train[features], train["target"].astype(int), 
#                        params=params, n_class=int(train["target"].max() + 1), 
#                        N_SPLITS=CFG.n_splits, folds=train["fold"].values,
#                        )

# Load one saved model per fold and average their class probabilities over
# the test rows.
models = []
for i in range(CFG.n_splits):
    model = xgboost.XGBClassifier()
    model.load_model(f'{CFG.load_train_test_dir}/xgboost_fold{i}.json')
    models.append(model)
pred = inference_xgboost(models, test[features])

# Check CV

In [None]:
# near_ids = train[[f"near_id_{i}" for i in range(CFG.n_neighbors)]].values

# matches = []
# for id, ps, ids in tqdm(zip(train["id"], oof, near_ids)):
#     idx = np.argmax(ps)
#     if idx > 0 and ids[idx]==ids[idx]:
#         matches.append(id + " " + ids[idx])
#     else:
#         matches.append(id)
# train["matches"] = matches
# print(f"CV: {get_score(train):.6f}")

In [None]:
# Candidate ids of every test row, one column per neighbour rank.
near_ids = test[[f"near_id_{i}" for i in range(CFG.n_neighbors)]].values

matches = []
# The loop variable is `row_id` rather than `id`, so the builtin id() is not
# shadowed for the rest of the notebook.
for row_id, ps, ids in tqdm(zip(test["id"], pred, near_ids), total=len(test)):
    # Most probable class = rank of the neighbour predicted to match.
    idx = np.argmax(ps)
    # Rank 0 is treated as "no additional match" (neighbour 0 is presumably the
    # row itself); a missing candidate id (padded rank) is skipped as well.
    if idx > 0 and pd.notna(ids[idx]):
        matches.append(row_id + " " + ids[idx])
    else:
        matches.append(row_id)
test["matches"] = matches

# Check Feature Importances

In [None]:
# def plot_importances(models):
#     importance_df = pd.DataFrame(models[0].feature_importances_, 
#                                  index=features, 
#                                  columns=['importance'])\
#                         .sort_values("importance", ascending=False)

#     plt.subplots(figsize=(len(features) // 4, 5))
#     plt.bar(importance_df.index, importance_df.importance)
#     plt.grid()
#     plt.xticks(rotation=90)
#     plt.ylabel("importance")
#     plt.tight_layout()
#     plt.savefig(f'{CFG.save_dir}/feature_importants.png')
#     plt.show()

# plot_importances(models)

# Simple Post-Processing

In [None]:
def postprocess(df):
    """Make the predicted matches symmetric.

    Whenever row A lists id B among its matches, A is appended to B's match
    list unless it is already there. Only the original `matches` strings are
    scanned, so the propagation is a single hop. The `matches` column of *df*
    is overwritten and *df* is returned.
    """
    id2match = dict(zip(df["id"].values, df["matches"].str.split()))

    for raw in tqdm(df["matches"]):
        tokens = raw.split()
        # A single token means the row matched nothing besides itself.
        if len(tokens) != 1:
            base = tokens[0]
            for other in tokens[1:]:
                partners = id2match[other]
                if base not in partners:
                    partners.append(base)

    df["matches"] = df["id"].map(id2match).map(" ".join)
    return df 

# train = postprocess(train)
# Symmetrise the predicted matches of the test rows.
test = postprocess(test)
# print(f"CV: {get_score(train):.6f}")

# Submit

In [None]:
# Start from the ids listed in the sample submission, swap its placeholder
# `matches` column for the predicted one, and write the result.
ssub = (
    pd.read_csv("../input/foursquare-location-matching/sample_submission.csv")
    .drop(columns="matches")
    .merge(test[["id", "matches"]], on="id")
)
ssub.to_csv(f"{CFG.save_dir}/submission.csv", index=False)

ssub.head()