In [1]:
## Imports
import warnings
#warnings.filterwarnings('ignore')

# import Kaggle API to load dataset
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile

import os
import gc
import string
import time
import random as rnd
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
import numpy as np
#import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Flag to force to reload dataset
RELOAD = False

In [3]:
# initialize Kaggle API
api = KaggleApi()
api.authenticate()

# download dataset from Kaggle to data folder
data_path = 'data'
api.competition_download_files('foursquare-location-matching', data_path, force=RELOAD, quiet=False)
# save filename: !ATTENTION! : it may not be wroking if many files are in folders
# then just name it manually 
dataset_file_name = os.listdir(data_path)[0]

foursquare-location-matching.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
# Read train dataset (train.csv) to pandas DataFrame named df: it will be used for analysis
df = pd.read_csv(ZipFile(os.path.join(data_path, dataset_file_name)).open('train.csv'))

df_pairs = pd.read_csv(ZipFile(os.path.join(data_path, dataset_file_name)).open('pairs.csv'))

# Read test dataset (test.csv), to pandas DataFrame named df_validation. It will be used only to generate final predictions, which will be submitted
df_validation = pd.read_csv(ZipFile(os.path.join(data_path, dataset_file_name)).open('test.csv'))
# finally, we will download example of submission (there are no correct predictions there, it is just an example)
df_subm_example = pd.read_csv(ZipFile(os.path.join(data_path, dataset_file_name)).open('sample_submission.csv'))

In [5]:
SEED = 2022
num_neighbors = 20
num_split = 5
feat_columns = ['name', 'address', 'city', 
            'state', 'zip', 'url', 
           'phone', 'categories', 'country']

vec_columns = ['name', 'address', 'city', 'state', 'zip', 'country', 'categories']

def seed_everything(seed):
    rnd.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(SEED)

In [6]:
# Check, that all dataframes are loaded and have correct shapes
print(f'Shape of df: {str(df.shape)}')
print(f'Shape of df_pairs: {str(df_pairs.shape)}')
print(f'Shape of df_validation: {str(df_validation.shape)}')
print(f'Shape of df_subm_example: {str(df_subm_example.shape)}')

Shape of df: (1138812, 13)
Shape of df_pairs: (578907, 25)
Shape of df_validation: (5, 12)
Shape of df_subm_example: (5, 2)


Some ideas and helper functions are from https://www.kaggle.com/code/guoyonfan/training-data-for-binary-lgb-baseline-0-834

Dataset is quiet big, so we can split it to train-test-validation datasets in 2/5-2/5-1/5. df_validation, provided by Kaggle will be ignored in this notebook

In [7]:
df_train, df_test = train_test_split(df, test_size=0.6)
df_test, df_val  = train_test_split(df_test, test_size=(1/3))
print(f'Shape of df_train: {str(df_train.shape)}')
print(f'Shape of df_test: {str(df_test.shape)}')
print(f'Shape of df_val: {str(df_val.shape)}')

Shape of df_train: (455524, 13)
Shape of df_test: (455525, 13)
Shape of df_val: (227763, 13)


Create a pipeline for data preparation

In [8]:
def clean_string(text):
    # zip field, sometimes is read as float
    if not isinstance(text,str):
        text = str(int(text))
    text = ''.join([word for word in text if word not in string.punctuation])
    text = text.lower()

    return text

In [9]:
class ColDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop=['phone', 'url']):
        self.columns_to_drop = columns_to_drop
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(self.columns_to_drop, axis=1)

In [10]:
class CleanString(BaseEstimator, TransformerMixin):
    def __init__(self, coulums_to_clean=['name', 'address', 'city', 'state', 'zip', 'country', 'categories']):
        self.columns_to_clean = coulums_to_clean
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for col in self.columns_to_clean:
            X[f'{col}_clean']=X[col].map(clean_string, na_action='ignore')

        return X

In [11]:
class VecString(BaseEstimator, TransformerMixin):
    def __init__(self, coulums_to_vec=['name', 'address', 'city', 'state', 'zip', 'country', 'categories']):
        self.coulums_to_vec = coulums_to_vec
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for col in self.coulums_to_vec:
            tfidf = TfidfVectorizer(max_features=40)
            tv_fit = tfidf.fit_transform(X[f'{col}_clean'].fillna('nan'))
            X[f'{col}_vec'] = list(tv_fit.toarray())

        return X

Train (and also test set, which will be used for final evaluation) set is too large to combine all entries with each other and check if it refers to the same POI or not.
Therefore perform unsupervised KNN by geo location, address and name

In [12]:
def recall_knn(df, Neighbors = 5):   
    neighbors = min(len(df), Neighbors)
    train_df = []
    knn = NearestNeighbors(n_neighbors = neighbors)
    knn.fit(df[['latitude','longitude']])
    dists, nears = knn.kneighbors(df[['latitude','longitude']])
    
    for k in range(neighbors):            
        cur_df = df[['id']].copy()
        cur_df['match_id'] = df['id'].values[nears[:, k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    
    train_df = pd.concat(train_df)
    
    return train_df

In [13]:
def recall_knn_text(df, Neighbors = 5):   
    train_df = []
    knn = NearestNeighbors(n_neighbors = Neighbors)
    knn.fit(df['name_vec'])
    dists, nears = knn.kneighbors(df['name_vec'])
    
    for k in range(Neighbors):            
        cur_df = df[['id']].copy()
        cur_df['match_id'] = df['id'].values[nears[:, k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    
    train_df = pd.concat(train_df)
    
    return train_df

In [14]:
class Knn_geo(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = recall_knn(X)

        return X

In [15]:
class Knn_text(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = recall_knn_text(X)

        return X

In [16]:
def combine_records(df):  
    
    df_knn = recall_knn(df)
      
    merged_df = df_knn.merge(df, how='inner', left_on='id', right_on='id')
    df_pairs_custom = merged_df.merge(df, how='inner', left_on='match_id', right_on='id')
    df_pairs_custom.drop(['match_id'], axis=1, inplace=True)
    train = ('point_of_interest' in df.columns)
    if train:
        df_pairs_custom['match'] = df_pairs_custom['point_of_interest_x'] == df_pairs_custom['point_of_interest_y']
        df_pairs_custom.drop(['point_of_interest_x', 'point_of_interest_y'], axis=1, inplace=True)

    # df_pairs_custom.drop(['name_clean_x', 'address_clean_x', 'city_clean_x', 'state_clean_x', 'zip_clean_x', 'country_clean_x', 'categories_clean_x',
    #                         'name_clean_y', 'address_clean_y', 'city_clean_y', 'state_clean_y', 'zip_clean_y', 'country_clean_y', 'categories_clean_y'], axis=1, inplace=True)

    # columns = ['id_1', 'geo_k_dist', 'geo_k_neigh', 'latitude_1', 'longitude_1', 'name_1', 'address_1', 'city_1', 'state_1', 'zip_1', 'country_1', 'categories_1',
    #                                 'name_vec_1', 'address_vec_1', 'city_vec_1', 'state_vec_1', 'zip_vec_1', 'country_vec_1', 'categories_vec_1',
    #                                 'id_2', 'latitude_2', 'longitude_2', 'name_2', 'address_2', 'city_2', 'state_2', 'zip_2', 'country_2', 'categories_2',
    #                                 'name_vec_2', 'address_vec_2', 'city_vec_2', 'state_vec_2', 'zip_vec_2', 'country_vec_2', 'categories_vec_2']

    columns = ['id_1', 'geo_k_dist', 'geo_k_neigh', 'latitude_1', 'longitude_1', 'name_1', 'address_1', 'city_1', 'state_1', 'zip_1', 'country_1', 'categories_1',
                                    'id_2', 'latitude_2', 'longitude_2', 'name_2', 'address_2', 'city_2', 'state_2', 'zip_2', 'country_2', 'categories_2']

    if train: columns.append('match')
    df_pairs_custom.columns=columns

    return df_pairs_custom

In [17]:
class CombinePairs(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = combine_records(X)

        return X

In [18]:
%load_ext Cython

In [19]:
%%cython
def LCS(str S, str T):
    cdef int i, j
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

In [20]:
def cosine_sim (vec1, vec2):
    vec1 = vec1.reshape(1, -1)
    vec2 = vec2.reshape(1, -1)

    return cosine_similarity(vec1, vec2)[0][0]

In [21]:
def add_features(df, str_cols=['name', 'address', 'city', 'state', 'zip', 'country', 'categories']):
    df_new = df.copy()
    for col in str_cols:
        # Add string distances to df
        df_new[f'{col}_lev'] = df_new.apply(lambda x: Levenshtein.distance(str(x[f'{col}_1']), str(x[f'{col}_2'])), axis=1)
        df_new[f'{col}_jaro'] = df_new.apply(lambda x: Levenshtein.jaro_winkler(str(x[f'{col}_1']), str(x[f'{col}_2'])), axis=1)
        df_new[f'{col}_seq_match'] = df_new.apply(lambda x: difflib.SequenceMatcher(None, str(x[f'{col}_1']), str(x[f'{col}_2'])).ratio(), axis=1)
        df_new[f'{col}_lcs'] = df_new.apply(lambda x: LCS(str(x[f'{col}_1']), str(x[f'{col}_2'])), axis=1)

        # Vector distances
        #df_new[f'{col}_cos_sim'] = df_new.apply(lambda x: cosine_sim(x[f'{col}_vec_1'], x[f'{col}_vec_2']), axis=1)

    # Drop unnecessary columns
    # df_new.drop(['latitude_1', 'longitude_1', 'name_1', 'address_1', 'city_1', 'state_1', 'zip_1', 'country_1', 'categories_1', 
    #             'name_vec_1', 'address_vec_1', 'city_vec_1', 'state_vec_1', 'zip_vec_1', 'country_vec_1', 'categories_vec_1', 
    #             'latitude_2', 'longitude_2', 'name_2', 'address_2', 'city_2', 'state_2', 'zip_2', 'country_2', 'categories_2', 
    #             'name_vec_2', 'address_vec_2', 'city_vec_2', 'state_vec_2', 'zip_vec_2', 'country_vec_2', 'categories_vec_2'], axis=1, inplace=True)

    df_new.drop(['latitude_1', 'longitude_1', 'name_1', 'address_1', 'city_1', 'state_1', 'zip_1', 'country_1', 'categories_1',  
                'latitude_2', 'longitude_2', 'name_2', 'address_2', 'city_2', 'state_2', 'zip_2', 'country_2', 'categories_2'], axis=1, inplace=True)                

    return df_new

In [22]:
class AddFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = add_features(X)

        return X

In [23]:
pipe = Pipeline([
    ('dropper1', ColDropper()),
    ('cleaner', CleanString()),
    #('vector', VecString()),
    ('dropper2', ColDropper(columns_to_drop=['name', 'address', 'city', 'state', 'zip', 'country', 'categories'])),
    ('combinator', CombinePairs()),
    ('add_features', AddFeatures())
])

In [24]:
df_train_prep = pipe.fit_transform(df_train)
df_train_prep.head()

Unnamed: 0,id_1,geo_k_dist,geo_k_neigh,id_2,match,name_lev,name_jaro,name_seq_match,name_lcs,address_lev,...,zip_seq_match,zip_lcs,country_lev,country_jaro,country_seq_match,country_lcs,categories_lev,categories_jaro,categories_seq_match,categories_lcs
0,E_ed3ebf16f35c42,0.0,0,E_ed3ebf16f35c42,True,0,1.0,1.0,23,0,...,1.0,4,0,1.0,1.0,2,0,1.0,1.0,7
1,E_8c6f6bb7818f9f,1.03604,3,E_ed3ebf16f35c42,False,18,0.60398,0.378378,8,12,...,0.0,0,0,1.0,1.0,2,7,0.0,0.0,0
2,E_010b0c730f3f37,1.37766,4,E_ed3ebf16f35c42,False,18,0.504047,0.176471,5,14,...,0.0,0,0,1.0,1.0,2,8,0.417989,0.125,1
3,E_de36268a1b5cfb,0.019107,2,E_ed3ebf16f35c42,False,25,0.611736,0.339623,9,3,...,0.5,2,0,1.0,1.0,2,9,0.448413,0.210526,3
4,E_d1694abd2949b2,0.005195,3,E_ed3ebf16f35c42,False,19,0.407469,0.388889,7,14,...,0.0,0,0,1.0,1.0,2,14,0.423903,0.25,4


In [25]:
df_test_prep = pipe.fit_transform(df_test)
df_test_prep.head()

Unnamed: 0,id_1,geo_k_dist,geo_k_neigh,id_2,match,name_lev,name_jaro,name_seq_match,name_lcs,address_lev,...,zip_seq_match,zip_lcs,country_lev,country_jaro,country_seq_match,country_lcs,categories_lev,categories_jaro,categories_seq_match,categories_lcs
0,E_e1c1e1ccc57cd6,0.0,0,E_e1c1e1ccc57cd6,True,0,1.0,1.0,9,0,...,1.0,3,0,1.0,1.0,2,0,1.0,1.0,20
1,E_c74bdc3b3c735f,0.001259,2,E_e1c1e1ccc57cd6,False,7,0.518519,0.266667,2,3,...,1.0,3,0,1.0,1.0,2,16,0.519444,0.228571,5
2,E_e1c1e1ccc57cd6,0.001156,1,E_2ff76cd6dd345a,False,9,0.433333,0.210526,2,3,...,1.0,3,0,1.0,1.0,2,6,0.736988,0.761905,16
3,E_5397628843ad6a,0.000235,1,E_2ff76cd6dd345a,False,13,0.547222,0.230769,3,0,...,1.0,3,0,1.0,1.0,2,16,0.572872,0.333333,7
4,E_c74bdc3b3c735f,0.001464,3,E_2ff76cd6dd345a,False,9,0.422222,0.125,1,0,...,1.0,3,0,1.0,1.0,2,18,0.465657,0.216216,4


In [26]:
df_val_prep = pipe.fit_transform(df_val)
df_val_prep.head()

Unnamed: 0,id_1,geo_k_dist,geo_k_neigh,id_2,match,name_lev,name_jaro,name_seq_match,name_lcs,address_lev,...,zip_seq_match,zip_lcs,country_lev,country_jaro,country_seq_match,country_lcs,categories_lev,categories_jaro,categories_seq_match,categories_lcs
0,E_3a797712ed240d,0.0,0,E_3a797712ed240d,True,0,1.0,1.0,40,0,...,1.0,5,0,1.0,1.0,2,0,1.0,1.0,18
1,E_6467f7e0ae52de,0.015526,4,E_3a797712ed240d,False,33,0.4,0.214286,8,18,...,1.0,5,0,1.0,1.0,2,16,0.435185,0.272727,3
2,E_8a83270d4fe433,0.012368,3,E_3a797712ed240d,False,37,0.348148,0.081633,3,13,...,1.0,5,0,1.0,1.0,2,12,0.523148,0.470588,8
3,E_f2351063814e92,0.013139,3,E_3a797712ed240d,False,33,0.541532,0.20339,9,13,...,1.0,5,0,1.0,1.0,2,32,0.523403,0.266667,11
4,E_7c18753a42b8a1,0.004868,3,E_3a797712ed240d,False,34,0.465476,0.185185,7,9,...,1.0,5,0,1.0,1.0,2,15,0.575926,0.30303,5


In [27]:
# To test how it works with final validation without POI
df_validation_prep = pipe.fit_transform(df_validation)
df_validation_prep.head()

Unnamed: 0,id_1,geo_k_dist,geo_k_neigh,id_2,name_lev,name_jaro,name_seq_match,name_lcs,address_lev,address_jaro,...,zip_seq_match,zip_lcs,country_lev,country_jaro,country_seq_match,country_lcs,categories_lev,categories_jaro,categories_seq_match,categories_lcs
0,E_00001118ad0191,0.0,0,E_00001118ad0191,0,1.0,1.0,23,0,1.0,...,1.0,3,0,1.0,1.0,2,0,1.0,1.0,5
1,E_000020eb6fed40,184.531619,2,E_00001118ad0191,18,0.546552,0.352941,6,12,0.470085,...,0.0,0,2,0.0,0.0,0,3,0.633333,0.444444,2
2,E_00002f98667edf,74.052538,1,E_00001118ad0191,21,0.431159,0.148148,2,16,0.0,...,1.0,3,2,0.0,0.0,0,3,0.633333,0.444444,2
3,E_001b6bad66eb98,9.086563,2,E_00001118ad0191,29,0.587754,0.354839,12,0,1.0,...,1.0,3,2,0.0,0.0,0,6,0.55,0.153846,2
4,E_0283d9f61e569d,9.090103,2,E_00001118ad0191,21,0.587819,0.382979,10,25,0.59127,...,0.0,0,2,0.0,0.0,0,12,0.344444,0.1,3


In [29]:
if not os.path.exists('.\\data\\custom_prep\\'):
    os.makedirs('.\\data\\custom_prep\\')

In [30]:
df_train_prep.to_csv('.\\data\\custom_prep\\train.csv', index=False)

In [31]:
df_test_prep.to_csv('.\\data\\custom_prep\\test.csv', index=False)

In [32]:
df_val_prep.to_csv('.\\data\\custom_prep\\val.csv', index=False)

In [33]:
df_validation_prep.to_csv('.\\data\\custom_prep\\final_validation.csv', index=False)