In [1]:
## Imports
import warnings
#warnings.filterwarnings('ignore')

# import Kaggle API to load dataset
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile

import os
import gc
import string
import time
import random as rnd
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
import numpy as np
#import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Flag to force to reload dataset
RELOAD = False

In [3]:
# initialize Kaggle API
api = KaggleApi()
api.authenticate()

# download dataset from Kaggle to data folder
data_path = 'data'
api.competition_download_files('foursquare-location-matching', data_path, force=RELOAD, quiet=False)
# save filename: !ATTENTION! : it may not be wroking if many files are in folders
# then just name it manually 
dataset_file_name = os.listdir(data_path)[0]

foursquare-location-matching.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
# Read train dataset (train.csv) to pandas DataFrame named df: it will be used for analysis
df = pd.read_csv(ZipFile(os.path.join(data_path, dataset_file_name)).open('train.csv'))

df_pairs = pd.read_csv(ZipFile(os.path.join(data_path, dataset_file_name)).open('pairs.csv'))

# Read test dataset (test.csv), to pandas DataFrame named df_validation. It will be used only to generate final predictions, which will be submitted
df_validation = pd.read_csv(ZipFile(os.path.join(data_path, dataset_file_name)).open('test.csv'))
# finally, we will download example of submission (there are no correct predictions there, it is just an example)
df_subm_example = pd.read_csv(ZipFile(os.path.join(data_path, dataset_file_name)).open('sample_submission.csv'))

In [5]:
SEED = 2022
num_neighbors = 20
num_split = 5
feat_columns = ['name', 'address', 'city', 
            'state', 'zip', 'url', 
           'phone', 'categories', 'country']

vec_columns = ['name', 'address', 'city', 'state', 'zip', 'country', 'categories']

def seed_everything(seed):
    rnd.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(SEED)

In [6]:
# Check, that all dataframes are loaded and have correct shapes
print(f'Shape of df: {str(df.shape)}')
print(f'Shape of df_pairs: {str(df_pairs.shape)}')
print(f'Shape of df_validation: {str(df_validation.shape)}')
print(f'Shape of df_subm_example: {str(df_subm_example.shape)}')

Shape of df: (1138812, 13)
Shape of df_pairs: (578907, 25)
Shape of df_validation: (5, 12)
Shape of df_subm_example: (5, 2)


Some ideas and helper functions are from https://www.kaggle.com/code/guoyonfan/training-data-for-binary-lgb-baseline-0-834

Dataset is quiet big, so we can split it to train-test-validation datasets in 2/5-2/5-1/5. df_validation, provided by Kaggle will be ignored in this notebook

In [7]:
df_train, df_test = train_test_split(df, test_size=0.6)
df_test, df_val  = train_test_split(df_test, test_size=(1/3))
print(f'Shape of df_train: {str(df_train.shape)}')
print(f'Shape of df_test: {str(df_test.shape)}')
print(f'Shape of df_val: {str(df_val.shape)}')

Shape of df_train: (455524, 13)
Shape of df_test: (455525, 13)
Shape of df_val: (227763, 13)


Create a pipeline for data preparation

In [8]:
def clean_string(text):
    text = ''.join([word for word in text if word not in string.punctuation])
    text = text.lower()

    return text

In [9]:
class ColDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop=['phone', 'url']):
        self.columns_to_drop = columns_to_drop
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(self.columns_to_drop, axis=1)

In [10]:
class CleanString(BaseEstimator, TransformerMixin):
    def __init__(self, coulums_to_clean=['name', 'address', 'city', 'state', 'zip', 'country', 'categories']):
        self.columns_to_clean = coulums_to_clean
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for col in self.columns_to_clean:
            X[f'{col}_clean']=X[col].map(clean_string, na_action='ignore')

        return X

In [11]:
class VecString(BaseEstimator, TransformerMixin):
    def __init__(self, coulums_to_vec=['name', 'address', 'city', 'state', 'zip', 'country', 'categories']):
        self.coulums_to_vec = coulums_to_vec
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for col in self.coulums_to_vec:
            tfidf = TfidfVectorizer(max_features=40)
            tv_fit = tfidf.fit_transform(X[f'{col}_clean'].fillna('nan'))
            X[f'{col}_vec'] = list(tv_fit.toarray())

        return X

In [12]:
pipe = Pipeline([
    ('dropper1', ColDropper()),
    ('cleaner', CleanString()),
    ('vector', VecString()),
    ('dropper2', ColDropper(columns_to_drop=['name', 'address', 'city', 'state', 'zip', 'country', 'categories', ]))
])

Train (and also test set, which will be used for final evaluation) set is too large to combine all entries with each other and check if it refers to the same POI or not.
Therefore perform unsupervised KNN by geo location, address and name

In [13]:
def recall_knn(df, Neighbors = 20):   
    train_df = []
    knn = NearestNeighbors(n_neighbors = Neighbors)
    knn.fit(df[['latitude','longitude']])
    dists, nears = knn.kneighbors(df[['latitude','longitude']])
    
    for k in range(Neighbors):            
        cur_df = df[['id']].copy()
        cur_df['match_id'] = df['id'].values[nears[:, k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    
    train_df = pd.concat(train_df)
    
    return train_df

In [14]:
def recall_knn_text(df, Neighbors = 5):   
    train_df = []
    knn = NearestNeighbors(n_neighbors = Neighbors)
    knn.fit(df['name_vec'])
    dists, nears = knn.kneighbors(df['name_vec'])
    
    for k in range(Neighbors):            
        cur_df = df[['id']].copy()
        cur_df['match_id'] = df['id'].values[nears[:, k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    
    train_df = pd.concat(train_df)
    
    return train_df

In [27]:
df_prep['name_vec']

1055750    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
780238     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
506192     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
520679     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1063974    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                                 ...                        
439985     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
103024     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
147629     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
787676     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
349053     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: name_vec, Length: 455524, dtype: object

In [22]:
knn = NearestNeighbors(n_neighbors = 5)
knn.fit(df_prep['name_vec'])

ValueError: setting an array element with a sequence.

In [20]:
recall_knn_text(df_prep)

ValueError: setting an array element with a sequence.

In [15]:
class Knn_geo(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = recall_knn(X)

        return X

In [16]:
class Knn_text(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = recall_knn_text(X)

        return X

In [17]:
df_prep = pipe.fit_transform(df_train)

In [18]:
knn = Knn_text()
df_gg = knn.fit_transform(df_prep)

ValueError: setting an array element with a sequence.

In [72]:
df_gg

Unnamed: 0,id,match_id,kdist,kneighbors
1055750,E_ed3ebf16f35c42,E_ed3ebf16f35c42,0.000000,0
780238,E_af3ef9bee43f40,E_af3ef9bee43f40,0.000000,0
506192,E_71cea3ad594515,E_71cea3ad594515,0.000000,0
520679,E_750514b6e6e017,E_750514b6e6e017,0.000000,0
1063974,E_ef1fe13019c915,E_ef1fe13019c915,0.000000,0
...,...,...,...,...
439985,E_62e1a6591b01a6,E_b9907cab18946a,0.027444,19
103024,E_1717c95d5362f8,E_d543a545d2a00d,0.001729,19
147629,E_2119fa3c018934,E_3d7d3339546df6,0.007228,19
787676,E_b0ef314bd0fd21,E_60d747b2ef252f,0.213763,19


In [73]:
df_train[df_train['id'].isin(['E_4e7036834a8133', 'E_e062dace84aeed'])]

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest
998866,E_e062dace84aeed,ProStor,50.388474,30.495232,"Голосіївський проспект, 106/2",Київ,м. Київ,3127,UA,http://prostor.ua,800600200,Cosmetics Shops,P_1820c0bcc6de60
349053,E_4e7036834a8133,Велика Кишеня,50.396529,30.504347,"вул. Васильківська, 8",Київ,м. Київ,3040,UA,http://kishenya.ua,380800501505,Supermarkets,P_78b4061776e49a


In [13]:
%load_ext Cython

In [14]:
%%cython
def LCS(str S, str T):
    cdef int i, j
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

In [15]:
def add_features(df):    
    for col in tqdm(feat_columns):       
        if col in vec_columns:
            tv_fit = tfidf_d[col]
            indexs = [id2index_d[i] for i in df['id']]
            match_indexs = [id2index_d[i] for i in df['match_id']]                    
            df[f'{col}_sim'] = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1).A.ravel()
        
        col_values = data.loc[df['id']][col].values.astype(str)
        matcol_values = data.loc[df['match_id']][col].values.astype(str)
        
        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            if s != 'nan' and match_s != 'nan':                    
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)
        
        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss
        
        if col not in ['phone', 'zip']:
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, matcol_values)) 
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            df[f'{col}_nleven'] = df[f'{col}_leven'] / \
                                    df[[f'{col}_len', f'match_{col}_len']].max(axis = 1)
            
            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / df[f'match_{col}_len']
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / df[f'{col}_len']
            
            df = df.drop(f'{col}_len', axis = 1)
            df = df.drop(f'match_{col}_len', axis = 1)
            gc.collect()
            
    return df