In [1]:
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile

import os
import string
import random as rnd
import Levenshtein
import difflib
import pandas as pd
import numpy as np
#import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set to True to force re-downloading the competition archive even when a local
# copy already exists (the value is passed as `force=` to the Kaggle download call).
RELOAD = False

In [3]:
# Initialize the Kaggle API client and authenticate.
# NOTE(review): authenticate() presumably relies on locally configured Kaggle
# credentials — confirm they are set up before running this cell.
api = KaggleApi()
api.authenticate()

# Download the competition dataset from Kaggle into the data folder
data_path = 'data'
api.competition_download_files('foursquare-location-matching', data_path, force=RELOAD, quiet=False)
# Name of the downloaded archive. ATTENTION: this hard-coded name may not work if
# the download produces several files or folders; in that case set it manually.
dataset_file_name = 'foursquare-location-matching.zip'

foursquare-location-matching.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
# Load every CSV from the competition archive. The archive is opened once inside a
# `with` block so the zip handle and each member stream are closed as soon as the
# data has been read, instead of leaving several open ZipFile objects behind.
with ZipFile(os.path.join(data_path, dataset_file_name)) as dataset_zip:
    # Train dataset (train.csv) -> df: used for analysis
    with dataset_zip.open('train.csv') as csv_file:
        df = pd.read_csv(csv_file)

    # Pairs file (pairs.csv) -> df_pairs
    with dataset_zip.open('pairs.csv') as csv_file:
        df_pairs = pd.read_csv(csv_file)

    # Test dataset (test.csv) -> df_validation: used only to generate the final
    # predictions that will be submitted
    with dataset_zip.open('test.csv') as csv_file:
        df_validation = pd.read_csv(csv_file)

    # Example submission (sample_submission.csv) -> df_subm_example: it contains
    # no correct predictions, it only shows the expected format
    with dataset_zip.open('sample_submission.csv') as csv_file:
        df_subm_example = pd.read_csv(csv_file)

In [5]:
# Global configuration for the notebook.
SEED = 2022  # value handed to seed_everything() at the end of this cell
# NOTE(review): num_neighbors, num_split, feat_columns and vec_columns are not
# referenced by any other cell visible in this notebook (recall_knn falls back to
# its own default of 5 neighbours) — confirm whether they are still needed.
num_neighbors = 20
num_split = 5
feat_columns = ['name', 'address', 'city', 
            'state', 'zip', 'url', 
           'phone', 'categories', 'country']

vec_columns = ['name', 'address', 'city', 'state', 'zip', 'country', 'categories']

def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Also exports ``seed`` as the ``PYTHONHASHSEED`` environment variable.
    NOTE: per the Python documentation that variable is read at interpreter
    start-up, so it should only influence processes launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (rnd.seed, np.random.seed):
        seed_fn(seed)
    
# Seed the random number generators once, using the notebook-wide SEED.
seed_everything(SEED)

In [6]:
# Sanity check: every frame is loaded and has the expected shape
for frame_name, frame in (
    ('df', df),
    ('df_pairs', df_pairs),
    ('df_validation', df_validation),
    ('df_subm_example', df_subm_example),
):
    print(f'Shape of {frame_name}: {str(frame.shape)}')

Shape of df: (1138812, 13)
Shape of df_pairs: (578907, 25)
Shape of df_validation: (5, 12)
Shape of df_subm_example: (5, 2)


In [7]:
def clean_string(text):
    """Normalise one field value: stringify, strip ASCII punctuation, lower-case.

    Non-string input (the zip field is sometimes read as a float) is first
    converted with ``int()`` and rendered as its decimal string.
    """
    as_text = text if isinstance(text, str) else str(int(text))
    # Deleting every character listed in string.punctuation in one pass
    without_punctuation = as_text.translate(str.maketrans('', '', string.punctuation))
    return without_punctuation.lower()

In [8]:
class ColDropper(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns_to_drop : list of column labels or None, default=None
        Columns removed by ``transform``. ``None`` means the default set
        ``['phone', 'url']``. A ``None`` sentinel is used instead of a list
        literal so instances never share one mutable default object.
    """

    # Columns dropped when ``columns_to_drop`` is left as None.
    _DEFAULT_COLUMNS = ('phone', 'url')

    def __init__(self, columns_to_drop=None):
        # Stored untouched; the default is resolved lazily in transform().
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the configured columns."""
        if self.columns_to_drop is None:
            columns = list(self._DEFAULT_COLUMNS)
        else:
            columns = self.columns_to_drop
        return X.drop(columns, axis=1)

In [9]:
class CleanString(BaseEstimator, TransformerMixin):
    """Pipeline step that adds a ``<col>_clean`` column for each configured text column.

    Parameters
    ----------
    coulums_to_clean : list of column labels or None, default=None
        Columns to normalise with ``clean_string``. ``None`` means the default
        set: name, address, city, state, zip, country, categories.
        (The misspelled parameter name is kept for backward compatibility.)
    """

    # Columns cleaned when ``coulums_to_clean`` is left as None.
    _DEFAULT_COLUMNS = ('name', 'address', 'city', 'state', 'zip', 'country', 'categories')

    def __init__(self, coulums_to_clean=None):
        # The attribute must carry the exact parameter name, otherwise
        # BaseEstimator.get_params() (and therefore clone() and repr()) cannot find it.
        self.coulums_to_clean = coulums_to_clean

    @property
    def columns_to_clean(self):
        """Effective list of columns to clean (correctly spelled alias)."""
        if self.coulums_to_clean is None:
            return list(self._DEFAULT_COLUMNS)
        return self.coulums_to_clean

    @columns_to_clean.setter
    def columns_to_clean(self, value):
        self.coulums_to_clean = value

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame with the cleaned columns appended; ``X`` is left untouched.

        Missing values are skipped (``na_action='ignore'``) and therefore stay missing.
        """
        cleaned = {
            f'{col}_clean': X[col].map(clean_string, na_action='ignore')
            for col in self.columns_to_clean
        }
        # assign() builds a new DataFrame instead of writing into the caller's object
        return X.assign(**cleaned)

In [10]:
def recall_knn(df, Neighbors = 5):
    """Build candidate pairs from each record's nearest neighbours by coordinates.

    A ``NearestNeighbors`` model (library default metric) is fitted on the
    ``latitude``/``longitude`` columns of ``df`` and queried with the same rows.

    Returns a DataFrame with one row per (record, neighbour rank) and columns:
    ``id`` (the record), ``match_id`` (id of the neighbour), ``kdist`` (distance
    reported by the model) and ``kneighbors`` (0-based neighbour rank). The
    per-rank frames are concatenated, so the original index labels repeat.
    The number of neighbours is capped at ``len(df)``.
    """
    n_candidates = min(len(df), Neighbors)
    coordinates = df[['latitude', 'longitude']]

    model = NearestNeighbors(n_neighbors=n_candidates)
    model.fit(coordinates)
    distances, indices = model.kneighbors(coordinates)

    record_ids = df['id'].values
    per_rank_frames = [
        df[['id']].assign(
            match_id=record_ids[indices[:, rank]],
            kdist=distances[:, rank],
            kneighbors=rank,
        )
        for rank in range(n_candidates)
    ]

    return pd.concat(per_rank_frames)

In [11]:
class Knn_geo(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces the input frame with its ``recall_knn`` candidate table."""

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``recall_knn(X)`` (default number of neighbours)."""
        return recall_knn(X)

In [12]:
def combine_records(df):
    """Turn single place records into candidate pairs with both records' attributes.

    Candidate pairs come from ``recall_knn(df)``. The attributes of the first
    record are attached via ``id`` and those of the second via ``match_id``.
    When ``df`` has a ``point_of_interest`` column (train data), a boolean
    ``match`` column is added that is True when both records share it.

    NOTE: the final rename is positional. It assumes ``df`` holds exactly
    ``id``, ``latitude``, ``longitude`` and seven text columns in the order
    name, address, city, state, zip, country, categories (plus, optionally,
    ``point_of_interest`` last) — verify against the caller when changing the pipeline.
    """
    candidates = recall_knn(df)
    # Train data carries the ground-truth grouping column
    is_train = 'point_of_interest' in df.columns

    with_first = candidates.merge(df, how='inner', on='id')
    pairs = with_first.merge(df, how='inner', left_on='match_id', right_on='id')
    pairs = pairs.drop(columns=['match_id'])

    if is_train:
        pairs['match'] = pairs['point_of_interest_x'] == pairs['point_of_interest_y']
        pairs = pairs.drop(columns=['point_of_interest_x', 'point_of_interest_y'])

    text_fields = ['name', 'address', 'city', 'state', 'zip', 'country', 'categories']
    first_columns = ['id_1', 'geo_k_dist', 'geo_k_neigh', 'latitude_1', 'longitude_1']
    first_columns += [f'{field}_1' for field in text_fields]
    second_columns = ['id_2', 'latitude_2', 'longitude_2']
    second_columns += [f'{field}_2' for field in text_fields]

    new_names = first_columns + second_columns
    if is_train:
        new_names.append('match')
    pairs.columns = new_names

    return pairs

In [13]:
class CombinePairs(BaseEstimator, TransformerMixin):
    """Pipeline step that converts place records into candidate pairs via ``combine_records``."""

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``combine_records(X)``."""
        return combine_records(X)

In [14]:
# Enable the %%cython cell magic used by the next cell.
%load_ext Cython

In [15]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of S and T."""
    cdef int row, col
    cdef int n_rows = len(S)
    cdef int n_cols = len(T)
    # table[r][c] = LCS length of the prefixes S[:r] and T[:c]
    cdef list table = [[0] * (n_cols + 1) for _ in range(n_rows + 1)]
    for row in range(1, n_rows + 1):
        for col in range(1, n_cols + 1):
            if S[row - 1] == T[col - 1]:
                # Matching characters extend the best result of both shorter prefixes
                table[row][col] = table[row - 1][col - 1] + 1
            else:
                table[row][col] = max(table[row - 1][col], table[row][col - 1])
    return table[n_rows][n_cols]

In [16]:
def cosine_sim (vec1, vec2):
    """Return the cosine similarity of two vectors as a single scalar.

    Both inputs are reshaped to one row each before being passed to
    ``cosine_similarity``; they must therefore provide ``reshape``.
    """
    row_a, row_b = vec1.reshape(1, -1), vec2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    # The result is a 1x1 matrix; unwrap its only entry
    return similarity_matrix[0][0]

In [17]:
def add_features(df, str_cols=('name', 'address', 'city', 'state', 'zip', 'country', 'categories')):
    """Add string-similarity features for every ``<col>_1`` / ``<col>_2`` pair of columns.

    For each column name in ``str_cols`` four features are appended:
    ``<col>_lev`` (Levenshtein distance), ``<col>_jaro`` (Jaro-Winkler),
    ``<col>_seq_match`` (``difflib.SequenceMatcher`` ratio) and ``<col>_lcs``
    (longest common subsequence length). Values are passed through ``str()``
    first, so missing values are compared as their string representation.

    The raw coordinate and text columns of both records are dropped from the
    result. The input frame is not modified; a new DataFrame is returned.
    """
    df_new = df.copy()
    for col in str_cols:
        # Stringify each side once and walk the two columns in parallel. This yields
        # the same values as a row-wise DataFrame.apply but avoids building a Series
        # per row for every single feature.
        left_values = [str(value) for value in df_new[f'{col}_1']]
        right_values = [str(value) for value in df_new[f'{col}_2']]
        value_pairs = list(zip(left_values, right_values))

        # String distances
        df_new[f'{col}_lev'] = [Levenshtein.distance(a, b) for a, b in value_pairs]
        df_new[f'{col}_jaro'] = [Levenshtein.jaro_winkler(a, b) for a, b in value_pairs]
        df_new[f'{col}_seq_match'] = [difflib.SequenceMatcher(None, a, b).ratio() for a, b in value_pairs]
        df_new[f'{col}_lcs'] = [LCS(a, b) for a, b in value_pairs]

        # Vector (cosine similarity) features are currently disabled.

    # Drop the raw inputs: only ids, kNN columns, the label (if any) and the new features remain
    raw_columns = ['latitude_1', 'longitude_1', 'name_1', 'address_1', 'city_1', 'state_1', 'zip_1', 'country_1', 'categories_1',
                   'latitude_2', 'longitude_2', 'name_2', 'address_2', 'city_2', 'state_2', 'zip_2', 'country_2', 'categories_2']
    df_new.drop(raw_columns, axis=1, inplace=True)

    return df_new

In [18]:
class AddFeatures(BaseEstimator, TransformerMixin):
    """Pipeline step that appends pairwise string-similarity features via ``add_features``."""

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``add_features(X)`` (default column set)."""
        return add_features(X)

In [19]:
# End-to-end preprocessing: raw place records in, pairwise similarity features out.
pipe = Pipeline([
    ('dropper1', ColDropper()),  # default drop list: 'phone', 'url'
    ('cleaner', CleanString()),  # adds a '<col>_clean' column per text field
    #('vector', VecString()),
    # Remove the raw text columns so only their '<col>_clean' versions remain
    ('dropper2', ColDropper(columns_to_drop=['name', 'address', 'city', 'state', 'zip', 'country', 'categories'])),
    ('combinator', CombinePairs()),  # candidate pairs from coordinate nearest neighbours
    ('add_features', AddFeatures())  # string-distance features for every pair
])

In [20]:
from catboost import CatBoostClassifier

In [21]:
# Load a previously saved CatBoost classifier from the local file 'catboost2406'.
# NOTE(review): presumably trained elsewhere on features produced by the same
# pipeline and scaler — confirm before reusing with a changed pipeline.
cat_model = CatBoostClassifier()
cat_model.load_model('catboost2406')

<catboost.core.CatBoostClassifier at 0x2de1c0b6140>

In [22]:
# Run the full preprocessing pipeline on the test records (df_validation).
# Every step's fit() is a no-op, so this is effectively a pure transform.
df_prep = pipe.fit_transform(df_validation)

In [23]:
# The pair identifiers are not model features; keep them in df_prep and
# work on a copy without them.
df_prep_no_ids = df_prep.drop(['id_1', 'id_2'], axis=1)

In [24]:
from pickle import load

In [25]:
# Load the fitted scaler. The file is opened in a `with` block so the handle is
# closed right after unpickling.
# NOTE: unpickling can execute arbitrary code — only load scaler.pkl from a trusted source.
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = load(scaler_file)

In [26]:
# The pipeline adds a 'match' label column when its input contains
# 'point_of_interest' (i.e. when it is run on train data for testing).
# The label is not a model feature, so remove it if present.
if 'match' in df_prep_no_ids.columns:
    df_prep_no_ids = df_prep_no_ids.drop(columns=['match'])

In [27]:
# Apply the previously fitted scaler to the feature columns.
# NOTE(review): assumes the columns match, in name and order, those the scaler was fitted on — confirm.
df_prep_no_ids_scaled = scaler.transform(df_prep_no_ids)

In [28]:
# Predict a class label for every candidate pair.
y_pred = cat_model.predict(df_prep_no_ids_scaled)

In [29]:
# Convert the model output to a boolean flag per candidate pair.
# NOTE(review): the comparison is against the string 'True', so this assumes the
# model returns string class labels — confirm; with boolean or integer labels
# presumably no pair would be flagged.
df_prep['match'] = (y_pred == 'True')

In [30]:
# Keep only the pairs flagged as matches, reduced to their two ids.
df_out = df_prep[df_prep['match']][['id_1', 'id_2']]

In [31]:
# Collect, for every first id, the list of ids it was matched with.
# NOTE(review): ids without any pair flagged as a match do not appear in
# df_subm — confirm that the submission format tolerates missing ids.
df_subm = df_out.groupby('id_1')['id_2'].apply(list).reset_index(name='matches')

In [32]:
def convert(lst):
    """Join the strings in ``lst`` into one space-separated string."""
    separator = ' '
    return separator.join(lst)

# Collapse each list of matched ids into one space-separated string, then keep
# only the two columns of the submission: 'id' and 'matches'.
df_subm['match_id'] = df_subm['matches'].map(convert)
df_subm = df_subm.drop(columns=['matches'])

df_subm.columns = ['id', 'matches']

In [33]:
# Write the submission into the data folder. The path is built with os.path.join
# so it resolves to <data_path>/test_my_submission.csv on every operating system;
# a literal backslash-separated path only denotes a sub-directory on Windows.
df_subm.to_csv(os.path.join(data_path, 'test_my_submission.csv'), index=False, doublequote=False)