In [1]:
## Imports
import warnings
#warnings.filterwarnings('ignore')

# import Kaggle API to load dataset
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile

import os
import gc
import string
import time
import random as rnd
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
import numpy as np
#import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set to True to force the dataset to be downloaded again; the value is passed
# as `force=` to the Kaggle download call in the next cell.
RELOAD = False

In [3]:
# Initialize the Kaggle API client and authenticate
api = KaggleApi()
api.authenticate()

# Download the competition files from Kaggle into the data folder
# (RELOAD is forwarded as `force=`)
data_path = 'data'
api.competition_download_files('foursquare-location-matching', data_path, force=RELOAD, quiet=False)
# Name of the downloaded archive inside data_path.
# ATTENTION: this hard-coded name may not work if the download produces several
# files or folders; in that case set the name manually.
dataset_file_name = "foursquare-location-matching.zip"

foursquare-location-matching.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
def _read_csv_member(archive, member_name):
    """Read one CSV member of an already-open ZipFile into a DataFrame.

    The member stream is closed as soon as pandas has finished parsing it.
    """
    with archive.open(member_name) as member:
        return pd.read_csv(member)


# Open the competition archive once; the `with` block guarantees the archive
# handle is closed after all four tables have been loaded.
with ZipFile(os.path.join(data_path, dataset_file_name)) as archive:
    # train.csv -> df: used for analysis
    df = _read_csv_member(archive, 'train.csv')

    # pairs.csv -> df_pairs
    df_pairs = _read_csv_member(archive, 'pairs.csv')

    # test.csv -> df_validation: used only to generate the final predictions
    # that will be submitted
    df_validation = _read_csv_member(archive, 'test.csv')

    # sample_submission.csv -> df_subm_example: an example of the submission
    # format only (it contains no correct predictions)
    df_subm_example = _read_csv_member(archive, 'sample_submission.csv')

In [5]:
# --- Notebook-wide configuration ---
SEED = 2022
num_neighbors = 20
num_split = 5

feat_columns = [
    'name',
    'address',
    'city',
    'state',
    'zip',
    'url',
    'phone',
    'categories',
    'country',
]

vec_columns = [
    'name',
    'address',
    'city',
    'state',
    'zip',
    'country',
    'categories',
]


def seed_everything(seed):
    """Seed the `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value handed to `random.seed` and `numpy.random.seed`; its string form
        is stored in the PYTHONHASHSEED environment variable.
    """
    for seeder in (rnd.seed, np.random.seed):
        seeder(seed)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment is only seen by child processes, not by the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(SEED)

In [6]:
# Sanity check: every frame is loaded; report its (rows, columns) shape.
# An f-string placeholder already renders the tuple with str(), so no explicit
# conversion is needed.
print(f'Shape of df: {df.shape}')
print(f'Shape of df_pairs: {df_pairs.shape}')
print(f'Shape of df_validation: {df_validation.shape}')
print(f'Shape of df_subm_example: {df_subm_example.shape}')

Shape of df: (1138812, 13)
Shape of df_pairs: (578907, 25)
Shape of df_validation: (5, 12)
Shape of df_subm_example: (5, 2)


In [7]:
# Every id referenced by a pair: all of id_1 first, followed by all of id_2
# (duplicates are still present at this point).
all_ids = [*df_pairs['id_1'], *df_pairs['id_2']]

In [8]:
# Number of ids collected from id_1 and id_2, duplicates included
len(all_ids)

1157814

In [9]:
# Drop repeated ids while keeping the order of first appearance
# (dict keys are unique and preserve insertion order).
all_ids = [*dict.fromkeys(all_ids)]

In [10]:
# Number of distinct ids left after de-duplication
len(all_ids)

1008661

In [11]:
# Keep only the pairs flagged as a match, reduced to the two id columns.
# Rows and columns are selected in a single .loc call.
df_out = df_pairs.loc[df_pairs['match'], ['id_1', 'id_2']]

In [12]:
# (rows, columns) of the matched-pairs frame
df_out.shape

(398786, 2)

In [13]:
# Collect, for each id_1, the list of id_2 values it is matched with;
# the result has one row per id_1 and a list-valued 'matches' column.
df_subm = (
    df_out
    .groupby('id_1')['id_2']
    .apply(list)
    .reset_index(name='matches')
)

In [14]:
def convert(lst):
    """Return the strings in `lst` joined into one space-separated string."""
    separator = ' '
    return separator.join(lst)

# Flatten each list of matches into a single space-separated string column,
# then remove the intermediate list column in place.
df_subm['match_id'] = df_subm['matches'].apply(convert)
df_subm.drop(columns='matches', inplace=True)

In [15]:
# Display the frame: one row per id_1 with its space-separated matches in match_id
df_subm

Unnamed: 0,id_1,match_id
0,E_000001272c6c5d,E_da7fa3963561f8
1,E_000023d8f4be44,E_12453effe251db
2,E_00007dcd2bb53f,E_f131dcb7f07be9
3,E_0000c566a81ea1,E_8d58f3151bae83
4,E_0000d9e584ed9f,E_caad79f6ed7c44
...,...,...
314980,E_ffff2b8abf31ab,E_3798ed1302222c
314981,E_ffff7b1a22e81b,E_fb8ac113943b2f
314982,E_ffff80f94b2fee,E_75feaa1e0321cc
314983,E_ffff989ae206f8,E_a5bc397a4eaeff


In [16]:
# Save the frame into the data folder. The path is built with os.path.join so it
# works on every OS: a literal '.\\data\\...' is only a directory path on
# Windows, while on POSIX systems the backslashes become part of the file name.
df_subm.to_csv(os.path.join(data_path, 'test_my_submission.csv'), index=False, doublequote=False)