In [1]:
# Flag to force re-downloading the dataset from Kaggle even if a local copy already exists;
# it is passed as `force` to the download call
RELOAD = False

In [2]:
import os

# import Kaggle API to load dataset
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi

# initialize and authenticate the Kaggle API client
api = KaggleApi()
api.authenticate()

# download the competition archive from Kaggle into the data folder;
# force=RELOAD re-downloads even when a local copy already exists
data_path = 'data'
competition_name = 'foursquare-location-matching'
api.competition_download_files(competition_name, data_path, force=RELOAD, quiet=False)
# The archive is saved as '<competition>.zip'. Build the name explicitly instead of
# taking os.listdir(data_path)[0]: listing order is arbitrary, so that picks the wrong
# file as soon as the data folder holds anything else.
dataset_file_name = f'{competition_name}.zip'

foursquare-location-matching.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# import libraries to work with paths and to read zipped file, as was downloaded from Kaggle
from zipfile import ZipFile
# import pandas for EDA
import pandas as pd

# Read the train dataset (train.csv) straight out of the downloaded zip archive into
# a pandas DataFrame named df: it will be used for analysis.
# The context managers close both the archive and the member file once the data is loaded.
with ZipFile(os.path.join(data_path, dataset_file_name)) as archive:
    with archive.open('train.csv') as train_file:
        df = pd.read_csv(train_file)


In [4]:
# Sanity check: show the dimensions (rows, columns) of the loaded frame
print(f'Shape of df: {df.shape}')

Shape of df: (1138812, 13)


In [5]:
# The pair table is the cross product of the sample with itself, so it grows quadratically:
#   10 000 sampled records should give 100 000 000 pairs
#    1 000 sampled records should give   1 000 000 pairs
#
# Timings noted for the iteration approach:
#   100 samples = 25 seconds
#   500 samples = 100 minutes
#
# Timings noted for the merging approach (below):
#   100 samples    = < 1 second
#   500 samples    = < 1 second
#   1000 samples   = 1.5 seconds + 15 seconds to save
#   5000 samples   = 10 seconds product + 1m45s merging + 6 minutes to save (7 Gb file)
#   7500 samples   = 10 seconds product + 8m merging + 15 minutes to save (16 Gb file)
#   10 000 samples = 30 seconds product, then after 5 minutes
#                    "MemoryError: Unable to allocate 17.9 GiB for an array with shape
#                    (24, 100000000) and data type object"
#
# Fixed random_state keeps the sample reproducible between runs.
df_sample = df.sample(n=7500, random_state=42)

In [6]:
from itertools import product

# Build every ordered pair (id_1, id_2) of sampled ids, self-pairs included.
# MultiIndex.from_product produces the pairs in the same order as itertools.product
# (first id varies slowest) without first materialising a Python list holding one
# tuple per pair.
combo_df = pd.MultiIndex.from_product(
    [df_sample['id'], df_sample['id']], names=['id_1', 'id_2']
).to_frame(index=False)

In [7]:
# Number of generated id pairs: expected to be (sample size squared, 2)
combo_df.shape

(56250000, 2)

In [8]:
# Attach the attributes of the first place of every pair (matched through id_1).
merged_df = combo_df.merge(df_sample, left_on='id_1', right_on='id', how='inner')
# Attach the attributes of the second place (matched through id_2). Column names shared by
# both sides receive pandas' default '_x' (first place) / '_y' (second place) suffixes.
df_pairs_custom = merged_df.merge(df_sample, left_on='id_2', right_on='id', how='inner')

# A pair is a match when both places point to the same point of interest.
same_poi = df_pairs_custom['point_of_interest_x'] == df_pairs_custom['point_of_interest_y']

# The helper key columns and the raw point-of-interest columns are not part of the output.
df_pairs_custom = df_pairs_custom.drop(
    columns=['id_1', 'id_2', 'point_of_interest_x', 'point_of_interest_y']
)
df_pairs_custom['match'] = same_poi

# Relabel positionally: the 12 attributes of place 1, the same 12 of place 2, then the label.
place_fields = ['id', 'name', 'latitude', 'longitude', 'address', 'city',
                'state', 'zip', 'country', 'url', 'phone', 'categories']
df_pairs_custom.columns = (
    [f'{field}_{side}' for side in (1, 2) for field in place_fields] + ['match']
)

In [9]:
# Save the pair table as CSV inside the data folder.
# The path is assembled with os.path.join so it resolves to data/pairs_custom.csv on every
# platform; the previous literal '.\\data\\pairs_custom.csv' is a directory path only on Windows.
# NOTE(review): the old timing remark here ("after ~60 minutes only 144092 records are done
# (out of 1000000)") appears to describe an earlier approach — confirm before relying on it.
df_pairs_custom.to_csv(os.path.join(data_path, 'pairs_custom.csv'), index=False)

In [10]:
# Lift the column display limit so every column of the wide pair table is rendered
pd.options.display.max_columns = None
# Preview the first five pairs
df_pairs_custom.head(5)

Unnamed: 0,id_1,name_1,latitude_1,longitude_1,address_1,city_1,state_1,zip_1,country_1,url_1,phone_1,categories_1,id_2,name_2,latitude_2,longitude_2,address_2,city_2,state_2,zip_2,country_2,url_2,phone_2,categories_2,match
0,E_9c455c75918751,Manjabal 2,-6.71209,108.561016,Jalan Karanggetas,Cirebon,Jawa Barat,,ID,,,Indonesian Restaurants,E_9c455c75918751,Manjabal 2,-6.71209,108.561016,Jalan Karanggetas,Cirebon,Jawa Barat,,ID,,,Indonesian Restaurants,True
1,E_f96f973047a099,Rüstem Paşa (Ulu) Camii,40.359018,30.012008,Camikebir Mh. Sakarya Cd.,Osmaneli,Bilecik,,TR,,,"Mosques, Historic Sites",E_9c455c75918751,Manjabal 2,-6.71209,108.561016,Jalan Karanggetas,Cirebon,Jawa Barat,,ID,,,Indonesian Restaurants,False
2,E_b16dafc319e90a,Starbucks,27.870459,-82.76156,10809 Starkey Rd,Largo,FL,33777,US,http://www.starbucks.com/store/1018298,(727) 392-3435,Coffee Shops,E_9c455c75918751,Manjabal 2,-6.71209,108.561016,Jalan Karanggetas,Cirebon,Jawa Barat,,ID,,,Indonesian Restaurants,False
3,E_0be920c4d49631,Refter Sint Tarcisiusinstituut,50.832716,5.102211,,,,,BE,,,"High Schools, Cafeterias",E_9c455c75918751,Manjabal 2,-6.71209,108.561016,Jalan Karanggetas,Cirebon,Jawa Barat,,ID,,,Indonesian Restaurants,False
4,E_10d09c32c73bb3,Wetherspoon Express,51.890014,0.262341,"Satellite 2, near Gate 31",Stansted,Essex,CM24 1QW,GB,http://www.jdwetherspoon.co.uk,1279669040,Pubs,E_9c455c75918751,Manjabal 2,-6.71209,108.561016,Jalan Karanggetas,Cirebon,Jawa Barat,,ID,,,Indonesian Restaurants,False


In [11]:
# Final pair table size: one row per id pair; 12 attributes for each side plus `match`
df_pairs_custom.shape

(56250000, 25)