In [6]:
import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
path_parent = os.path.dirname(os.getcwd())
data_path = os.path.join(path_parent, 'src/data/LocalBusiness')

In [16]:
file = '20-12-21_ClusterFile_withCleaning'

df_in = pd.read_json(os.path.join(data_path, file), compression='gzip', orient='records', lines=True)
df_copy = df_in.copy()

In [18]:
df = df_in[df_in['clusterNtables']>=8]
df.drop(columns=['clusterNtables', 'tableNclusters'], inplace=True)

In [19]:
tmp = pd.DataFrame(df.groupby(['table_id'])['cluster_id'].nunique()).reset_index()
tmp.columns = ['table_id','tableNclusters']
df = df.merge(tmp, on='table_id', how='left').sort_values(by='tableNclusters', ascending=False)

tmp = pd.DataFrame(df.groupby(['cluster_id'])['table_id'].nunique()).reset_index()
tmp.columns = ['cluster_id','clusterNtables']
df = df.merge(tmp, on='cluster_id', how='left')

In [20]:
print(df['table_id'].nunique())
print(df['cluster_id'].nunique())

3647
6432


In [21]:
df['cluster_id'] = df.groupby('cluster_id').ngroup()
df.sort_values('cluster_id').head()

Unnamed: 0,index,row_id,table_id,name,address,page_url,telephone,addressregion,streetaddress,addresslocality,addresscountry,longitude,latitude,phone_object,E.164 format,telephoneNorm,telephone_,cluster_id,tableNclusters,clusterNtables
110005,15695.0,73,LocalBusiness_visitthy.com_September2020.json.gz,Brænderiet Limfjorden,"{'postalcode': '7870', 'addresscountry': 'DK',...",https://www.visitthy.com/thy/plan-your-trip/br...,+45 40 94 00 00,,SUNDSØREVEJ 4 SUNDSØRE HAVN,ROSLEV,DK,9.17325139045715,56.7080333615004,"{'country_code': 45, 'extension': None, 'natio...",4540940000.0,4540940000,,0,1,11
104579,3565030.0,12,LocalBusiness_danishfjordholiday.com_September...,Brænderiet Limfjorden - distillery,"{'streetaddress': 'Sundsørevej 4 null', 'posta...",https://www.danishfjordholiday.com/fjord-holid...,40940000,,SUNDSØREVEJ 4 NULL,ROSLEV,DK,9.17321563,56.70806848,"{'country_code': 45, 'extension': None, 'natio...",4540940000.0,4540940000,40940000.0,0,1,11
104580,3182177.0,61,LocalBusiness_urlaubamlimfjord.de_September202...,Brænderiet Limfjorden,"{'streetaddress': 'Sundsørevej 4 null', 'posta...",https://www.urlaubamlimfjord.de/limfjord/der-g...,40940000,,SUNDSØREVEJ 4 NULL,ROSLEV,DK,9.17324066162109,56.7083690397555,"{'country_code': 45, 'extension': None, 'natio...",4540940000.0,4540940000,40940000.0,0,1,11
104581,1594242.0,4,LocalBusiness_visitmors.com_September2020.json.gz,Brænderiet Limfjorden,"{'streetaddress': 'Sundsørevej 4 null', 'posta...",https://www.visitmors.com/mors/plan-your-trip/...,40940000,,SUNDSØREVEJ 4 NULL,ROSLEV,DK,9.17324066162109,56.7083690397555,"{'country_code': 45, 'extension': None, 'natio...",4540940000.0,4540940000,40940000.0,0,1,11
104589,3565074.0,56,LocalBusiness_danishfjordholiday.com_September...,Brænderiet Limfjorden,"{'streetaddress': 'Sundsørevej 4 null', 'posta...",https://www.danishfjordholiday.com/fjord-holid...,40940000,,SUNDSØREVEJ 4 NULL,ROSLEV,DK,9.17324066162109,56.7083690397555,"{'country_code': 45, 'extension': None, 'natio...",4540940000.0,4540940000,40940000.0,0,1,11


### Table Selection

In [20]:
# Based on some preliminary tests, my idea for table & entity selection is as follows:

# - start with top 30 tables with largets 'tableNclusters' as candidates; in each iteration, randomly select 500 entities 
#   calculate the number of cluster occurences for each table; save the results
# - redo the above step for N=10000 times, and calculate the mean of cluster occurences for each table
# - select the 15 tables with largest mean occurence. The underlying assumption is that, if a table is more likely to 
#   cover some randomly selected entities, it should have a higher probability of including the 500 selected entities
#   in the end.
# - do the above random sampling again on entities and select the 500 entities that are most likely to be included in the
#   15 tables selected from the last step.

In [23]:
# np.random.seed(42)
# table_idx = np.random.randint(0, ntables, 15)   # 15 is the number of selected tables
# table_candidates = tables[table_idx]

# cluster_idx = np.random.randint(0, ntables, 500)   # 500 is the number of selected clusters
# cluster_candidates = clusters[cluster_idx]

In [24]:
# initialize a dictionary to store cluster frequency
# cluster_fre = {key: 0 for key in range(ncluster)}

# for t in table_candidates:
#     clusters = df[df['table_id']==t]['cluster_id'].unique()
#     for c in clusters:
#         cluster_fre[c] += 1
        
# results =pd.DataFrame.from_dict(cluster_fre, orient='index', columns=['frequency']).sort_values('frequency', ascending=False)

In [25]:
# np.random.seed(42)

# sampling_tables = 50
# sampling_times = 100

# tables = np.array(df['table_id'].unique())[:sampling_tables]   
# clusters = np.array(df['cluster_id'].unique())

# array1 = np.ndarray((sampling_times, sampling_tables))
# for i in range(sampling_times):             # number of sampling attempts
#     table_candidates = tables
#     cluster_candidates = np.random.choice(clusters, 500, replace=False)   # 500 is the number of selected clusters

#     list_frequency =[]
#     for t in table_candidates:
#         l1 = set(df[df['table_id']==t].cluster_id.unique())  # clusters that are included in the table  
#         l2 = set(cluster_candidates)                         # choosen clusters

#         list_frequency.append(len(l1.intersection(l2)))      
    
#     #print(list_frequency)
#     #if mean(list_frequency) > 200:
#     #    break
#     array2 = np.append(array1, list_frequency).reshape(-1,sampling_tables)

In [26]:
# print(np.mean(array2, axis=0))

[1.13861386 1.02970297 0.77227723 0.67326733 0.65346535 0.3960396
 0.27722772 0.2970297  0.24752475 0.26732673 0.24752475 0.22772277
 0.16831683 0.18811881 0.20792079 0.20792079 0.16831683 0.11881188
 0.0990099  0.0990099  0.06930693 0.06930693 0.06930693 0.01980198
 0.10891089 0.11881188 0.10891089 0.0990099  0.02970297 0.08910891
 0.08910891 0.07920792 0.05940594 0.08910891 0.03960396 0.07920792
 0.0990099  0.02970297 0.04950495 0.05940594 0.01980198 0.03960396
 0.04950495 0.03960396 0.04950495 0.02970297 0.04950495 0.04950495
 0.05940594 0.01980198]


In [59]:
# idx = np.argpartition(np.mean(array2, axis=0), -30)[-30:]     # first narrow the tables to 30
# tables_selected = tables[idx]
# tables_selected

array(['Hotel_ihg.com_September2020.json.gz',
       'LocalBusiness_usharbors.com_September2020.json.gz',
       'Restaurant_ubereats.com_September2020.json.gz',
       'LocalBusiness_homify.com.co_September2020.json.gz',
       'LocalBusiness_acushops.com_September2020.json.gz',
       'LocalBusiness_saabshops.com_September2020.json.gz',
       'LocalBusiness_visitdenmark.com_September2020.json.gz',
       'LocalBusiness_homify.com.ar_September2020.json.gz',
       'Restaurant_opentable.com.au_September2020.json.gz',
       'LocalBusiness_infinitishops.com_September2020.json.gz',
       'Restaurant_theinfatuation.com_September2020.json.gz',
       'LocalBusiness_exoticautoshops.com_September2020.json.gz',
       'Hotel_marriott.co.uk_September2020.json.gz',
       'LocalBusiness_lexrepairshops.com_September2020.json.gz',
       'LocalBusiness_jagshops.com_September2020.json.gz',
       'LocalBusiness_homify.com.my_September2020.json.gz',
       'LocalBusiness_homify.com_September2020.

### Entity Selection

In [13]:
# select the 15 tables with largest mean occurence (15 is the number of selected tables)
# then do the above random sampling again on entities and select the 500 entity with largest mean inclusion

In [28]:
# # initialize a dictionary to store cluster frequency
# cluster_fre = {key: 0 for key in range(1, ncluster+1)}

# for t in tables_selected:
#     clusters = df[df['table_id']==t]['cluster_id'].unique()
#     for c in clusters:
#         cluster_fre[c] += 1

In [30]:
# cluster_fre
# results =pd.DataFrame.from_dict(cluster_fre, orient='index', columns=['frequency']).sort_values('frequency', ascending=False)

# clusters_selected = results[:500].index
# clusters_selected

Int64Index([2053, 9886, 9600, 2715, 2731, 9769, 8711, 8824, 6284, 3536,
            ...
            5505, 8073, 5518, 6017, 5728, 9129, 8647, 1848, 3304, 2251],
           dtype='int64', length=500)

In [34]:
# tmp = df.set_index('cluster_id').loc[clusters_selected]

In [63]:
# tables_selected2 = tables_selected[[4,5,9,11,13,14, 17, 18,20,21,22,23, 24, 25]]
# tables_selected2

array(['LocalBusiness_acushops.com_September2020.json.gz',
       'LocalBusiness_saabshops.com_September2020.json.gz',
       'LocalBusiness_infinitishops.com_September2020.json.gz',
       'LocalBusiness_exoticautoshops.com_September2020.json.gz',
       'LocalBusiness_lexrepairshops.com_September2020.json.gz',
       'LocalBusiness_jagshops.com_September2020.json.gz',
       'LocalBusiness_lrshops.com_September2020.json.gz',
       'LocalBusiness_volvomechanics.com_September2020.json.gz',
       'LocalBusiness_pcarshops.com_September2020.json.gz',
       'LocalBusiness_vcarshops.com_September2020.json.gz',
       'LocalBusiness_minirepairshops.com_September2020.json.gz',
       'LocalBusiness_benzshops.com_September2020.json.gz',
       'LocalBusiness_fourringsrepair.com_September2020.json.gz',
       'LocalBusiness_bimmershops.com_September2020.json.gz'],
      dtype=object)

In [64]:
# tmp['1']=tmp.reset_index().apply(lambda x: x['table_id'] in tables_selected2, axis=1)
# tmp2=tmp[tmp['1']==True].sort_values('clusterNtables')

# Splitting 

In [22]:
nclusters = df['cluster_id'].nunique()
tables = df['table_id'].unique()
ntables = df['table_id'].nunique()

In [23]:
# create a dictionary where the keys are table_id and values are lists of cluster_id
results = {}
for t in tables:
    results[t] = []

In [24]:
for c in range(nclusters):
    tables_c = df[df['cluster_id']==c]['table_id'].unique()
    for t in tables_c:
        results[t].append(c)

In [25]:
results2 = {}
for t in tables:
    results2[t] = np.zeros(nclusters)
    idxes = results[t]
    results2[t][idxes] = 1

In [26]:
overview = pd.DataFrame.from_dict(results2).to_numpy()
overview  # axis-1: table_ids (ordered as in the series tables); axis-0: cluster_ids 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
ntables

3647

In [35]:
# there are 1041 unique tables in total and assign the tables to 3 groups (3:2:3)
# number of tables in the train set: 391
# number of tables in the validation set: 260
# number of tables in the test set: 390

ntrain_tables = np.int(np.round(ntables * 0.375,0))
nval_tables = np.int(np.round(ntables * 0.25,0))
ntest_tables = ntables-ntrain_tables-nval_tables

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ntrain_tables = np.int(np.round(ntables * 0.375,0))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  nval_tables = np.int(np.round(ntables * 0.25,0))


### Train/Val/Test splitng

In [33]:
# target arrays
ttrain = np.sum(overview, axis=1) * 0.375
tval = np.sum(overview, axis=1) * 0.25
ttest = np.sum(overview, axis=1) * 0.375

In [37]:
np.random.seed(42)

# initialize array/list to store results
N = 10000
loss_scores = np.zeros(N)
splits = []

for n in range(N):  
    # 1) randomly select tables to form train/val/test
    # sampling for train
    candidates0 = np.arange(ntables)
    train_tables = np.random.choice(candidates0, ntrain_tables, replace=False)
    # sampling for val 
    candidates1 = np.delete(candidates0, train_tables)
    val_tables = np.random.choice(candidates1, nval_tables, replace=False)
    # (sampling) for test
    test_tables =  np.delete(candidates0, np.append(train_tables,val_tables))

    # 2) compute the realized target array
    rtrain = np.sum(overview[:, train_tables], axis=1)
    rval = np.sum(overview[:, val_tables], axis=1)
    rtest = np.sum(overview[:, test_tables], axis=1)
    
    # try best to ensure that each cluster has at least one representative in each group, 
    # penalty if one cluster is not included
    train_zeros = len(np.where(rtrain==0)[0])
    val_zeros = len(np.where(rval==0)[0])
    test_zeros = len(np.where(rtest==0)[0])
           # penalty for deviation from the target array
    loss = np.linalg.norm(rtrain - ttrain) + np.linalg.norm(rval - tval) + np.linalg.norm(rtest - ttest) + \
           train_zeros + val_zeros + test_zeros
    
    # save the results
    loss_scores[n] = loss
    splits.append((train_tables, val_tables, test_tables))
               
    # 3) repeat the above procedure N times, and select the splitting strategy with least loss
    n += 1

In [38]:
# i-th iteration with minimum loss
idx_minloss = np.where(loss_scores==np.min(loss_scores))[0][0]
print(idx_minloss)
print(loss_scores[idx_minloss])

# splitting strategy
splitting_minloss = splits[idx_minloss]
selected_train_tables = tables[splitting_minloss[0]]
selected_val_tables = tables[splitting_minloss[1]]
selected_test_tables = tables[splitting_minloss[2]]

5034
525.0793183040627


In [42]:
print('training set:')
print(tables[splitting_minloss[0]])

training set:
['LocalBusiness_seoshouston.com_September2020.json.gz'
 'Hotel_reservehotel.ir_September2020.json.gz'
 'LocalBusiness_bridaland.net_September2020.json.gz' ...
 'LocalBusiness_gosonderborg.com_September2020.json.gz'
 'LocalBusiness_visittampabay.com_September2020.json.gz'
 'LocalBusiness_websolute.org_September2020.json.gz']


In [45]:
print('validation set:')
print(tables[splitting_minloss[1]])

validation set:
['LocalBusiness_innova-ms.com_September2020.json.gz'
 'LocalBusiness_11880-heizung.com_September2020.json.gz'
 'Restaurant_viamichelin.co.uk_September2020.json.gz'
 'Hotel_appartementsbudapest.hu_September2020.json.gz'
 'LocalBusiness_thai2siam.com_September2020.json.gz'
 'LocalBusiness_lawnmowingnearme.net.au_September2020.json.gz'
 'Hotel_metropolitanhoteltelaviv.com_September2020.json.gz'
 'Restaurant_opentable.com_September2020.json.gz'
 'LocalBusiness_mobilbaru.co.id_September2020.json.gz'
 'Hotel_hotelesenbudapest.hu_September2020.json.gz'
 'LocalBusiness_directoryvilla.org_September2020.json.gz'
 'LocalBusiness_officialdirectory.co_September2020.json.gz'
 'LocalBusiness_sup-ausbildung.de_September2020.json.gz'
 'LocalBusiness_sup-sportverein.de_September2020.json.gz'
 'LocalBusiness_buyersofnewyork.com_September2020.json.gz'
 'LocalBusiness_buddylinks.org_September2020.json.gz'
 'Hotel_iransafer.com_September2020.json.gz'
 'LocalBusiness_liposuctionofnyc.com_Sept

In [44]:
print('test set:')
print(tables[splitting_minloss[2]])

test set:
['LocalBusiness_homify.com.co_September2020.json.gz'
 'LocalBusiness_homify.com.eg_September2020.json.gz'
 'LocalBusiness_homify.co.kr_September2020.json.gz' ...
 'LocalBusiness_kobodder.dk_September2020.json.gz'
 'LocalBusiness_sugarhillautocollision.com_September2020.json.gz'
 'Hotel_jadranhotelzagreb.com_September2020.json.gz']


In [47]:
splitting3_path = path_parent + "/src/data/LocalBusiness/Splitting_12.20/Train_Validation_Test"

df_train = pd.DataFrame(tables[splitting_minloss[0]])
df_train.to_csv(os.path.join(splitting3_path, 'training tables.csv'))

df_val = pd.DataFrame(tables[splitting_minloss[1]])
df_val.to_csv(os.path.join(splitting3_path, 'validation tables.csv'))

df_test = pd.DataFrame(tables[splitting_minloss[2]])
df_test.to_csv(os.path.join(splitting3_path, 'testing tables.csv'))

### Train/Test splitting

In [48]:
# there are 1041 unique tables in total and assign the tables to 3 groups (3:2:3)
# number of tables in the train set: 391
# number of tables in the validation set: 260
# number of tables in the test set: 390

ntrain_tables =  np.int(np.round(ntables * 0.727,0))
ntest_tables = ntables - ntrain_tables

# target arrays
ttrain = np.sum(overview, axis=1) * 0.727
ttest = np.sum(overview, axis=1) * 0.273

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ntrain_tables =  np.int(np.round(ntables * 0.727,0))


In [50]:
np.random.seed(42)

# initialize array/list to store results
N = 10000
loss_scores = np.zeros(N)
splits = []

for n in range(N):  
    # 1) randomly select tables to form train/val/test
    # sampling for train
    candidates0 = np.arange(ntables)
    train_tables = np.random.choice(candidates0, ntrain_tables, replace=False)
    # (sampling) for test
    test_tables =  np.delete(candidates0, train_tables)

    # 2) compute the realized target array
    rtrain = np.sum(overview[:, train_tables], axis=1)
    rtest = np.sum(overview[:, test_tables], axis=1)
    
    # try best to ensure that each cluster has at least one representative in each group, 
    # penalty if one cluster is not included
    train_zeros = len(np.where(rtrain==0)[0])
    test_zeros = len(np.where(rtest==0)[0])
           # penalty for deviation from the target array
    loss = np.linalg.norm(rtrain - ttrain) + np.linalg.norm(rtest - ttest) + \
           train_zeros + test_zeros
    
    # save the results
    loss_scores[n] = loss
    splits.append((train_tables, test_tables))
               
    # 3) repeat the above procedure N times, and select the splitting strategy with least loss
    n += 1

In [51]:
# i-th iteration with minimum loss
idx_minloss = np.where(loss_scores==np.min(loss_scores))[0][0]
print(idx_minloss)
print(loss_scores[idx_minloss])

# splitting strategy
splitting_minloss = splits[idx_minloss]
selected_train_tables = tables[splitting_minloss[0]]
selected_test_tables = tables[splitting_minloss[1]]

5736
278.7165174044451


In [52]:
tables[splitting_minloss[0]]

array(['LocalBusiness_uniquewebsitez.com_September2020.json.gz',
       'LocalBusiness_blsj.com_September2020.json.gz',
       'LocalBusiness_lagoslife.com.ng_September2020.json.gz', ...,
       'LocalBusiness_nephrocare.ch_September2020.json.gz',
       'LocalBusiness_santanvalleybusinesses.com_September2020.json.gz',
       'Hotel_bestwestern.nl_September2020.json.gz'], dtype=object)

In [53]:
np.where(np.sum(overview[:, splitting_minloss[0]], axis=1)==0)[0]

array([], dtype=int64)

In [2]:
len(np.where(np.sum(overview[:, splitting_minloss[1]], axis=1)==0)[0])

NameError: name 'np' is not defined

In [56]:
print('training set:')
print(tables[splitting_minloss[0]])

training set:
['LocalBusiness_uniquewebsitez.com_September2020.json.gz'
 'LocalBusiness_blsj.com_September2020.json.gz'
 'LocalBusiness_lagoslife.com.ng_September2020.json.gz' ...
 'LocalBusiness_nephrocare.ch_September2020.json.gz'
 'LocalBusiness_santanvalleybusinesses.com_September2020.json.gz'
 'Hotel_bestwestern.nl_September2020.json.gz']


In [57]:
print('test set:')
print(tables[splitting_minloss[1]])

test set:
['LocalBusiness_homify.ua_September2020.json.gz'
 'LocalBusiness_homify.com.ar_September2020.json.gz'
 'LocalBusiness_homify.com.ve_September2020.json.gz'
 'LocalBusiness_homify.es_September2020.json.gz'
 'LocalBusiness_homify.in_September2020.json.gz'
 'LocalBusiness_homify.com.mx_September2020.json.gz'
 'LocalBusiness_homify.ca_September2020.json.gz'
 'LocalBusiness_homify.de_September2020.json.gz'
 'LocalBusiness_homify.tw_September2020.json.gz'
 'LocalBusiness_bimmershops.com_September2020.json.gz'
 'LocalBusiness_benzshops.com_September2020.json.gz'
 'LocalBusiness_homify.ae_September2020.json.gz'
 'LocalBusiness_topsee.us_September2020.json.gz'
 'LocalBusiness_volvomechanics.com_September2020.json.gz'
 'LocalBusiness_homify.pk_September2020.json.gz'
 'LocalBusiness_homify.pe_September2020.json.gz'
 'Hotel_kayak.com_September2020.json.gz'
 'LocalBusiness_weebo.us_September2020.json.gz'
 'Hotel_marriott.com.au_September2020.json.gz'
 'LocalBusiness_bizmarks.org_September2

In [58]:
splitting2_path = path_parent + "/src/data/LocalBusiness/Splitting_12.20/Train_Test"

df_train = pd.DataFrame(tables[splitting_minloss[0]])
df_train.to_csv(os.path.join(splitting2_path, 'training tables_v2.csv'))

df_test = pd.DataFrame(tables[splitting_minloss[1]])
df_test.to_csv(os.path.join(splitting2_path, 'testing tables_v2.csv'))

### checking for size of tables

In [43]:
source_path = r"C:\Users\chench10\Downloads\-" 
file = 'New_Concatenated_MatchingFile'

df_raw = pd.read_json(os.path.join(source_path, file), compression='gzip', orient='records', lines=True)

In [52]:
tmp = df_raw[df_raw['origin'].isin(tables.tolist())]

In [64]:
tmp.groupby('origin').size().sort_values(ascending=False)[:325]

origin
Hotel_kayak.com_September2020.json.gz                         29547
LocalBusiness_2findlocal.com_September2020.json.gz            28918
LocalBusiness_birdeye.com_September2020.json.gz               18489
Restaurant_ubereats.com_September2020.json.gz                 11630
Restaurant_slicelife.com_September2020.json.gz                11136
                                                              ...  
LocalBusiness_prontowebdir.com_September2020.json.gz             53
Hotel_dayuse.nl_September2020.json.gz                            53
LocalBusiness_pearlsoftheweb.org_September2020.json.gz           52
Restaurant_visitaarhusregion.com_September2020.json.gz           51
LocalBusiness_geoparkvestjylland.com_September2020.json.gz       51
Length: 325, dtype: int64

In [1]:
df.groupby('table_id').size().sort_values(ascending=False)[:75]

NameError: name 'df' is not defined

In [68]:
df[['table_id','tableNclusters']].drop_duplicates()

Unnamed: 0,table_id,tableNclusters
0,LocalBusiness_bimmershops.com_September2020.js...,456
471,LocalBusiness_fourringsrepair.com_September202...,441
916,LocalBusiness_vcarshops.com_September2020.json.gz,433
1351,LocalBusiness_benzshops.com_September2020.json.gz,408
1767,LocalBusiness_pcarshops.com_September2020.json.gz,385
...,...,...
17922,LocalBusiness_integrityroofingllc.com_Septembe...,1
17923,LocalBusiness_swabpro.ca_September2020.json.gz,1
17931,LocalBusiness_mymelanatedbeautiful.com_Septemb...,1
17932,LocalBusiness_business-meets-spirit.de_Septemb...,1


In [69]:
tmp[tmp['origin']=='Hotel_kayak.com_September2020.json.gz']

Unnamed: 0,row_id,origin,name,address,page_url,telephone,addressregion,streetaddress,addresslocality,addresscountry,longitude,latitude,telephone_,phone_object,E.164 format,telephoneNorm
1411028,4,Hotel_kayak.com_September2020.json.gz,Altstadt-Palais Lippischer Hof,"{'addresslocality': 'Bad Salzuflen', 'postalco...",https://www.il.kayak.com/Bad-Salzuflen-Hotels-...,+49 5222 53 40,North Rhine-Westphalia,Mauerstraße 1 - 5,Bad Salzuflen,DE,,,4952225340,"{'country_code': 49, 'extension': None, 'natio...",494952225340,494952225340
1411029,5,Hotel_kayak.com_September2020.json.gz,éL Hotel Royale Bandung,"{'addressregion': 'West Java', 'addresslocalit...",https://www.il.kayak.com/Bandung-Hotels-eL-Hot...,+62 224 232 286,West Java,Jl. Merdeka 2,Bandung,ID,,,62224232286,"{'country_code': 62, 'extension': None, 'natio...",6262224232286,6262224232286
1411030,6,Hotel_kayak.com_September2020.json.gz,Zeynep Hotel,"{'addresslocality': 'Belek', 'addressregion': ...",https://www.il.kayak.com/Belek-Hotels-Zeynep-H...,+90 242 725 4180,Antalya Province,Belek Turizm Merkezi,Belek,TR,,,902427254180,"{'country_code': 90, 'extension': None, 'natio...",902427254180,902427254180
1411031,7,Hotel_kayak.com_September2020.json.gz,Hôtel Georges VI,"{'postalcode': '64200', 'addresscountry': 'Fra...",https://www.il.kayak.com/Biarritz-Hotels-Hotel...,+33 5 59 41 82 88,Pyrénées-Atlantiques,10 Rue Du Port Vieux,Biarritz,FR,,,33559418288,"{'country_code': 33, 'extension': None, 'natio...",33559418288,33559418288
1411032,8,Hotel_kayak.com_September2020.json.gz,Fiesta Inn Chihuahua,"{'streetaddress': 'Boulevard Ortiz Mena 2801',...",https://www.il.kayak.com/Chihuahua-Hotels-Fies...,+52 614 429 0100,Chihuahua,Boulevard Ortiz Mena 2801,Chihuahua,MX,,,526144290100,"{'country_code': 52, 'extension': None, 'natio...",526144290100,526144290100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1440570,79526,Hotel_kayak.com_September2020.json.gz,Vital Sporthotel Kristall,"{'addresslocality': 'Finkenberg', 'streetaddre...",https://www.nz.kayak.com/Finkenberg-Hotels-Vit...,+43 528 562 840,Tirol,Dorf 143,Finkenberg,AT,,,43528562840,"{'country_code': 43, 'extension': None, 'natio...",4343528562840,4343528562840
1440571,79527,Hotel_kayak.com_September2020.json.gz,Zhongan Inn Meiyuan Hotel,"{'streetaddress': 'No. 8 Heping Road', 'postal...",https://www.nz.kayak.com/Xi-an-Hotels-Zhongan-...,+86 298 207 6300,Shaanxi,No. 8 Heping Road,Xi'an,CN,,,862982076300,"{'country_code': 86, 'extension': None, 'natio...",862982076300,862982076300
1440572,79541,Hotel_kayak.com_September2020.json.gz,Schulphoek House,"{'postalcode': '7200', 'streetaddress': '181 P...",https://www.il.kayak.com/Hermanus-Hotels-Schul...,+27 283 162 626,Western Cape,"181 Piet Retief St, Sandbaai",Hermanus,ZA,,,27283162626,"{'country_code': 27, 'extension': None, 'natio...",27283162626,27283162626
1440573,79542,Hotel_kayak.com_September2020.json.gz,Beau-Rivage Palace,"{'streetaddress': 'Place du Port 17-19', 'addr...",https://www.il.kayak.com/Lausanne-Hotels-Beau-...,+41 21 613 33 33,Vaud,Place du Port 17-19,Lausanne,CH,,,41216133333,"{'country_code': 41, 'extension': None, 'natio...",41216133333,41216133333
