In [None]:
# Based on some preliminary tests, my idea for table & entity selection is as follows:

# - start with top 30 tables with largets 'tableNclusters' as candidates; in each iteration, randomly select 500 entities 
#   calculate the number of cluster occurences for each table; save the results
# - redo the above step for N=10000 times, and calculate the mean of cluster occurences for each table
# - select the 15 tables with largest mean occurence. The underlying assumption is that, if a table is more likely to 
#   cover some randomly selected entities, it should have a higher probability of including the 500 selected entities
#   in the end.
# - do the above random sampling again on entities and select the 500 entities that are most likely to be included in the
#   15 tables selected from the last step.

In [48]:
import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [49]:
source_path = r"C:\Users\chench10\Downloads\-" 
file = 'MFile_cleaned_upper'

df = pd.read_json(os.path.join(source_path, file), compression='gzip', orient='records', lines=True) 

In [50]:
df_copy = df.copy()

In [51]:
df = df_copy

tmp = pd.DataFrame(df.groupby(['table_id'])['cluster_id'].nunique()).reset_index()
tmp.columns = ['table_id','tableNclusters']
df = df.merge(tmp, on='table_id', how='left').sort_values(by='tableNclusters', ascending=False)

tmp = pd.DataFrame(df.groupby(['cluster_id'])['table_id'].nunique()).reset_index()
tmp.columns = ['cluster_id','clusterNtables']
df = df.merge(tmp, on='cluster_id', how='left')

In [52]:
df

Unnamed: 0,cluster_id,table_id,row_id,page_url,tableNclusters,clusterNtables
0,1,Hotel_marriott.com_September2020.json.gz,5860,https://whattoexpect.marriott.com/rommd,3096,2
1,4342,Hotel_marriott.com_September2020.json.gz,5229,https://www.marriott.com/hotels/local-things-t...,3096,3
2,4359,Hotel_marriott.com_September2020.json.gz,5971,https://www.marriott.com/hotels/travel/bosfx-r...,3096,3
3,4358,Hotel_marriott.com_September2020.json.gz,7383,https://www.marriott.com/hotels/hotel-reviews/...,3096,2
4,4354,Hotel_marriott.com_September2020.json.gz,5552,https://www.marriott.com/hotels/hotel-deals/bo...,3096,2
...,...,...,...,...,...,...
40506,10126,Restaurant_acquaefarinaparis.com_September2020...,14,http://www.acquaefarinaparis.com/el/address-co...,1,2
40507,3864,LocalBusiness_potspace.com_September2020.json.gz,93,https://www.potspace.com/dispensaries/los-ange...,1,2
40508,10126,Restaurant_acquaefarinaparis.com_September2020...,0,https://www.acquaefarinaparis.com/nl/,1,2
40509,10126,Restaurant_acquaefarinaparis.com_September2020...,13,http://www.acquaefarinaparis.com/,1,2


### Table Selection

In [53]:
ncluster = df['cluster_id'].nunique()
ntables  = df['table_id'].nunique()

In [54]:
# np.random.seed(42)
# table_idx = np.random.randint(0, ntables, 15)   # 15 is the number of selected tables
# table_candidates = tables[table_idx]

# cluster_idx = np.random.randint(0, ntables, 500)   # 500 is the number of selected clusters
# cluster_candidates = clusters[cluster_idx]

In [55]:
# initialize a dictionary to store cluster frequency
# cluster_fre = {key: 0 for key in range(ncluster)}

# for t in table_candidates:
#     clusters = df[df['table_id']==t]['cluster_id'].unique()
#     for c in clusters:
#         cluster_fre[c] += 1
        
# results =pd.DataFrame.from_dict(cluster_fre, orient='index', columns=['frequency']).sort_values('frequency', ascending=False)

In [101]:
np.random.seed(42)

sampling_tables = 50
sampling_times = 1000

tables = np.array(df['table_id'].unique())[:sampling_tables]   
clusters = np.array(df['cluster_id'].unique())

array1 = np.ndarray((sampling_times, sampling_tables))
for i in range(sampling_times):             # number of sampling attempts
    table_candidates = tables
    cluster_candidates = np.random.choice(clusters, 500, replace=False)   # 500 is the number of selected clusters

    list_frequency =[]
    for t in table_candidates:
        l1 = set(df[df['table_id']==t].cluster_id.unique())  # clusters that are included in the table  
        l2 = set(cluster_candidates)                         # choosen clusters

        list_frequency.append(len(l1.intersection(l2)))      
    
    #print(list_frequency)
    #if mean(list_frequency) > 200:
    #    break
    array2 = np.append(array1, list_frequency).reshape(-1,sampling_tables)

In [102]:
print(np.mean(array2, axis=0))

[0.11788212 0.1008991  0.07692308 0.06893107 0.06893107 0.04095904
 0.03396603 0.03496503 0.03196803 0.02797203 0.02997003 0.01798202
 0.02097902 0.02097902 0.02097902 0.01798202 0.01798202 0.01298701
 0.01498501 0.00999001 0.01398601 0.01398601 0.00899101 0.01598402
 0.00899101 0.01198801 0.00699301 0.00599401 0.003996   0.00799201
 0.00799201 0.01298701 0.00699301 0.00899101 0.00599401 0.00799201
 0.004995   0.002997   0.00999001 0.003996   0.002997   0.004995
 0.003996   0.00899101 0.00899101 0.004995   0.00599401 0.00799201
 0.003996   0.002997  ]


In [103]:
idx = np.argpartition(np.mean(array2, axis=0), -30)[-30:]     # first narrow the tables to 30
tables_selected = tables[idx]
tables_selected

array(['Hotel_chateauxhotels.co.uk_September2020.json.gz',
       'LocalBusiness_visitdenmark.com_September2020.json.gz',
       'LocalBusiness_infinitishops.com_September2020.json.gz',
       'Restaurant_slicelife.com_September2020.json.gz',
       'Hotel_marriott.co.uk_September2020.json.gz',
       'LocalBusiness_exoticautoshops.com_September2020.json.gz',
       'LocalBusiness_travelks.com_September2020.json.gz',
       'LocalBusiness_lansingsports.org_September2020.json.gz',
       'LocalBusiness_lansing.org_September2020.json.gz',
       'LocalBusiness_acushops.com_September2020.json.gz',
       'LocalBusiness_saabshops.com_September2020.json.gz',
       'Hotel_ihg.com_September2020.json.gz',
       'LocalBusiness_lexrepairshops.com_September2020.json.gz',
       'LocalBusiness_jagshops.com_September2020.json.gz',
       'LocalBusiness_homify.com.my_September2020.json.gz',
       'LocalBusiness_homify.com_September2020.json.gz',
       'LocalBusiness_lrshops.com_September2020.jso

### Entity Selection

In [104]:
# select the 15 tables with largest mean occurence (15 is the number of selected tables)
# then do the above random sampling again on entities and select the 500 entity with largest mean inclusion

In [122]:
# initialize a dictionary to store cluster frequency
cluster_fre = {key: 0 for key in range(1, ncluster+1)}

for t in tables_selected:
    clusters = df[df['table_id']==t]['cluster_id'].unique()
    for c in clusters:
        cluster_fre[c] += 1

In [124]:
cluster_fre
results =pd.DataFrame.from_dict(cluster_fre, orient='index', columns=['frequency']).sort_values('frequency', ascending=False)

In [125]:
clusters_selected = results[:500].index
clusters_selected

Int64Index([9600, 2731, 6284, 9886, 7767, 2884, 2715, 8824, 7280, 3438,
            ...
            8868, 1478, 1479, 4138, 4118, 9538, 1492, 6878, 1630, 7062],
           dtype='int64', length=500)

### Testing

In [126]:
list_test = []
for t in tables_selected:
        l1 = set(df[df['table_id']==t].cluster_id.unique())  # clusters that are included in the table  
        l2 = set(clusters_selected)                          # choosen clusters

        list_test.append(len(l1.intersection(l2))) 

In [127]:
list_test

[0,
 0,
 205,
 0,
 0,
 171,
 0,
 0,
 0,
 220,
 258,
 0,
 319,
 351,
 0,
 0,
 377,
 383,
 0,
 0,
 408,
 456,
 403,
 432,
 470,
 485,
 0,
 0,
 0,
 0]