# This notebook was used to create the TURL input representation tables and all necessary files

In [2]:
import pandas as pd
import os

In [29]:
# Locations of the product data, relative to this notebook.
product_path = '../../../../src/data/product'
train_test_all_filtered_path = os.path.join(
    product_path,
    'train_test_split/output_unfiltered_tables/large/after_manual_checking',
)

# Names of all gzipped JSON-lines tables found in the 'val_cleaned' folder.
listing_dir = os.path.join(train_test_all_filtered_path, 'val_cleaned')
files_representation_train = [
    entry for entry in os.listdir(listing_dir) if entry.endswith('.json.gz')
]

# Folder that receives the generated TURL input files.
turl_input_path = os.path.join(product_path, 'TURL/input')

# Generate representations for the rewritten and transposed versions. Attention: the generation has to be done separately for test, val and train by changing the paths accordingly

In [1]:
# Build the "rewritten" TURL table representation from the cleaned product tables.
#
# Every table becomes a list with the layout
#   [table id, page title, wikipedia page id, entity info, caption,
#    headers, cell columns, cluster labels]
# where the four string slots after the table id are left empty.
#
# Folder of the split to convert. The file list is taken from this same folder
# so that listing and reading can never point at different splits. Change
# 'test_cleaned' (e.g. to 'val_cleaned') to process another split.
rewritten_split_dir = os.path.join(train_test_all_filtered_path, 'test_cleaned')
rewritten_split_files = [
    file for file in os.listdir(rewritten_split_dir) if file.endswith('.json.gz')
]

train_representation = []  # one entry per table
for zip_file in rewritten_split_files:
    print('/{}'.format(zip_file))
    df = pd.read_json(os.path.join(rewritten_split_dir, zip_file), compression='gzip', lines=True)
    if 'description' not in df.columns:  # tables without a description get an empty one
        df['description'] = ''
    if 'name' not in df.columns:  # tables without a name fall back to their tokens column
        df['name'] = df['tokens']
    # Drop the rows whose cluster_id is -100 and renumber the remaining rows from 0.
    df_cleaned = df[df['cluster_id'] != -100].reset_index(drop=True)

    # One list of cells per column; a cell is [[row position, column position], [row_id, text]].
    column_1_representation = [
        [[i, 0], [row_id, str(name)]]
        for i, (row_id, name) in enumerate(zip(df_cleaned['row_id'], df_cleaned['name']))
    ]
    column_2_representation = [
        [[i, 1], [row_id, str(description)]]
        for i, (row_id, description) in enumerate(zip(df_cleaned['row_id'], df_cleaned['description']))
    ]

    table_representation = [
        zip_file,                  # table id
        '',                        # page title -> not relevant
        '',                        # wikipedia page id -> not given
        '',                        # information about entity -> same for all tables
        '',                        # table caption -> not given
        ['name', 'description'],   # headers
        [column_1_representation, column_2_representation],
        [[str(cluster_id)] for cluster_id in df_cleaned['cluster_id']],  # one label list per row
    ]
    train_representation.append(table_representation)

In [2]:
# Build the "transposed" TURL table representation from the cleaned product tables:
# every product row becomes its own entry holding a name cell and a description cell,
# and the header list repeats 'Product' once per kept row.
transposed_source_dir = os.path.join(train_test_all_filtered_path, 'val_cleaned')

train_representation = []  # representation of the whole set, one entry per table
for zip_file in files_representation_train:
    print('/{}'.format(zip_file))
    df = pd.read_json(transposed_source_dir + '/{}'.format(zip_file), compression='gzip', lines=True)
    if 'description' not in df.columns:  # no description column -> use empty strings
        df['description'] = ''
    if 'name' not in df.columns:  # no name column -> use the tokens column instead
        df['name'] = df.tokens
    df['header'] = 'Product'  # constant header value for every row

    # Keep only rows whose cluster_id is not -100 and renumber them from 0.
    keep_mask = df['cluster_id'] != -100
    df_cleaned = df[keep_mask].reset_index().drop(columns=['index'])

    # A cell is [[column position, row position], [row_id, text]].
    all_rows_representation = [
        [
            [[0, i], [df_cleaned['row_id'][i], str(df_cleaned['name'][i])]],
            [[1, i], [df_cleaned['row_id'][i], str(df_cleaned['description'][i])]],
        ]
        for i in range(len(df_cleaned))
    ]

    table_representation = [
        zip_file,                          # table id
        '',                                # page title -> not relevant
        '',                                # wikipedia page id -> not given
        '',                                # information about entity -> same for all tables
        '',                                # table caption -> not given
        df_cleaned['header'].to_list(),    # headers
        all_rows_representation,
        df_cleaned['cluster_id'].apply(lambda x: [str(x)]).to_list(),  # one label list per row
    ]
    train_representation.append(table_representation)

In [11]:
# #cannot read numpy integers in json -> encoder for saving
import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    """JSON encoder that turns NumPy scalars and arrays into built-in Python values."""

    def default(self, obj):
        """Return a JSON-serializable replacement for ``obj``.

        NumPy integers become ``int``, NumPy floats become ``float`` and arrays
        become (nested) lists. Every other object is passed on to the base
        class, which raises ``TypeError``.
        """
        converters = (
            (np.integer, int),
            (np.floating, float),
            (np.ndarray, lambda array: array.tolist()),
        )
        for numpy_type, convert in converters:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

In [32]:
# Save the collected table representations in the TURL input folder.
# NOTE(review): the data is encoded twice (json.dumps produces a string, json.dump
# then writes that string as a JSON string literal), so a reader presumably has to
# decode twice as well -- confirm against the loader before changing this.
representation_file = os.path.join(turl_input_path, 'val_cleaned_representation_header_product_cleaned.json')
with open(representation_file, mode='w') as f:
    encoded_tables = json.dumps(train_representation, cls=NpEncoder)
    json.dump(encoded_tables, f)

In [6]:
# Generate type_vocab_clusters.txt: the cluster ids of the concatenated train data plus '-100'.
# NOTE(review): df_test and df_val are loaded but not used in this cell.
df_train = pd.read_json(
    os.path.join(train_test_all_filtered_path, 'train/concatenated_data/train_all_filtered_tables.json.gz'),
    compression='gzip',
    lines=True,
)
df_test = pd.read_json(
    os.path.join(train_test_all_filtered_path, 'test/concatenated_data/test_all_filtered_tables.json.gz'),
    compression='gzip',
    lines=True,
)
df_val = pd.read_json(
    os.path.join(train_test_all_filtered_path, 'val/concatenated_data/val_all_filtered_tables.json.gz'),
    compression='gzip',
    lines=True,
)

# Distinct cluster ids as strings, followed by the '-100' entry.
# NOTE(review): '-100' is appended unconditionally; if the train data already contains
# cluster id -100 it would be listed twice -- confirm.
ids = np.append(df_train['cluster_id'].astype(str).unique(), '-100')

# Write one "<position>\t<cluster id>" line per entry, without a header row.
type_vocab_file = os.path.join(turl_input_path, 'type_vocab_clusters.txt')
pd.DataFrame(ids).to_csv(type_vocab_file, sep='\t', index=True, header=False)

# Generate representation for LocalBusiness data with the same setting

In [68]:
# Locations of the LocalBusiness data, relative to this notebook.
# (The variable names are shared with the product section above.)
product_path = '../../../../src/data/LocalBusiness'
train_test_all_filtered_path = os.path.join(product_path, 'Splitting_ManualCheck/Train_Validation_Test')

# CSV tables of the split to convert. Only one listing is assigned here; to process
# another split, change the folder name (e.g. to 'train_tables_cleaned') and keep it
# in sync with the folder read by the conversion cells.
files_representation_train = [
    file
    for file in os.listdir(os.path.join(train_test_all_filtered_path, 'validation_tables_cleaned'))
    if file.endswith('.csv')
]

# Folder that receives the generated TURL input files.
turl_input_path = os.path.join(product_path, 'TURL/input')

In [3]:
# Build the "rewritten" TURL table representation from the cleaned LocalBusiness
# validation tables. Only the name column is used; tables that have no rows left
# after removing cluster_id -100 are skipped.
validation_tables_dir = os.path.join(train_test_all_filtered_path, 'validation_tables_cleaned')

train_representation = []  # one entry per non-empty table
for zip_file in files_representation_train:
    print('/{}'.format(zip_file))
    df = pd.read_csv(validation_tables_dir + '/{}'.format(zip_file))
    if 'name' not in df.columns:  # no name column -> use the page_url column instead
        df['name'] = df.page_url

    rows_with_cluster = df[df['cluster_id'] != -100]
    if rows_with_cluster.empty:  # nothing left to represent for this table
        continue
    df_cleaned = rows_with_cluster.reset_index().drop(columns=['index'])

    # A cell is [[row position, column position], [row index, text]].
    column_1_representation = [
        [[i, 0], [df_cleaned.index[i], str(df_cleaned['name'][i])]]
        for i in range(len(df_cleaned))
    ]

    train_representation.append([
        zip_file,                     # table id
        '',                           # page title -> not relevant
        '',                           # wikipedia page id -> not given
        '',                           # information about entity -> same for all tables
        '',                           # table caption -> not given
        ['name'],                     # headers
        [column_1_representation],    # all columns (only one here)
        df_cleaned['cluster_id'].apply(lambda x: [str(x)]).to_list(),  # one label list per row
    ])

In [4]:
# Build the "transposed" TURL table representation from the cleaned LocalBusiness
# tables: every row becomes its own entry holding a single name cell, and the header
# list repeats 'LocalBusiness' once per kept row.
#
# Folder of the split to convert. The file list is taken from this same folder so
# that listing and reading can never point at different splits. Change
# 'train_tables_cleaned' (e.g. to 'validation_tables_cleaned') to process another split.
transposed_split_dir = os.path.join(train_test_all_filtered_path, 'train_tables_cleaned')
transposed_split_files = [
    file for file in os.listdir(transposed_split_dir) if file.endswith('.csv')
]

train_representation = []  # one entry per non-empty table
for table_file in transposed_split_files:
    print('/{}'.format(table_file))
    df = pd.read_csv(os.path.join(transposed_split_dir, table_file))
    if 'name' not in df.columns:  # no name column -> use the page_url column instead
        df['name'] = df['page_url']

    # Drop the rows whose cluster_id is -100 and renumber the remaining rows from 0.
    df_cleaned = df[df['cluster_id'] != -100].reset_index(drop=True)
    if df_cleaned.empty:  # skip tables without any remaining row
        continue

    # A cell is [[column position, row position], [row index, text]]; one cell per row entry.
    all_rows_representation = [
        [[[0, i], [i, str(name)]]]
        for i, name in enumerate(df_cleaned['name'])
    ]

    table_representation = [
        table_file,                              # table id
        '',                                      # page title -> not relevant
        '',                                      # wikipedia page id -> not given
        '',                                      # information about entity -> same for all tables
        '',                                      # table caption -> not given
        ['LocalBusiness'] * len(df_cleaned),     # headers, one per kept row
        all_rows_representation,
        [[str(cluster_id)] for cluster_id in df_cleaned['cluster_id']],  # one label list per row
    ]
    train_representation.append(table_representation)

# Save as json

In [5]:
#cannot read numpy integers in json -> encoder for saving
import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    """JSON encoder that turns NumPy scalars and arrays into built-in Python values."""

    def default(self, obj):
        """Return a JSON-serializable replacement for ``obj``.

        NumPy integers become ``int``, NumPy floats become ``float`` and arrays
        become (nested) lists. Every other object is passed on to the base
        class, which raises ``TypeError``.
        """
        converters = (
            (np.integer, int),
            (np.floating, float),
            (np.ndarray, lambda array: array.tolist()),
        )
        for numpy_type, convert in converters:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

In [20]:
# Save the current content of train_representation as the LocalBusiness test file.
# NOTE(review): the data is encoded twice (json.dumps produces a string, json.dump
# then writes that string as a JSON string literal) -- confirm the reader expects this.
test_representation_file = os.path.join(turl_input_path, 'test_representation_rewritten_cleaned_lb.json')
with open(test_representation_file, mode='w') as f:
    encoded_tables = json.dumps(train_representation, cls=NpEncoder)
    json.dump(encoded_tables, f)

In [71]:
# Save the current content of train_representation as the LocalBusiness validation file.
# NOTE(review): the data is encoded twice (json.dumps produces a string, json.dump
# then writes that string as a JSON string literal) -- confirm the reader expects this.
val_representation_file = os.path.join(turl_input_path, 'val_cleaned_representation_rewritten_cleaned_lb.json')
with open(val_representation_file, mode='w') as f:
    encoded_tables = json.dumps(train_representation, cls=NpEncoder)
    json.dump(encoded_tables, f)

In [5]:
# Generate the type vocabulary (list of cluster ids) used as TURL input for LocalBusiness.
# NOTE(review): this cell re-assigns files_representation_train to the files of the
# 'test tables' folder; re-run the path cell before converting another split.
cluster_list = []
test_tables_dir = os.path.join(train_test_all_filtered_path, 'test tables')
files_representation_train = [file for file in os.listdir(test_tables_dir) if file.endswith('.csv')]
for zip_file in files_representation_train:
    print('/{}'.format(zip_file))
    df = pd.read_csv(os.path.join(test_tables_dir, zip_file))
    cluster_list.extend(df['cluster_id'].tolist())

# Sorted distinct cluster ids over all tables.
unique_clusters = np.unique(cluster_list)
# Remove the -100 id explicitly instead of deleting whatever value happens to sort
# first: a positional delete drops a real cluster id when -100 is absent and fails
# on an empty list.
# NOTE(review): assumes the entry that was meant to be dropped is -100, the id that
# the conversion cells filter out -- confirm.
unique_clusters = unique_clusters[unique_clusters != -100]

# Save the cluster ids as "<position>\t<cluster id>" lines, without a header row.
pd.DataFrame(unique_clusters.astype(str)).to_csv(
    path_or_buf=os.path.join(turl_input_path, 'type_vocab_clusters_lb.txt'),
    sep='\t',
    index=True,
    header=False,
)