# This notebook evaluates the predictions of the TinyBert Baseline output

In [1]:
import pandas as pd
from sklearn.metrics import f1_score
import os
import numpy as np

In [2]:
#get path information
product_path = '../../../../src/data/product'
train_test_all_filtered_path_2 = os.path.join(product_path, 'train_test_split/output_unfiltered_tables/large/after_manual_checking/baselines')
data_path = '../../../../src/data'
mapping_corpus_path_2 = data_path + r'/product/lspc2020_to_tablecorpus/Cleaned'
train_test_all_filtered_path = os.path.join(product_path, 'train_test_split/output_unfiltered_tables/large/after_manual_checking')

In [3]:
#load the TinyBert results
res = pd.read_csv('../Baseline/TinyBert_Results/predict_results_None.txt', sep='\t')
res.drop('index', axis=1, inplace=True)

In [4]:
#load the test set
real = pd.read_csv(os.path.join(train_test_all_filtered_path_2,'df_test.csv'))

In [5]:
final = pd.concat([real,res], axis=1)

In [6]:
f1_score(real.label, res.prediction, average='micro')

0.8329422806194275

In [7]:
f1_scores = f1_score(real.label, res.prediction, average=None, labels=real.label)
f1_scores_with_labels = {label:score for label,score in zip(real.label, f1_scores)}

In [8]:
df_f1 = pd.DataFrame.from_dict(f1_scores_with_labels, orient="index").reset_index().rename(columns={'index':'label',0:'f1'}).sort_values(by=['label'])

In [None]:
# get information on label, prediction, and cluster itself in one table
#df_join_sentence = pd.merge(df_f1, real.drop_duplicates(subset=['label']), how="left", on=["label", "label"])

In [None]:
# get information on label, prediction, and cluster itself in one table
#pd.merge(final, df_f1, how="left", on=["label", "label"]).sort_values(by=['label']).to_excel('f1_per_cluster_baseline.xlsx')

# F1 Scores for different domains

In [9]:
domains=['Bikes','Cars','Clothes','Drugstore','Electronics','Random','Technology','Tools']

In [10]:
#get all clusters with information
combined_csv_data = pd.concat([pd.read_csv(os.path.join(mapping_corpus_path_2, f"{file}_cluster_8_tables.csv")) for file in domains]).drop(columns=['Unnamed: 0'])

In [11]:
#get cluster_ids
cluster_list=[]
files_representation_train = [file for file in os.listdir(os.path.join(train_test_all_filtered_path,'train_cleaned')) if file.endswith('.json.gz')]
for zip_file in files_representation_train:
    df = pd.read_json(os.path.join(train_test_all_filtered_path,'train_cleaned') + '/{}'.format(zip_file), compression='gzip', lines=True)
    cluster_list.extend(df['cluster_id'].tolist())
    # get only clusters that are unique
unique_clusters = np.unique(cluster_list)
unique_clusters = np.delete(unique_clusters, 0)

In [12]:
combined_csv_data_filter = combined_csv_data[combined_csv_data['cluster_id'].isin(unique_clusters)].drop_duplicates(subset=['cluster_id'])

In [13]:
#scale cluster_ids
combined_csv_data_filter['label'] = combined_csv_data_filter.groupby('cluster_id').ngroup()
#combine information with predictions and labels
df_domain_f1 = pd.merge(final.drop(columns='sentence1'), combined_csv_data_filter.drop(columns=['cluster_id']), how="left", on=["label", "label"])

In [14]:
for domain in domains:
    f1_domain = f1_score(df_domain_f1[df_domain_f1['domain']==domain].label, df_domain_f1[df_domain_f1['domain']==domain].prediction, average='micro')
    print(f"F1 Score for {domain}: {f1_domain}")

F1 Score for Bikes: 1.0
F1 Score for Cars: 0.8701298701298701
F1 Score for Clothes: 0.7711978465679677
F1 Score for Drugstore: 0.5866336633663366
F1 Score for Electronics: 0.760904170646291
F1 Score for Random: 0.8603326498063341
F1 Score for Technology: 0.9616240266963292
F1 Score for Tools: 0.0


  _warn_prf(


# F1 scores for size of train data

In [18]:
#load the train set
df_train = pd.read_csv(os.path.join(train_test_all_filtered_path_2,'df_train.csv'))
#join label, predictions and amount test set
df_amount_train_f1 = pd.merge(final.drop(columns='sentence1'), df_train.groupby('label').count(), how="left", on=["label", "label"])

In [36]:
bins = [0, 5, 10, 25, 50, 100, 150]
#create bins for categorization
df_amount_train_f1['binned'] = pd.cut(df_amount_train_f1['sentence1'], bins)

In [39]:
for interval in df_amount_train_f1['binned'].unique().to_list():
    f1_train_size = f1_score(df_amount_train_f1[df_amount_train_f1['binned']==interval].label, df_amount_train_f1[df_amount_train_f1['binned']==interval].prediction, average='micro')
    print(f"F1 Score for size of train set per cluster in {interval}: {f1_train_size}")

F1 Score for size of train set per cluster in (10, 25]: 0.9335064935064935
F1 Score for size of train set per cluster in (5, 10]: 0.8269733403031886
F1 Score for size of train set per cluster in (0, 5]: 0.6513605442176871
F1 Score for size of train set per cluster in (25, 50]: 0.9682352941176471
F1 Score for size of train set per cluster in (50, 100]: 0.9788106630211895
F1 Score for size of train set per cluster in (100, 150]: 0.9523809523809523
