In [1]:
from datetime import datetime
from os.path import join
from os import listdir
import json
import re #for camel case conversion
from collections import Counter

from sklearn.metrics import classification_report
import pandas as pd

import altair as alt
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [2]:
def sherlock_case(s):
    s = re.sub(r"(_|-)+", " ", s).title().replace(" ", "")
#     s = ''.join([s[0].lower(), s[1:]])
#     s = ''.join(map(lambda x: x if x.islower() else " "+x, s))
    return s.lower()

In [3]:
true_types_sherlock = list(map(sherlock_case, pd.read_parquet("../../results/true_types/sherlock_validation.parquet").values.flatten()))
true_types = list(map(sherlock_case, pd.read_parquet("../../results/true_types/sato.parquet").values.flatten()))
prediction_sherlock = list(map(sherlock_case, pd.read_parquet("../../results/predictions/sherlock_sato.parquet").values.flatten()))
prediction_sato = list(map(sherlock_case, pd.read_parquet("../../results/predictions/sato_sato.parquet").values.flatten()))

In [4]:
type_freq_df = pd.DataFrame(true_types, columns=['type'])
type_freq_df = pd.DataFrame(type_freq_df['type'].value_counts())
type_freq_df.columns = ['count']
type_freq_df.index.name = 'type'

In [5]:
alt.Chart(type_freq_df.reset_index()).mark_bar(size=15).encode(
    x = alt.X('type:O',
              title = 'Semantic Types',
              sort=alt.EncodingSortField(
                field="count",  
                order="descending")),
    y = alt.Y('count', title='Number of Samples')    
)

In [6]:
print(len(prediction_sherlock))
print(len(prediction_sato))
print(len(true_types))
print(true_types[:5])
print(prediction_sherlock[:5])

120609
120609
120609
['description', 'name', 'artist', 'description', 'name']
['description', 'name', 'name', 'description', 'address']


In [7]:
# # print(prediction_sato)
# print(prediction_sherlock)
# print(true_types)
# for idx, i in enumerate(true_types):
#     if type(i) != str:
#         print(idx)
# print(true_types[407])
# print(prediction_sherlock[407])

In [8]:
# print(classification_report(true_types, prediction_sato))

In [9]:
# class_report = classification_report(true_types, prediction_sato, output_dict=True)
# class_report = dict(list(class_report.items()))
# class_report_df = pd.DataFrame.from_dict(class_report)
# class_report_df.to_csv('csv_report_sato_sato.csv')

In [10]:
# print(classification_report(true_types, prediction_sherlock))

In [11]:
# class_report = classification_report(true_types, prediction_sherlock, output_dict=True)
# class_report = dict(list(class_report.items()))
# class_report_df = pd.DataFrame.from_dict(class_report)
# class_report_df.to_csv('csv_report_sherlock_sato.csv')

In [12]:
report_sherlock = classification_report(true_types, prediction_sherlock, output_dict=True)
report_sherlock_df_input = {k: list(v.values()) for k, v in list(report_sherlock.items())[:-3]} #last 3 are total f1/macro/weigthed, these are not needed
report_sherlock_df = pd.DataFrame.from_dict(report_sherlock_df_input, orient='index', columns=['precision', 'recall', 'f1-score', 'support']).sort_values(by='f1-score', ascending = False)

report_sato = classification_report(true_types, prediction_sato, output_dict=True)
report_sato_df_input = {k: list(v.values()) for k, v in list(report_sato.items())[:-3]} #last 3 are total f1/macro/weigthed, these are not needed
report_sato_df = pd.DataFrame.from_dict(report_sato_df_input, orient='index', columns=['precision', 'recall', 'f1-score', 'support']).sort_values(by='f1-score', ascending = False)


In [13]:
combined_report_df = pd.merge(report_sato_df, report_sherlock_df, left_index=True, right_index=True)
combined_report_df.index.names = ['type']
combined_report_df.columns = ['precision_sato', 'recall_sato', 'f1-score_sato', 'support_sato', 'precision_sherlock', 'recall_sherlock', 'f1-score_sherlock', 'support_sherlock']
# optional, you can change the column to sort by to f1-score_sato, or not sort at all by commenting the next line
combined_report_df = combined_report_df.sort_values(by='f1-score_sherlock', ascending = False) 

In [14]:
combined_report_df

Unnamed: 0_level_0,precision_sato,recall_sato,f1-score_sato,support_sato,precision_sherlock,recall_sherlock,f1-score_sherlock,support_sherlock
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
isbn,0.995575,0.986842,0.991189,228,1.000000,0.964912,0.982143,228
grades,1.000000,0.986301,0.993103,219,0.951965,0.995434,0.973214,219
gender,0.990758,0.950355,0.970136,564,0.979964,0.953901,0.966757,564
industry,0.929864,0.976247,0.952491,421,0.936795,0.985748,0.960648,421
currency,0.970149,0.928571,0.948905,70,0.920000,0.985714,0.951724,70
...,...,...,...,...,...,...,...,...
address,0.938095,0.936980,0.937537,841,0.127688,0.917955,0.224191,841
sales,0.950000,0.633333,0.760000,30,0.122807,0.466667,0.194444,30
ranking,0.727273,0.145455,0.242424,55,0.147059,0.181818,0.162602,55
director,0.818182,0.545455,0.654545,33,0.083333,0.575758,0.145594,33


In [15]:
mismatches_sherlock = list()
mismatches_sherlock_idx = list()
mismatches_sato = list()
mismatches_sato_idx = list()

print_count = 0
for idx, true_type in enumerate(true_types):
    predicted_type_sherlock = prediction_sherlock[idx]
    predicted_type_sato = prediction_sato[idx]

    if true_type != predicted_type_sherlock:
        mismatches_sherlock.append(true_type)
        mismatches_sherlock_idx.append(idx)

    if true_type != predicted_type_sato:
        mismatches_sato.append(true_type)
        mismatches_sato_idx.append(idx)
        
        # zoom in to specific errors
        # if true_type in ('state') and print_count <= 6:
        #     print_count += 1
        #     print(f'Expected "{true_type}" but predicted "{predicted_type}"')
        #     print(f'{data[idx]}\n')
        

mismatch_sherlock_class_count = Counter(mismatches_sherlock)
print(mismatch_sherlock_class_count.most_common()[:10])

mismatches_sato_class_count = Counter(mismatches_sato)
print(mismatches_sato_class_count.most_common()[:10])

[('name', 4363), ('rank', 1755), ('age', 1591), ('team', 1527), ('description', 1488), ('weight', 1431), ('location', 1096), ('type', 899), ('position', 841), ('city', 583)]
[('name', 520), ('category', 508), ('description', 435), ('team', 380), ('notes', 337), ('location', 286), ('type', 264), ('rank', 254), ('position', 192), ('class', 185)]


In [16]:
mismatch_sherlock_freq_df = pd.DataFrame(mismatches_sherlock, columns=['type'])
mismatch_sherlock_freq_df = pd.DataFrame(mismatch_sherlock_freq_df['type'].value_counts())
mismatch_sherlock_freq_df.columns = ['count']
mismatch_sherlock_freq_df.index.name = 'type'

mismatch_sato_freq_df = pd.DataFrame(mismatches_sato, columns=['type'])
mismatch_sato_freq_df = pd.DataFrame(mismatch_sato_freq_df['type'].value_counts())
mismatch_sato_freq_df.columns = ['count']
mismatch_sato_freq_df.index.name = 'type'

type_freq_df = pd.DataFrame(true_types, columns=['type'])
type_freq_df = pd.DataFrame(type_freq_df['type'].value_counts())
type_freq_df.columns = ['count']
type_freq_df.index.name = 'type'

In [17]:
combined_mismatch_freq_df = pd.merge(mismatch_sherlock_freq_df, mismatch_sato_freq_df, left_index=True, right_index=True, how='outer')
combined_mismatch_freq_df = pd.merge(combined_mismatch_freq_df, type_freq_df, left_index=True, right_index=True, how='outer')
combined_mismatch_freq_df.columns=['sherlock_mismatch_freq', 'sato_mismatch_freq', 'true_type_freq']

In [18]:
alt.Chart(combined_mismatch_freq_df.reset_index()).transform_fold(
      ['sherlock_mismatch_freq', 'sato_mismatch_freq', 'true_type_freq'],
      as_=['column', 'value']
    ).mark_bar(size=15).encode(
    column=alt.Column('type:O', sort=alt.EncodingSortField(
                field="true_type_freq",  
                order="descending")),
    x = alt.X('column:N',
              title = '',
              ),
    y=alt.Y('value:Q'),
    color='column:N'
)

In [19]:
# take a closer look at long tail types
combined_mismatch_freq_df = combined_mismatch_freq_df.sort_values(by='true_type_freq', ascending = False) 
# print(combined_mismatch_freq_df.tail(30))
# print(combined_report_df.sort_values(by='support_sato', ascending = False).tail(30))
long_tail_df = combined_report_df[20:]
small_tail_df = combined_report_df[:20]
print(long_tail_df[['f1-score_sato']].mean())
print(long_tail_df[['f1-score_sherlock']].mean())
print(small_tail_df[['f1-score_sato']].mean())
print(small_tail_df[['f1-score_sherlock']].mean())

f1-score_sato    0.828339
dtype: float64
f1-score_sherlock    0.615016
dtype: float64
f1-score_sato    0.946768
dtype: float64
f1-score_sherlock    0.926792
dtype: float64


In [20]:
# print(combined_mismatch_freq_df)

In [21]:
alt.Chart(combined_report_df.reset_index()).transform_fold(
      ['precision_sato', 'recall_sato', 'f1-score_sato', 'precision_sherlock', 'recall_sherlock', 'f1-score_sherlock'],
      as_=['column', 'value']
    ).mark_bar(size=15).encode(
    column=alt.Column('type:O', sort=alt.EncodingSortField(
                field="support_sherlock",  
                order="descending")),
    x = alt.X('column:N',
              title = '',
              ),
    y=alt.Y('value:Q'),
    color='column:N'
)

### Looking at the score improvement for each type

In [22]:
score_improvement_df = pd.DataFrame()
#precision, recall and f1-score
score_improvement_df["precision"] = report_sato_df["precision"] - report_sherlock_df["precision"]
score_improvement_df["recall"] = report_sato_df["recall"] - report_sherlock_df["recall"]
score_improvement_df["f1-score"] = report_sato_df["f1-score"] - report_sherlock_df["f1-score"]
score_improvement_df["support"] = report_sato_df["support"]
score_improvement_df.index.name = 'type'

In [23]:
print(score_improvement_df.sort_values('f1-score', ascending=False).head(10))

          precision    recall  f1-score  support
type                                            
address    0.810408  0.019025  0.713347      841
sex        0.352610  0.664921  0.597497      382
sales      0.827193  0.166667  0.565556       30
product    0.674031 -0.064690  0.514110      371
director   0.734848 -0.030303  0.508952       33
filesize  -0.057130  0.659091  0.495205      220
teamname   0.617935  0.222222  0.481022      243
person     0.816667 -0.027778  0.421196       36
region     0.511256  0.030769  0.385607      325
capacity   0.505464  0.272727  0.376822       55


In [24]:
alt.Chart(score_improvement_df.reset_index()).transform_fold(
      ['precision', 'recall', 'f1-score'],
      as_=['column', 'value']
    ).mark_bar(size=15).encode(
    column=alt.Column('type:O', sort=alt.EncodingSortField(
                field="f1-score",  
                order="descending")),
    x = alt.X('column:N',
              title = '',
              ),
    y=alt.Y('value:Q'),
    color='column:N'
)

In [25]:
sato_top_5_df = report_sato_df.loc[report_sato_df['support']>0].sort_values(by=['f1-score'], ascending=False).head(5)
sato_top_5_df.index.name = 'type'
sato_bottom_5_df = report_sato_df.loc[report_sato_df['support']>0].sort_values(by=['f1-score'], ascending=False).tail(5)
sato_bottom_5_df.index.name = 'type'
sherlock_top_5_df = report_sherlock_df.loc[report_sherlock_df['support']>0].sort_values(by=['f1-score'], ascending=False).head(5)
sherlock_top_5_df.index.name = 'type'
sherlock_bottom_5_df = report_sherlock_df.loc[report_sherlock_df['support']>0].sort_values(by=['f1-score'], ascending=False).tail(5)
sherlock_bottom_5_df.index.name = 'type'

In [26]:
# altair chart representing the top 5 for both sato and sherlock
alt.Chart(sato_top_5_df.reset_index()).transform_fold(
        ['precision', 'recall', 'f1-score'],
      as_=['column', 'value']
    ).mark_bar(size=15).encode(
    column=alt.Column('type:O', sort=alt.EncodingSortField(
                field="f1-score",  
                order="descending")),
    x = alt.X('column:N',
              title = '',
              ),
    y=alt.Y('value:Q'),
    color='column:N'
)

In [27]:
alt.Chart(sherlock_top_5_df.reset_index()).transform_fold(
        ['precision', 'recall', 'f1-score'],
      as_=['column', 'value']
    ).mark_bar(size=15).encode(
    column=alt.Column('type:O', sort=alt.EncodingSortField(
                field="f1-score",  
                order="descending")),
    x = alt.X('column:N',
              title = '',
              ),
    y=alt.Y('value:Q'),
    color='column:N'
)

In [28]:
alt.Chart(sato_bottom_5_df.reset_index()).transform_fold(
        ['precision', 'recall', 'f1-score'],
      as_=['column', 'value']
    ).mark_bar(size=15).encode(
    column=alt.Column('type:O', sort=alt.EncodingSortField(
                field="f1-score",  
                order="descending")),
    x = alt.X('column:N',
              title = '',
              ),
    y=alt.Y('value:Q'),
    color='column:N'
)

In [29]:
alt.Chart(sherlock_bottom_5_df.reset_index()).transform_fold(
        ['precision', 'recall', 'f1-score'],
      as_=['column', 'value']
    ).mark_bar(size=15).encode(
    column=alt.Column('type:O', sort=alt.EncodingSortField(
                field="f1-score",  
                order="descending")),
    x = alt.X('column:N',
              title = '',
              ),
    y=alt.Y('value:Q'),
    color='column:N'
)

In [30]:
# Correlation between support in original Sherlock dataset and F1-score on Sato dataset
corr_types = set(true_types_sherlock)
corr_data = pd.DataFrame(columns=['support'], index=list(corr_types))
sherlock_occur_df = pd.DataFrame(true_types_sherlock, columns=['true_types'])
corr_data['support'] = sherlock_occur_df['true_types'].value_counts()
corr_data = corr_data.join(report_sherlock_df[['f1-score']]) #['f1-score'] = report_sherlock_df
corr_data.reset_index(inplace=True)
worst_5 = list(sherlock_bottom_5_df.index.values)
top_5 = list(sherlock_top_5_df.index.values)
alt.Chart(corr_data).mark_circle(size=60).encode(
    x='support',
    y='f1-score',
    color=alt.condition(
        alt.Predicate(alt.FieldOneOfPredicate(field='index', oneOf=worst_5)),
        alt.value('red'),
        alt.value('steelblue')
    ),
).configure_axis(
    labelFontSize=18,
    titleFontSize=18
)

In [31]:
path = '../../../sato/table_data/sato_tables/all'

In [32]:
filepaths = [join(path, f) for f in listdir(path) if f.endswith('.csv')]
print(len(filepaths))
tables_of_interest = {}
single_col_tables = {}
multi_col_tables = {}
single_col_true_types = []
single_col_predictions_sato = []
single_col_predictions_sherlock = []
multi_col_true_types = []
multi_col_predictions_sato = []
col_idx = 0
multi_col_sherlock_increase = {}
sato_increase_count = 0

for subdir in listdir(path):
    print(subdir)
    if(subdir == 'processed'): continue
    path_subdir = join(path, subdir)
    #loop over all files in subdir
    for file in listdir(path_subdir):
        fp = join(path_subdir, file)
        table_id = fp[fp.rfind('/')+1:-4]
        df = pd.read_csv(fp)
        table_header = list(df)
#         print(table_header)
        table_df = pd.read_csv(fp, index_col=0)

        table_predictions_sato = []
        table_predictions_sherlock = []
        table_true_types = list(map(lambda x: x.split('.')[0], table_header))
#         print(table_true_types)

        for idx, col_type in enumerate(table_header):        
              table_predictions_sato.append(prediction_sato[col_idx])
              table_predictions_sherlock.append(prediction_sherlock[col_idx])
              col_idx += 1

        #check count of matches between table_predictions_sato and table_true_types
        #check count of matches between table_predictions_sherlock and table_true_types
        table_predictions_sato_count = 0
        table_predictions_sherlock_count = 0
        for idx, col_true_type in enumerate(table_true_types):
          if col_true_type == table_predictions_sato[idx]:
            table_predictions_sato_count += 1
          if col_true_type == table_predictions_sherlock[idx]:
            table_predictions_sherlock_count += 1

        if len(table_header) == 1:
            single_col_true_types.extend(table_true_types)
            single_col_predictions_sato.extend(table_predictions_sato)
            single_col_predictions_sherlock.extend(table_predictions_sherlock)
            single_col_tables[table_id] = {
                'true_types': table_true_types,
                'predictions_sato': table_predictions_sato,
                'predictions_sherlock': table_predictions_sherlock,
                'predictions_sato_count': table_predictions_sato_count,
                'predictions_sherlock_count': table_predictions_sherlock_count,
                'table_df': table_df
            }
            continue
        else:
            multi_col_true_types.extend(table_true_types)
            multi_col_predictions_sato.extend(table_predictions_sato)
            multi_col_tables[table_id] = {
                'true_types': table_true_types,
                'predictions_sato': table_predictions_sato,
                'predictions_sherlock': table_predictions_sherlock,
                'predictions_sato_count': table_predictions_sato_count,
                'predictions_sherlock_count': table_predictions_sherlock_count,
                'table_df': table_df
            }

        # Are there any multi-column tables where sherlock gets more correct predictions?
        if table_predictions_sato_count == table_predictions_sherlock_count: continue
        elif table_predictions_sato_count < table_predictions_sherlock_count: 
            multi_col_sherlock_increase[table_id] = {
                'true_types': table_true_types,
                'predictions_sato': table_predictions_sato,
                'predictions_sherlock': table_predictions_sherlock,
                'predictions_sato_count': table_predictions_sato_count,
                'predictions_sherlock_count': table_predictions_sherlock_count,
                'table_df': table_df
            }
        else: 
            sato_increase_count+= 1

        #if sherlock did not get any correct predictions, check if sato predicted at least 70% of the columns correct
        # otherwise check if the match count increased by 90% in sato compared to sherlock
        if (table_predictions_sherlock_count == 0 and table_predictions_sato_count==0): continue
        if ((table_predictions_sato_count/len(table_true_types) > 70/100) or (table_predictions_sherlock_count != 0 and ((table_predictions_sato_count-table_predictions_sherlock_count)/table_predictions_sherlock_count)*100 > 70/100)):
              tables_of_interest[table_id] = {
                'true_types': table_true_types,
                'predictions_sato': table_predictions_sato,
                'predictions_sherlock': table_predictions_sherlock,
                'predictions_sato_count': table_predictions_sato_count,
                'predictions_sherlock_count': table_predictions_sherlock_count,
                'table_df': table_df
              }


0
processed
K1
K2
K0
K3
K4


In [33]:
print(len(multi_col_tables))
print(len(single_col_tables))
print(len(multi_col_tables)+len(single_col_tables))

31158
47575
78733


In [34]:
single_col_true_types = list(map(sherlock_case,single_col_true_types))     
multi_col_true_types = list(map(sherlock_case,multi_col_true_types))

In [35]:
# Sanitiy check
# t = single_col_true_types.copy()
# t.extend(multi_col_true_types)
# p = single_col_predictions_sato.copy()
# p.extend(multi_col_predictions_sato)
# print(classification_report(t, p))

In [36]:
# print(classification_report(single_col_true_types, single_col_predictions_sato))

In [37]:
# print(classification_report(multi_col_true_types, multi_col_predictions_sato))

In [38]:
correct_prediction_vs_table_size = {}
all_tables = {**multi_col_tables, **single_col_tables}
table_sizes = [len(v['true_types']) for k, v in all_tables.items()]
for i in range(min(table_sizes), max(table_sizes)+1):
    correct_prediction_vs_table_size[i] = []
for k, t in all_tables.items():
    correct_prediction_vs_table_size[len(t['true_types'])].append(t['predictions_sato_count']/len(t['true_types']))
for k, v in correct_prediction_vs_table_size.items():
    correct_prediction_vs_table_size[k] = sum(correct_prediction_vs_table_size[k])/len(correct_prediction_vs_table_size[k])
    
print(correct_prediction_vs_table_size)
df_dict = {'col_amount': list(correct_prediction_vs_table_size.keys()), 'corr_pred_freq': list(correct_prediction_vs_table_size.values())}
correct_prediction_vs_table_size_df = pd.DataFrame(df_dict)
alt.Chart(correct_prediction_vs_table_size_df.reset_index()).mark_bar(size=40).encode(
    x = alt.X('col_amount',
              type='ordinal',
              title = 'Amount of Columns',
#               sort=alt.EncodingSortField(
#                 field="values",  
#                 order="descending"),
              ),
    y = alt.Y('corr_pred_freq', title='Correct prediction frequency'),
).configure_axis(
    labelFontSize=18,
    titleFontSize=18
).configure_axisX(
    labelAngle=0
).properties(width=500,height=200)

{1: 0.9238465580662113, 2: 0.9538664876431838, 3: 0.9571766731224428, 4: 0.95206002034588, 5: 0.9918194640338506, 6: 0.888888888888889}


In [39]:
freq_dict = dict(Counter(table_sizes))
print(freq_dict)
df_dict = {'col_amount': list(freq_dict.keys()), 'freq': list(freq_dict.values())}
col_amount_freq_df = pd.DataFrame(df_dict)
alt.Chart(col_amount_freq_df.reset_index()).mark_bar(size=40).encode(
    x = alt.X('col_amount',
              type='ordinal',
              title = 'Amount of Columns',
#               sort=alt.EncodingSortField(
#                 field="values",  
#                 order="descending"),
              ),
    y = alt.Y('freq', title='Number of Tables'),
).configure_axis(
    labelFontSize=18,
    titleFontSize=18
).configure_axisX(
    labelAngle=0
).properties(width=500,height=200)

{2: 23833, 3: 4647, 4: 1966, 5: 709, 6: 3, 1: 47575}


In [40]:
single_col_report = classification_report(single_col_true_types, single_col_predictions_sato, output_dict=True)
single_col_report_input = {k: list(v.values()) for k, v in list(single_col_report.items())[:-3]} #last 3 are total f1/macro/weigthed, these are not needed
single_col_report_df = pd.DataFrame.from_dict(single_col_report_input, orient='index', columns=['precision', 'recall', 'f1-score', 'support']).sort_values(by='f1-score', ascending = False)

multi_col_report = classification_report(multi_col_true_types, multi_col_predictions_sato, output_dict=True)
multi_col_report_input = {k: list(v.values()) for k, v in list(multi_col_report.items())[:-3]} #last 3 are total f1/macro/weigthed, these are not needed
multi_col_report_df = pd.DataFrame.from_dict(multi_col_report_input, orient='index', columns=['precision', 'recall', 'f1-score', 'support']).sort_values(by='f1-score', ascending = False)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
# Correlation between support in Sato dataset and F1-score on single-column tables
corr_types = set(true_types)
corr_data = pd.DataFrame(columns=['support'], index=list(corr_types))
occur_df = pd.DataFrame(true_types, columns=['true_types'])
corr_data['support'] = occur_df['true_types'].value_counts()

corr_data = corr_data.loc[corr_data['support'] <= 100]

# !change single_col_report_df[['f1-score']] to multi_col_report_df[['f1-score']] for multi-column tables!
corr_data = corr_data.join(multi_col_report_df[['f1-score']]) #['f1-score'] = report_sherlock_df
corr_data.reset_index(inplace=True)
alt.Chart(corr_data).mark_circle(size=60).encode(
    x='support',
    y='f1-score',
).configure_axis(
    labelFontSize=18,
    titleFontSize=18
)

In [42]:
print(corr_data['f1-score'].mean())

0.7211157388477227


In [43]:
# score improvement single-col vs. multi-col
score_improvement_df = pd.DataFrame()
#precision, recall and f1-score
score_improvement_df["precision"] = multi_col_report_df["precision"] - single_col_report_df["precision"]
score_improvement_df["recall"] = multi_col_report_df["recall"] - single_col_report_df["recall"]
score_improvement_df["f1-score"] = multi_col_report_df["f1-score"] - single_col_report_df["f1-score"]
score_improvement_df["support"] = multi_col_report_df["support"]
score_improvement_df.index.name = 'type'

In [44]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3
print(sorted(intersection(list(multi_col_report_df.index),list(single_col_report_df.index))))
print(score_improvement_df.sort_values('f1-score', ascending=False).head(30))

['address', 'affiliate', 'affiliation', 'age', 'album', 'area', 'artist', 'birthdate', 'brand', 'capacity', 'category', 'city', 'class', 'classification', 'club', 'code', 'collection', 'command', 'company', 'component', 'continent', 'country', 'county', 'creator', 'credit', 'currency', 'day', 'depth', 'description', 'director', 'duration', 'education', 'elevation', 'family', 'filesize', 'format', 'gender', 'genre', 'grades', 'industry', 'isbn', 'jockey', 'language', 'location', 'manufacturer', 'name', 'nationality', 'notes', 'operator', 'order', 'origin', 'owner', 'person', 'plays', 'position', 'product', 'publisher', 'range', 'rank', 'ranking', 'region', 'religion', 'requirement', 'result', 'sales', 'service', 'sex', 'species', 'state', 'status', 'symbol', 'team', 'teamname', 'type', 'weight', 'year']
             precision    recall  f1-score  support
type                                               
continent     1.000000  0.619048  0.764706     21.0
teamname      0.623585  0.5122

In [45]:
print(len(tables_of_interest))

7499
