In [1]:
from datetime import datetime
from os.path import join
from os import listdir
import json
import re #for camel case conversion
from collections import Counter

from sklearn.metrics import classification_report
import pandas as pd

import altair as alt
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [2]:
def sherlock_case(s):
    """Normalise a semantic-type label to a single lower-case token.

    Runs of underscores and/or hyphens are first turned into a single space,
    the text is title-cased, the spaces are removed and the result is finally
    lower-cased, e.g. ``"Team-Name"`` -> ``"teamname"``.

    Args:
        s: The label to normalise (a ``str``).

    Returns:
        The normalised, lower-case label without spaces, underscores or hyphens.
    """
    spaced = re.sub(r"(_|-)+", " ", s)
    squashed = spaced.title().replace(" ", "")
    return squashed.lower()

In [3]:
def _read_labels(parquet_path):
    """Read a parquet file, flatten its values and normalise each one with ``sherlock_case``."""
    raw_values = pd.read_parquet(parquet_path).values.flatten()
    return [sherlock_case(value) for value in raw_values]


# Ground-truth labels and the two models' predictions for the validation set.
true_types = _read_labels("../../results/true_types/sherlock_validation.parquet")
prediction_sherlock = _read_labels("../../results/predictions/sherlock_sherlock_validation.parquet")
prediction_sato = _read_labels("../../results/predictions/sato_sherlock_validation_137353.parquet")

In [4]:
# Number of validation samples per ground-truth type: one 'count' column,
# indexed by 'type'.
type_counts = pd.DataFrame(true_types, columns=['type'])['type'].value_counts()
type_freq_df = type_counts.to_frame()
type_freq_df.columns = ['count']
type_freq_df.index.name = 'type'

In [5]:
# Bar chart of the number of samples per semantic type, with the types
# ordered by descending count.
type_axis = alt.X(
    'type:O',
    title='Semantic Types',
    sort=alt.EncodingSortField(field="count", order="descending"),
)
sample_axis = alt.Y('count', title='Number of Samples')
alt.Chart(type_freq_df.reset_index()).mark_bar(size=15).encode(x=type_axis, y=sample_axis)

In [6]:
# Sanity check: print the length of each label list, then the first five
# entries of each.
for labels in (prediction_sherlock, prediction_sato, true_types):
    print(len(labels))
for labels in (true_types, prediction_sherlock, prediction_sato):
    print(labels[:5])

137353
137353
137353
['county', 'collection', 'age', 'jockey', 'album']
['county', 'collection', 'age', 'jockey', 'album']
['county', 'location', 'region', 'jockey', 'album']


In [7]:
# # print(prediction_sato)
# print(prediction_sherlock)
# print(true_types)
# for idx, i in enumerate(true_types):
#     if type(i) != str:
#         print(idx)
# print(true_types[407])
# print(prediction_sherlock[407])

In [8]:
# print(classification_report(true_types, prediction_sato))

In [9]:
# Classification report for the Sato predictions, stored as a CSV file
# in the current working directory.
class_report = dict(classification_report(true_types, prediction_sato, output_dict=True))
class_report_df = pd.DataFrame(class_report)
class_report_df.to_csv('csv_report_sato_sherlock.csv')

In [10]:
# print(classification_report(true_types, prediction_sherlock))

In [11]:
# Classification report for the Sherlock predictions, stored as a CSV file
# in the current working directory.
class_report = dict(classification_report(true_types, prediction_sherlock, output_dict=True))
class_report_df = pd.DataFrame(class_report)
class_report_df.to_csv('csv_report_sherlock_sherlock.csv')

In [12]:
def _per_class_report_frame(report):
    """Build a per-class metrics frame from a ``classification_report`` dict.

    Args:
        report: The dict returned by
            ``classification_report(..., output_dict=True)``.

    Returns:
        A DataFrame indexed by class label with the columns ``precision``,
        ``recall``, ``f1-score`` and ``support``, sorted by ``f1-score`` in
        descending order. The overall summary entries are left out.
    """
    metric_columns = ['precision', 'recall', 'f1-score', 'support']
    # Drop the summary entries by what they are instead of by position:
    # the overall accuracy is a bare number rather than a metrics dict, and
    # the averaged rows end in " avg". Class labels produced by
    # sherlock_case never contain a space, so they cannot collide.
    rows = {
        label: [metrics[column] for column in metric_columns]
        for label, metrics in report.items()
        if isinstance(metrics, dict) and not label.endswith(' avg')
    }
    frame = pd.DataFrame.from_dict(rows, orient='index', columns=metric_columns)
    return frame.sort_values(by='f1-score', ascending=False)


report_sherlock = classification_report(true_types, prediction_sherlock, output_dict=True)
report_sherlock_df = _per_class_report_frame(report_sherlock)

report_sato = classification_report(true_types, prediction_sato, output_dict=True)
report_sato_df = _per_class_report_frame(report_sato)


In [13]:
# Join both per-class reports on the type label; the suffixes mark which
# model each metric column belongs to.
combined_report_df = pd.merge(
    report_sato_df,
    report_sherlock_df,
    left_index=True,
    right_index=True,
    suffixes=('_sato', '_sherlock'),
).rename_axis('type')
# Optional: sort by 'f1-score_sato' instead, or comment the next line out to skip sorting.
combined_report_df = combined_report_df.sort_values(by='f1-score_sherlock', ascending=False)

In [31]:
# Plain-text dump of the merged per-type metrics (Sato columns followed by the Sherlock columns).
print(combined_report_df)

           precision_sato  recall_sato  f1-score_sato  support_sato  \
type                                                                  
isbn             0.998521     0.937500       0.967049          1440   
grades           0.993321     0.939690       0.965762          1741   
jockey           0.845352     0.690806       0.760305          2817   
industry         0.926480     0.925276       0.925878          3078   
birthdate        0.941019     0.774834       0.849879           453   
...                   ...          ...            ...           ...   
rank             0.398416     0.748226       0.519962          2959   
person           0.689655     0.036166       0.068729           553   
director         0.117647     0.024390       0.040404           246   
sales            0.166667     0.003390       0.006645           295   
ranking          0.325581     0.031461       0.057377           445   

           precision_sherlock  recall_sherlock  f1-score_sherlock  \
type   

In [15]:
def _collect_mismatches(expected, predicted):
    """Find the positions where a prediction differs from the expected label.

    Args:
        expected: Sequence of ground-truth labels.
        predicted: Sequence of predicted labels, aligned with ``expected``.

    Returns:
        A ``(labels, indices)`` tuple: the expected label of every wrong
        prediction and the position at which it occurred, in input order.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    # Fail loudly instead of silently ignoring surplus predictions or
    # running into an IndexError half-way through.
    if len(expected) != len(predicted):
        raise ValueError(
            f"expected {len(expected)} predictions but got {len(predicted)}"
        )
    indices = [
        position
        for position, (truth, guess) in enumerate(zip(expected, predicted))
        if truth != guess
    ]
    return [expected[position] for position in indices], indices


mismatches_sherlock, mismatches_sherlock_idx = _collect_mismatches(true_types, prediction_sherlock)
mismatches_sato, mismatches_sato_idx = _collect_mismatches(true_types, prediction_sato)

# Ten true types that each model gets wrong most often.
mismatch_sherlock_class_count = Counter(mismatches_sherlock)
print(mismatch_sherlock_class_count.most_common(10))

mismatches_sato_class_count = Counter(mismatches_sato)
print(mismatches_sato_class_count.most_common(10))

[('name', 734), ('rank', 650), ('location', 560), ('position', 507), ('region', 505), ('artist', 456), ('description', 414), ('type', 389), ('team', 386), ('product', 376)]
[('club', 2322), ('product', 1556), ('owner', 1470), ('teamname', 1422), ('position', 1361), ('region', 1337), ('notes', 1325), ('company', 1208), ('county', 1193), ('class', 1188)]


In [16]:
def _label_frequency_frame(labels):
    """Count how often each label occurs.

    Args:
        labels: List of type labels.

    Returns:
        A DataFrame with a single ``count`` column, indexed by ``type``, as
        produced by ``value_counts`` on the labels.
    """
    counts = pd.DataFrame(labels, columns=['type'])['type'].value_counts()
    frequency_df = counts.to_frame()
    frequency_df.columns = ['count']
    frequency_df.index.name = 'type'
    return frequency_df


# Per-type counts of wrong predictions for each model, plus the per-type
# sample counts of the ground truth.
mismatch_sherlock_freq_df = _label_frequency_frame(mismatches_sherlock)
mismatch_sato_freq_df = _label_frequency_frame(mismatches_sato)
type_freq_df = _label_frequency_frame(true_types)

In [17]:
# Outer-join the three frequency frames on their index so that a type
# missing from one of them is still kept in the combined table.
outer_on_index = dict(left_index=True, right_index=True, how='outer')
combined_mismatch_freq_df = (
    mismatch_sherlock_freq_df
    .merge(mismatch_sato_freq_df, **outer_on_index)
    .merge(type_freq_df, **outer_on_index)
)
combined_mismatch_freq_df.columns = ['sherlock_mismatch_freq', 'sato_mismatch_freq', 'true_type_freq']

In [18]:
# Grouped bars per type: mismatch counts of both models next to the number
# of true samples, with the types ordered by Sato's mismatch count.
mismatch_series = ['sherlock_mismatch_freq', 'sato_mismatch_freq', 'true_type_freq']
mismatch_bars = (
    alt.Chart(combined_mismatch_freq_df.reset_index())
    .transform_fold(mismatch_series, as_=['column', 'value'])
    .mark_bar(size=15)
)
mismatch_bars.encode(
    column=alt.Column(
        'type:O',
        sort=alt.EncodingSortField(field="sato_mismatch_freq", order="descending"),
    ),
    x=alt.X('column:N', title=''),
    y=alt.Y('value:Q'),
    color='column:N',
)

In [19]:
# print(combined_mismatch_freq_df)

In [20]:
# Grouped bars per type: precision, recall and F1 of both models, with the
# types ordered by the Sherlock support column.
metric_series = [
    'precision_sato', 'recall_sato', 'f1-score_sato',
    'precision_sherlock', 'recall_sherlock', 'f1-score_sherlock',
]
metric_bars = (
    alt.Chart(combined_report_df.reset_index())
    .transform_fold(metric_series, as_=['column', 'value'])
    .mark_bar(size=15)
)
metric_bars.encode(
    column=alt.Column(
        'type:O',
        sort=alt.EncodingSortField(field="support_sherlock", order="descending"),
    ),
    x=alt.X('column:N', title=''),
    y=alt.Y('value:Q'),
    color='column:N',
)

In [25]:
# Rank the types that actually occur (support > 0) by F1 score, best first,
# then keep the five best and the five worst for each model.
sato_ranked = report_sato_df[report_sato_df['support'] > 0].sort_values(by=['f1-score'], ascending=False)
sherlock_ranked = report_sherlock_df[report_sherlock_df['support'] > 0].sort_values(by=['f1-score'], ascending=False)

sato_top_5_df = sato_ranked.head(5).rename_axis('type')
sato_bottom_5_df = sato_ranked.tail(5).rename_axis('type')
sherlock_top_5_df = sherlock_ranked.head(5).rename_axis('type')
sherlock_bottom_5_df = sherlock_ranked.tail(5).rename_axis('type')

In [26]:
# Print the best and worst five types of Sato, then those of Sherlock.
for ranked_frame in (sato_top_5_df, sato_bottom_5_df, sherlock_top_5_df, sherlock_bottom_5_df):
    print(ranked_frame)

          precision    recall  f1-score  support
type                                            
isbn       0.998521  0.937500  0.967049     1440
grades     0.993321  0.939690  0.965762     1741
industry   0.926480  0.925276  0.925878     3078
format     0.960454  0.831807  0.891514     2949
result     0.866119  0.894651  0.880154     3066
             precision    recall  f1-score  support
type                                               
brand         0.148148  0.044037  0.067893      545
ranking       0.325581  0.031461  0.057377      445
nationality   0.520000  0.026804  0.050980      485
director      0.117647  0.024390  0.040404      246
sales         0.166667  0.003390  0.006645      295
           precision    recall  f1-score  support
type                                             
isbn        0.992350  0.990972  0.991661     1440
grades      0.992490  0.986789  0.989631     1741
jockey      0.986229  0.991480  0.988848     2817
industry    0.987533  0.977908  0.982697   

In [64]:
# Altair chart of precision, recall and F1 for Sato's five best types.
sato_top_5_bars = (
    alt.Chart(sato_top_5_df.reset_index())
    .transform_fold(['precision', 'recall', 'f1-score'], as_=['column', 'value'])
    .mark_bar(size=15)
    .encode(
        column=alt.Column(
            'type:O',
            sort=alt.EncodingSortField(field="f1-score", order="descending"),
        ),
        x=alt.X('column:N', title=''),
        y=alt.Y('value:Q'),
        color='column:N',
    )
)
# Use the same enlarged font size for axes, facet headers and the legend.
(
    sato_top_5_bars
    .configure_axis(labelFontSize=18, titleFontSize=18)
    .configure_header(titleFontSize=18, labelFontSize=18)
    .configure_legend(labelFontSize=18)
)

In [65]:
# Altair chart of precision, recall and F1 for Sherlock's five best types.
sherlock_top_5_bars = (
    alt.Chart(sherlock_top_5_df.reset_index())
    .transform_fold(['precision', 'recall', 'f1-score'], as_=['column', 'value'])
    .mark_bar(size=15)
    .encode(
        column=alt.Column(
            'type:O',
            sort=alt.EncodingSortField(field="f1-score", order="descending"),
        ),
        x=alt.X('column:N', title=''),
        y=alt.Y('value:Q'),
        color='column:N',
    )
)
# Use the same enlarged font size for axes, facet headers and the legend.
(
    sherlock_top_5_bars
    .configure_axis(labelFontSize=18, titleFontSize=18)
    .configure_header(titleFontSize=18, labelFontSize=18)
    .configure_legend(labelFontSize=18)
)

In [66]:
# Altair chart of precision, recall and F1 for Sato's five worst types.
sato_bottom_5_bars = (
    alt.Chart(sato_bottom_5_df.reset_index())
    .transform_fold(['precision', 'recall', 'f1-score'], as_=['column', 'value'])
    .mark_bar(size=15)
    .encode(
        column=alt.Column(
            'type:O',
            sort=alt.EncodingSortField(field="f1-score", order="descending"),
        ),
        x=alt.X('column:N', title=''),
        y=alt.Y('value:Q'),
        color='column:N',
    )
)
# Use the same enlarged font size for axes, facet headers and the legend.
(
    sato_bottom_5_bars
    .configure_axis(labelFontSize=18, titleFontSize=18)
    .configure_header(titleFontSize=18, labelFontSize=18)
    .configure_legend(labelFontSize=18)
)

In [67]:
# Altair chart of precision, recall and F1 for Sherlock's five worst types.
sherlock_bottom_5_bars = (
    alt.Chart(sherlock_bottom_5_df.reset_index())
    .transform_fold(['precision', 'recall', 'f1-score'], as_=['column', 'value'])
    .mark_bar(size=15)
    .encode(
        column=alt.Column(
            'type:O',
            sort=alt.EncodingSortField(field="f1-score", order="descending"),
        ),
        x=alt.X('column:N', title=''),
        y=alt.Y('value:Q'),
        color='column:N',
    )
)
# Use the same enlarged font size for axes, facet headers and the legend.
(
    sherlock_bottom_5_bars
    .configure_axis(labelFontSize=18, titleFontSize=18)
    .configure_header(titleFontSize=18, labelFontSize=18)
    .configure_legend(labelFontSize=18)
)

In [1]:
# Correlation between support and f1-score
corr_data = report_sherlock_df[['f1-score', 'support']]
corr_data.reset_index(inplace=True)
print(corr_data)
worst_5 = list(sherlock_bottom_5_df.index.values)
top_5 = list(sherlock_top_5_df.index.values)
alt.Chart(corr_data).mark_circle(size=60).encode(
    x='support',
    y='recall',
    color=alt.condition(
        alt.Predicate(alt.FieldOneOfPredicate(field='index', oneOf=worst_5)),
        alt.value('red'),
        alt.value('steelblue')
    ),
).configure_axis(
    labelFontSize=18,
    titleFontSize=18
)

NameError: name 'report_sherlock_df' is not defined