In [None]:
# Third meeting on sherlock
# Further analysing the GitTables benchmark dataset and comparing it to the Sherlock dataset

In [1]:
# Reload modules before executing code, so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
from os.path import join
from os import listdir
import json
import re #for camel case conversion
from collections import Counter

import numpy as np
import pandas as pd
from pyarrow.parquet import ParquetFile
import sklearn.metrics as skl

from sherlock import helpers
from sherlock.deploy.model import SherlockModel
# from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
#     convert_string_lists_to_lists,
    prepare_feature_extraction,
#     load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

import altair as alt
alt.renderers.enable('default')

RendererRegistry.enable('default')

### Utils

In [3]:
def camel_case(s):
    """Convert a snake_case / kebab-case / space separated string to camelCase.

    Runs of ``_`` and ``-`` are turned into spaces, every word is title-cased,
    the spaces are removed and the first character is lower-cased.

    Parameters:
        s (str): the label to convert.

    Returns:
        str: the camelCase form. An empty string, or a string made only of
        separators, yields ``""``.
    """
    s = re.sub(r"(_|-)+", " ", s).title().replace(" ", "")
    # Slicing (instead of s[0]) keeps an empty result from raising IndexError.
    return s[:1].lower() + s[1:]

In [4]:
def sherlock_case(s):
    """Convert a label to the spelling compared against the Sherlock class names.

    The label is first camel-cased (separators ``_``/``-``/space removed, words
    title-cased, first character lower-cased). Then a space is inserted in
    front of every character that is not lower-case, e.g. ``birth_date`` ->
    ``birth Date``.

    Parameters:
        s (str): the label to convert.

    Returns:
        str: the converted label. An empty string, or a string made only of
        separators, yields ``""``.
    """
    s = re.sub(r"(_|-)+", " ", s).title().replace(" ", "")
    # Slicing (instead of s[0]) keeps an empty result from raising IndexError.
    s = s[:1].lower() + s[1:]
    # NOTE: str.islower() is False for digits and punctuation too, so those
    # characters also get a leading space (unchanged from the original).
    return ''.join(ch if ch.islower() else " " + ch for ch in s)

In [5]:
def pretty_print_classification_report(inp_dict):
    """Print per-class precision/recall/f1/support as a tab-aligned table.

    Parameters:
        inp_dict: either a mapping ``{label: scores}`` or an iterable of
            ``(label, scores)`` pairs (e.g. a sorted list of report items).
            Every ``scores`` must provide the keys ``'precision'``,
            ``'recall'``, ``'f1-score'`` and ``'support'``.

    Returns:
        None. The table is written to stdout.
    """
    # A real dict iterates over its keys only, so unpack its items explicitly;
    # lists of (label, scores) pairs are used as they are.
    entries = inp_dict.items() if hasattr(inp_dict, 'items') else inp_dict

    print("\t\tprecision\trecall\t\tf1-score\tsupport")
    for key, value in entries:
        # Labels of 8+ characters already span one tab stop, so they need one tab less.
        tabs = '\t' if len(key) >= 8 else '\t\t'
        print(f"{key}{tabs}{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['f1-score']:.3f}\t\t{value['support']}")

## Loading the annotation data
##### Make sure to unzip tables.zip in the /data/data/gittables_benchmark directory before proceeding

### Load Gittables

In [6]:
# Root folder of the GitTables benchmark files used throughout the notebook
path = '../data/data/gittables_benchmark'

# Class labels loaded from the Sherlock model files.
# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load this
# file from a trusted source.
# (The commented-out alternative read the 'type78' list from types.json.)
types = np.load("../model_files/classes_sherlock.npy", allow_pickle=True)

# Ground-truth annotation tables, one CSV per ontology
dbpedia_gt, schema_gt = (
    pd.read_csv(join(path, gt_file))
    for gt_file in ('dbpedia_gt.csv', 'schema_gt.csv')
)

### Load sherlock test data

In [7]:
start = datetime.now()
print(f'Started at {start}')

# Sherlock test split: raw column values, processed feature vectors and labels
columns_test_sherlock = pd.read_parquet("../data/data/raw/test_values.parquet")
feature_vectors_test_sherlock = pd.read_parquet('../data/data/processed/test.parquet')
col_true_types_test_sherlock = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()
# Bring the labels into the same spelling that is used for the GitTables labels
col_true_types_test_sherlock = np.array(list(map(sherlock_case, col_true_types_test_sherlock)))

print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2022-03-22 16:30:33.190410
Finished at 2022-03-22 16:30:36.183068, took 0:00:02.992722 seconds


### Combining the gittables dataset columns with their true type

In [8]:
#TODO: columns with null or Nan values? -> median/avg/drop?
columns = []         # one list of string cell values per annotated column
col_true_types = []  # ground-truth label (sherlock_case spelling) per column
col_ids = []         # '<table id suffix><target column>' per column

tables_dir = join(path, 'tables/')
# listdir() order is arbitrary; sorting keeps column indices stable between runs
table_files = sorted(f for f in listdir(tables_dir) if f.endswith('.csv'))
filepaths = [join(tables_dir, f) for f in table_files]

# Set membership instead of scanning the numpy array for every annotation
sherlock_types = set(types)
gt_columns = ['table_id', 'target_column', 'annotation_label']


def _table_ground_truth(gt_df, table_id):
    """Return the annotation rows of ``gt_df`` that belong to ``table_id``.

    The ground-truth ``table_id`` values only have to *contain* the table id
    (the original lookup was a substring match as well), so the match is kept
    as a search. The id is escaped and must not be followed by another
    alphanumeric character; otherwise 'GitTables_1' would also select the
    annotations of 'GitTables_10'.
    NOTE(review): assumes no other table id ends with this table's id - confirm
    against the ground-truth files.

    The returned frame is a fresh copy whose ``table_id`` column is set to the
    plain ``table_id`` so both ground-truth sources merge on identical keys.
    """
    pattern = re.escape(table_id) + r'(?![A-Za-z0-9])'
    matches = gt_df['table_id'].str.contains(pattern, regex=True, na=False)
    return gt_df.loc[matches, gt_columns].assign(table_id=table_id)


#Go over the tables in the dataset
for table_file, fp in zip(table_files, filepaths):
    table_id = table_file[:-len('.csv')]
    table_df = pd.read_csv(fp)
    table_gts = pd.merge(
        _table_ground_truth(dbpedia_gt, table_id),
        _table_ground_truth(schema_gt, table_id),
        how='outer',
    )

    for gt_row in table_gts.itertuples(index=False):
        # A missing annotation label cannot be mapped to a type
        if not isinstance(gt_row.annotation_label, str):
            continue
        col_type = sherlock_case(gt_row.annotation_label)
        # We only want types that are used in sherlock
        if col_type not in sherlock_types:
            continue
        col_name = 'col' + str(gt_row.target_column)
        # NOTE(review): plain concatenation can collide (table 15 / column 1 vs
        # table 1 / column 51); kept as is because the id format may be relied upon.
        col_id = table_id[table_id.rfind('_')+1:] + str(gt_row.target_column)
        col = table_df[col_name].convert_dtypes()

        # count() returns the number of valid (non-null) values; a column without
        # any valid value (including a column of a table without rows) gives the
        # model nothing to predict from, so it is skipped.
        if col.count() == 0:
            continue

        # columns need to be converted to list of strings for preprocessing later on
        columns.append(list(map(str, col.astype(object).fillna('').to_list())))
        col_true_types.append(col_type)
        col_ids.append(col_id)

## Feature extraction

### Init feature extraction models

In [9]:
# Make sure the files for word and paragraph embeddings are present
# (downloads them when missing, see the output below).
prepare_feature_extraction()
# Load the word embeddings
initialise_word_embeddings()
# Load the pretrained Doc2Vec paragraph-vector model (400 dimensions)
initialise_pretrained_model(400)
# Set up NLTK (fetches the punkt and stopwords packages if needed)
initialise_nltk()

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:03.885127 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:02.961959 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.132264 seconds.


[nltk_data] Downloading package punkt to /home/senn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/senn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# One entry per GitTables column; each entry is that column's list of string values
data = pd.Series(list(columns), name="values")

In [11]:
# data.head(5)

In [12]:
# extract_features writes the feature vectors to a CSV file, which is read back as float32
features_csv_path = join(path, "processed/temporary.csv")
extract_features(features_csv_path, data)
feature_vectors = pd.read_csv(features_csv_path, dtype=np.float32)

Extracting Features:   1%|▉                                                                                                  | 8/813 [00:00<00:13, 58.02it/s]

Exporting 1588 column features


Extracting Features: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 813/813 [00:03<00:00, 206.00it/s]


In [13]:
# feature_vectors.head()

### Initialize Sherlock

In [14]:
# Create the Sherlock model and initialise it (including weights) from the
# stored JSON definition with id "sherlock".
# The trailing semicolons suppress the cell's output in the notebook.
model = SherlockModel();
model.initialize_model_from_json(with_weights=True, model_id="sherlock");

W0322 16:30:55.352770 140268770076480 deprecation.py:506] From /home/senn/virtualenvs/sherlock/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0322 16:30:55.354014 140268770076480 deprecation.py:506] From /home/senn/virtualenvs/sherlock/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0322 16:30:55.356514 140268770076480 deprecation.py:506] From /home/senn/virtualenvs/sherlock/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Varia

In [15]:
# Predict a semantic type for every GitTables column and, as a reference,
# for every column of the Sherlock test split.
predicted_labels_gittables = model.predict(feature_vectors, "sherlock")
predicted_labels_test_sherlock = model.predict(feature_vectors_test_sherlock, "sherlock")

In [16]:
# predicted_labels_gittables

### Looking at the data

In [17]:
# Number of GitTables columns per ground-truth type:
# index 'type', single column 'count', most frequent type first
type_freq_df = (
    pd.Series(col_true_types, name='type')
    .value_counts()
    .rename('count')
    .rename_axis('type')
    .to_frame()
)

In [18]:
# Bar chart of the number of GitTables columns per ground-truth type,
# with the types ordered by descending count.
alt.Chart(type_freq_df.reset_index()).mark_bar(size=15).encode(
    x = alt.X('type:O',
              title = 'Semantic Types',
              sort=alt.EncodingSortField(
                field="count",  
                order="descending")),
    y = alt.Y('count', title='Number of Samples')    
)

## Analysing results

In [19]:
# Number of GitTables columns that received a prediction
prediction_size = len(predicted_labels_gittables)
print(f'prediction count {prediction_size}')

# Support-weighted F1: first on GitTables, then on the Sherlock test split
f1_skl, f1_skl_sherlock = (
    skl.f1_score(true_labels, predicted_labels, average="weighted")
    for true_labels, predicted_labels in (
        (col_true_types, predicted_labels_gittables),
        (col_true_types_test_sherlock, predicted_labels_test_sherlock),
    )
)
print(f1_skl, f1_skl_sherlock, sep='\n')

prediction count 813
0.3559554669926897
0.8951410029373902


### All Scores

In [20]:
# precision -> true positives / selected elements
# recall -> true positives / relevant elements
predicted_labels_gittables_set = set(predicted_labels_gittables)
col_true_types_set = set(col_true_types)

# Predicted at least once, but never the true type of a column (support == 0)
predicted_labels_no_support = predicted_labels_gittables_set - col_true_types_set
# True type of at least one column, but never predicted
non_predicted_labels = col_true_types_set - predicted_labels_gittables_set

print(f'Not supported: {predicted_labels_no_support}\n')
print(f'Not predicted: {non_predicted_labels}\n')
# print(col_true_types_test_sherlock_set)
# filtered = list(filter(lambda x: x == 'album', col_true_types))
# print(len(filtered))
# print(skl.classification_report(col_true_types, predicted_labels_gittables, digits=5))

Not supported: {'artist', 'sex', 'creator', 'area', 'position', 'service', 'organisation', 'requirement', 'birth Date', 'result', 'album', 'plays', 'day', 'format', 'person', 'sales', 'command', 'club', 'symbol'}

Not predicted: {'depth', 'family', 'collection', 'capacity', 'county', 'duration'}



In [21]:
model_id = "sherlock"

# Entries classification_report(output_dict=True) adds next to the per-class
# scores. Filtering by key replaces the positional "[:-3]" slice.
summary_keys = {'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'}
score_columns = ['precision', 'recall', 'f1-score', 'support']
# Set membership instead of scanning the numpy array for every report entry
known_types = set(types)


def _per_class_rows(report_dict):
    """Map each class label of a report dict to [precision, recall, f1-score, support]."""
    return {
        label: [scores[column] for column in score_columns]
        for label, scores in report_dict.items()
        if label not in summary_keys and isinstance(scores, dict)
    }


def _rows_to_frame(rows):
    """Build a per-class score frame (index = label) sorted by descending f1-score."""
    frame = pd.DataFrame.from_dict(rows, orient='index', columns=score_columns)
    return frame.sort_values(by='f1-score', ascending=False)


def _class_scores(report_dict, require_support=False):
    """Return the (label, scores) pairs of all Sherlock types found in a report dict.

    With require_support=True, classes without any true sample (support == 0)
    are left out.
    """
    return [
        (label, scores)
        for label, scores in report_dict.items()
        if isinstance(scores, dict)
        and 'f1-score' in scores
        and label in known_types
        and (not require_support or scores['support'] != 0)
    ]


# Reference: Sherlock on its own test split (zero_division=0 as for the GitTables report)
report_sherlock = skl.classification_report(
    col_true_types_test_sherlock, predicted_labels_test_sherlock, output_dict=True, zero_division=0
)
report_sherlock_df_input = _per_class_rows(report_sherlock)
report_sherlock_df = _rows_to_frame(report_sherlock_df_input)
class_scores_sherlock = _class_scores(report_sherlock)

# Sherlock on the GitTables columns
report = skl.classification_report(
    col_true_types, predicted_labels_gittables, output_dict=True, zero_division=0
)
report_df_input = _per_class_rows(report)
report_df = _rows_to_frame(report_df_input)

#first sort by f1-score, then by support
class_scores_all = sorted(
    _class_scores(report),
    key=lambda item: (item[1]['f1-score'], item[1]['support']),
    reverse=True,
)
#sort only by f1; only classes that have at least one true sample
class_scores_supported = sorted(
    _class_scores(report, require_support=True),
    key=lambda item: item[1]['f1-score'],
    reverse=True,
)

# Side by side per type: *_x columns are GitTables, *_y columns are the Sherlock test split
combined_report_df = pd.merge(report_df, report_sherlock_df, left_index=True, right_index=True)
combined_report_df.index.names = ['type']

In [22]:
pretty_print_classification_report(class_scores_supported) #class_scores_all 
# address -> low precision / high recall: the single real address column is predicted
# correctly, but many columns with a different ground-truth type are predicted as address too.
# name, species -> name has a high precision and a low recall. Later in the notebook the
# columns with ground truth name are compared with their predicted type: many of them are
# predicted as species, which seems to be correct and points to an incorrect ground truth.

		precision	recall		f1-score	support
year		1.000		0.912		0.954		102
city		0.833		1.000		0.909		5
country		0.750		0.750		0.750		4
type		0.918		0.618		0.739		144
status		0.588		0.667		0.625		15
language	0.667		0.500		0.571		4
company		0.333		1.000		0.500		2
classification	0.250		1.000		0.400		1
description	0.414		0.343		0.375		35
code		0.800		0.235		0.364		17
weight		0.167		0.667		0.267		3
name		0.800		0.076		0.139		210
species		0.072		0.519		0.127		27
class		0.235		0.062		0.099		64
notes		0.067		0.167		0.095		6
category	0.050		0.200		0.080		5
rank		0.167		0.030		0.051		100
address		0.007		1.000		0.014		1
age		0.000		0.000		0.000		1
capacity	0.000		0.000		0.000		1
collection	0.000		0.000		0.000		1
component	0.000		0.000		0.000		3
county		0.000		0.000		0.000		1
depth		0.000		0.000		0.000		13
duration	0.000		0.000		0.000		11
family		0.000		0.000		0.000		2
gender		0.000		0.000		0.000		1
location	0.000		0.000		0.000		2
manufacturer	0.000		0.000		0.000		2
operator	0.000		0.000		0.000		1
order

In [23]:
combined_report_df

Unnamed: 0_level_0,precision_x,recall_x,f1-score_x,support_x,precision_y,recall_y,f1-score_y,support_y
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
year,1.0,0.911765,0.953846,102,0.964505,0.937313,0.950715,3015
city,0.833333,1.0,0.909091,5,0.86368,0.903574,0.883177,2966
country,0.75,0.75,0.75,4,0.892393,0.949967,0.920281,3038
type,0.917526,0.618056,0.738589,144,0.916487,0.875215,0.895375,2909
status,0.588235,0.666667,0.625,15,0.942839,0.936452,0.939634,3100
language,0.666667,0.5,0.571429,4,0.938544,0.953189,0.945809,1474
company,0.333333,1.0,0.5,2,0.911576,0.888195,0.899734,3041
classification,0.25,1.0,0.4,1,0.92674,0.86201,0.893204,587
description,0.413793,0.342857,0.375,35,0.803528,0.868508,0.834755,3042
code,0.8,0.235294,0.363636,17,0.916297,0.907307,0.91178,2956


In [24]:
# combined_report_df = pd.merge(report_df, report_sherlock_df, left_index=True, right_index=True, how='outer')
# Turn the absolute support counts into fractions of the respective dataset.
# The fractions are derived from the untouched per-dataset report frames and
# not from combined_report_df itself, so re-running this cell does not divide
# the values a second time. The assignment aligns on the type labels in the index.
n_gittables_columns = len(col_true_types)
n_sherlock_test_columns = len(col_true_types_test_sherlock)
combined_report_df['support_x'] = report_df['support'] / n_gittables_columns
combined_report_df['support_y'] = report_sherlock_df['support'] / n_sherlock_test_columns

In [25]:
# Grouped bar chart, one group per type: support_x (GitTables) next to
# support_y (Sherlock test split), groups ordered by descending support_x.
alt.Chart(combined_report_df.reset_index()).transform_fold(
      ['support_x', 'support_y'],
      as_=['column', 'value']
    ).mark_bar(size=15).encode(
    column=alt.Column('type:O', sort=alt.EncodingSortField(
                field="support_x",  
                order="descending")),
    x = alt.X('column:N',
              title = '',
              ),
    y=alt.Y('value:Q'),
    color='column:N'
)

### Verifying f1-scores

In [26]:
# Macro F1: plain mean of the per-class f1-scores
f1_macro_all = sum(scores['f1-score'] for _, scores in class_scores_all) / len(class_scores_all)
f1_macro = sum(scores['f1-score'] for _, scores in class_scores_supported) / len(class_scores_supported)

# Weighted F1: per-class f1-scores weighted by the class support
f1_weighted_all = (
    sum(scores['f1-score'] * scores['support'] for _, scores in class_scores_all)
    / sum(scores['support'] for _, scores in class_scores_all)
)
f1_weighted = (
    sum(scores['f1-score'] * scores['support'] for _, scores in class_scores_supported)
    / sum(scores['support'] for _, scores in class_scores_supported)
)

# The same two averages as computed by sklearn, for comparison
f1_macro_skl = skl.f1_score(col_true_types, predicted_labels_gittables, average='macro')
f1_weighted_skl = skl.f1_score(col_true_types, predicted_labels_gittables, average='weighted')

f1_dict = {
    'all': {'macro': f1_macro_all, 'weighted': f1_weighted_all},
    'supported': {'macro': f1_macro, 'weighted': f1_weighted},
    'sklearn': {'macro': f1_macro_skl, 'weighted': f1_weighted_skl},
}

print("\t\tmacro\t\tweighted")
for k, v in f1_dict.items():
    # Row names of 8+ characters need one tab less to stay aligned
    tabs = '\t' if len(k) >= 8 else '\t\t'
    print(f"{k}{tabs}{v['macro']:.5f}\t\t{v['weighted']:.5f}")

# print(f'macro f1 all: {f1_macro_all}')
# print(f'weighted f1 all: {f1_weighted_all}')
# print(f'macro f1 types w/ support>0: {f1_macro}')
# print(f'weighted f1 types w/ support>0: {f1_weighted}')
# print(f'macro f1 sklearn: {f1_macro_skl}')
# print(f'weighted f1 sklearn: {f1_weighted_skl}')

		macro		weighted
all		0.12604		0.35596
supported	0.19076		0.35596
sklearn		0.12604		0.35596


### Top 5 types

In [27]:
# The list is sorted by descending f1-score, so the first five entries are the best types
pretty_print_classification_report(class_scores_supported[:5])

		precision	recall		f1-score	support
year		1.000		0.912		0.954		102
city		0.833		1.000		0.909		5
country		0.750		0.750		0.750		4
type		0.918		0.618		0.739		144
status		0.588		0.667		0.625		15


### Bottom 5 types

In [28]:
# The list is sorted by descending f1-score, so the last five entries are the worst types.
# A negative slice also behaves when fewer than five entries exist; the previous
# "len - 5" start index turned negative in that case and dropped entries.
pretty_print_classification_report(class_scores_supported[-5:])

		precision	recall		f1-score	support
product		0.000		0.000		0.000		3
range		0.000		0.000		0.000		1
region		0.000		0.000		0.000		1
state		0.000		0.000		0.000		20
team		0.000		0.000		0.000		2


### Review faulty predictions

In [29]:
mismatches = list()          # true type of every wrongly predicted column
mismatches_col_idx = list()  # position of that column in columns / data
print_count = 0
for idx, (true_type, predicted_type) in enumerate(zip(col_true_types, predicted_labels_gittables)):
    if true_type == predicted_type:
        continue

    mismatches.append(true_type)
    mismatches_col_idx.append(idx)

    # zoom in to specific errors
    # ('state',) is a real one-element tuple; the former ('state') was a plain
    # string, which made "in" a substring test.
    # Note: "<= 6" lets up to seven examples through.
    if true_type in ('state',) and print_count <= 6:
        print_count += 1
        print(f'Expected "{true_type}" but predicted "{predicted_type}"')
        print(f'{data[idx]}\n')

print(f'Total mismatches: {len(mismatches)}')

# The ten true types that are mispredicted most often
mismatch_class_count = Counter(mismatches)
mismatch_class_count.most_common(10)
# Name, rank: should the true types be cleaned first?
# rank, state -> many recurring wrong column predictions; can these be explained by the column embedding?
# Class, type: as mentioned in the GitTables paper, mismatches can be due to the content of web tables vs. database-like tables.
# The difference in the kind of data contained in the GitTables dataset and in the web tables could explain the faulty predictions
# we see for state, class and type -> the values that represent a state in GitTables clearly differ from those in the web tables.

Expected "state" but predicted "category"
['Static', 'Static', 'Static', 'Static', 'Static', 'Static', 'Dynamic', 'Static', 'Dynamic', 'Dynamic']

Expected "state" but predicted "category"
['Static', 'Static', 'Static', 'Static', 'Static', 'Static', 'Static', 'Dynamic', 'Dynamic']

Expected "state" but predicted "category"
['Static', 'Static', 'Static', 'Static', 'Dynamic', 'Static', 'Static', 'Static', 'Static', 'Static', 'Dynamic']

Expected "state" but predicted "category"
['Static', 'Static', 'Static', 'Static', 'Dynamic', 'Dynamic', 'Static', 'Static', 'Static', 'Dynamic']

Expected "state" but predicted "type"
['Static', 'Static', 'Static', 'Static', 'Dynamic', 'Static', 'Static', 'Static', 'Static', 'Static', 'Static', 'Static', 'Static', 'Static', 'Static', 'Static', 'Static', 'Dynamic', 'Static', 'Dynamic']

Expected "state" but predicted "category"
['Static', 'Static', 'Static', 'Static', 'Static', 'Dynamic', 'Static', 'Dynamic']

Expected "state" but predicted "plays"
['9', 

[('name', 194),
 ('rank', 97),
 ('class', 60),
 ('type', 55),
 ('description', 23),
 ('state', 20),
 ('code', 13),
 ('species', 13),
 ('depth', 13),
 ('duration', 11)]

In [30]:
# Number of wrongly predicted columns per true type:
# index 'type', single column 'count', most frequent type first
mismatch_freq_df = (
    pd.Series(mismatches, name='type')
    .value_counts()
    .rename('count')
    .rename_axis('type')
    .to_frame()
)

# alt.Chart(mismatch_freq_df.reset_index()).mark_bar(size=15).encode(
#     x = alt.X('type:O',
#               title = 'Semantic Types',
#               sort=alt.EncodingSortField(
#                 field="count",  
#                 order="descending")),
#     y = alt.Y('count', title='Number of Samples')    
# )

In [31]:
# Per type: total number of columns next to the number of mispredicted columns.
# The index merge keeps only types that occur in both frames.
combined_freq_df = type_freq_df.merge(mismatch_freq_df, left_index=True, right_index=True)
combined_freq_df.columns = ['type_freq', 'mismatch_freq']
# print(combined_freq_df)

In [32]:
# Grouped bar chart, one group per type: number of columns (type_freq) next to
# number of mispredicted columns (mismatch_freq), groups ordered by descending mismatch_freq.
alt.Chart(combined_freq_df.reset_index()).transform_fold(
      ['type_freq', 'mismatch_freq'],
      as_=['column', 'value']
    ).mark_bar(size=15).encode(
    column=alt.Column('type:O', sort=alt.EncodingSortField(
                field="mismatch_freq",  
                order="descending")),
    x = alt.X('column:N',
              title = '',
              ),
    y=alt.Y('value:Q'),
    color='column:N'
)

In [33]:
# alt.Chart(combined_freq_df.reset_index()).transform_fold(
#   ['type_freq', 'mismatch_freq'],
#   as_=['column', 'value']
# ).mark_bar(size=15).encode(
#     x = alt.X('type:O',
#               title = 'Semantic Types',
#               sort=alt.EncodingSortField(
#                 field="type_freq",  
#                 order="descending")),
#     y = alt.Y('value:Q', title='Number of Samples'),
#       color='column:N'
# ).properties(width=600,height=200)

### Comparing training data to gittables
#### To better understand faulty predictions

In [None]:
# Sherlock training split: raw column values and labels
columns_train_sherlock = pd.read_parquet("../data/data/raw/train_values.parquet")
columns_train_sherlock_list = list(columns_train_sherlock['values'])
col_true_types_train_sherlock = pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()
# Bring the labels into the same spelling that is used for the GitTables labels
col_true_types_train_sherlock = np.array(list(map(sherlock_case, col_true_types_train_sherlock)))

In [None]:
# Column index and true type of every mispredicted GitTables column.
# Note: despite its name, mismatch_idx_df is a pandas GroupBy object (grouped
# by true type), not a DataFrame; use get_group(<type>) to get the rows of one type.
mismatch_idx_df_data = {'idx': mismatches_col_idx, 'type': mismatches}
mismatch_idx_df = pd.DataFrame(mismatch_idx_df_data).groupby('type')

In [None]:
# columns_train_sherlock_list

In [None]:
compare_type = 'name'
# Indices of the mispredicted GitTables columns whose true type is compare_type
compare_type_mismatch_idxs = mismatch_idx_df.get_group(compare_type)['idx'].to_list()
# Indices of the Sherlock training columns labelled compare_type
compare_type_sherlock_train_idxs = [
    idx for idx, _type in enumerate(col_true_types_train_sherlock) if _type == compare_type
]

sample_size = 5
# Never ask for more samples than the smaller pool holds (sampling without
# replacement beyond the pool size raised a ValueError before).
n_samples = min(sample_size, len(compare_type_mismatch_idxs), len(compare_type_sherlock_train_idxs))
# Fixed seed so the printed samples are the same after Restart & Run All
sample_rng = np.random.RandomState(42)
sample_compare_type_mismatch_idxs = sample_rng.permutation(compare_type_mismatch_idxs)[:n_samples]
sample_compare_type_sherlock_train_idxs = sample_rng.permutation(compare_type_sherlock_train_idxs)[:n_samples]

for i in range(n_samples):
    print(f'Gittable sample:')
    print(f'{columns[sample_compare_type_mismatch_idxs[i]]}\n')
    print(f'Sherlock sample:')
    print(f'{columns_train_sherlock_list[sample_compare_type_sherlock_train_idxs[i]]}\n')
    print('---------------------------------------------------------------------------\n')

### Comparing frequencies in gittables and sherlock training data

In [None]:
# Relative frequency of every type in the Sherlock training data
# (value_counts(normalize=True) divides each count by the number of labels)
sherlock_train_freq_df = (
    pd.Series(col_true_types_train_sherlock, name='type')
    .value_counts(normalize=True)
    .rename('freq_sherlock')
    .rename_axis('type')
    .to_frame()
)

# Relative frequency of every type in GitTables. This is computed from the
# labels and not by aliasing type_freq_df: the alias renamed and normalised
# type_freq_df in place, which broke re-runs of this cell and of the earlier
# cells that expect its 'count' column.
gittables_freq_df = (
    pd.Series(col_true_types, name='type')
    .value_counts(normalize=True)
    .rename('freq_gittables')
    .rename_axis('type')
    .to_frame()
)

# Only types that occur in both datasets survive the index merge.
# Note: this rebinds combined_freq_df, which earlier held the type/mismatch counts.
combined_freq_df = pd.merge(gittables_freq_df, sherlock_train_freq_df, left_index=True, right_index=True)
combined_freq_df.index.names = ['type']

In [None]:
# Grouped bar chart, one group per type: relative frequency in GitTables next to
# relative frequency in the Sherlock training data, ordered by descending freq_gittables.
alt.Chart(combined_freq_df.reset_index()).transform_fold(
      ['freq_gittables', 'freq_sherlock'],
      as_=['column', 'value']
    ).mark_bar(size=15).encode(
    column=alt.Column('type:O', sort=alt.EncodingSortField(
                field="freq_gittables",  
                order="descending")),
    x = alt.X('column:N',
              title = '',
              ),
    y=alt.Y('value:Q'),
    color='column:N'
)
# As with the earlier frequency comparison between the GitTables and the test data,
# these frequencies can give useful insights, but they cannot justify the low f1 scores.
# A slightly lower f1 score could be explained by a high frequency of, for example,
# numeric types, which are a weak spot of Sherlock, but that is not the case
# for the GitTables data.