In [1]:
# Notebook to extract, train and predict using entire gittables dataset
# Due to questionable ground truth of the gittables dataset we decided to not finish this notebook
# Although it is an interesting subject for further research to see if training on specific 'unknown' data source
# results in an improvement on type of data provided by the unknown source

In [2]:
#Reload modules before executing code
%load_ext autoreload
%autoreload 2

In [3]:
from datetime import datetime
from os.path import join
from os import listdir
import json
from re import sub #for camel case conversion

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

from sherlock import helpers
from sherlock.deploy.model import SherlockModel
# from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
#     convert_string_lists_to_lists,
    prepare_feature_extraction,
#     load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

import altair as alt
alt.renderers.enable('default')

RendererRegistry.enable('default')

### Utils

In [4]:
def camel_case(s):
    try:
      c = sub(r"(_|-)+", " ", s).title().replace(" ", "")
      return ''.join([c[0].lower(), c[1:]])
    except:
#         print(f'Error converting {s} to camel case')
        return ''

def sherlock_case(s):
    s = sub(r"(_|-)+", " ", s).title().replace(" ", "")
    s = ''.join([s[0].lower(), s[1:]])
    s = ''.join(map(lambda x: x if x.islower() else " "+x, s))
    return s

In [5]:
sherlock_case('test sherlock_case')

'test Sherlock Case'

In [6]:
# !! Make sure you downloaded a gittables dataset (f.e. abstraction_tables)
# and unzipped the data into the directory mentioned below !!
path = '../data/data/gittables/whole_tables'

types = np.load(f"../model_files/classes_sherlock.npy", allow_pickle=True)
print(types)

filepaths = [join(path, f) for f in listdir(path) if f.endswith('.parquet')]

['address' 'affiliate' 'affiliation' 'age' 'album' 'area' 'artist'
 'birth Date' 'birth Place' 'brand' 'capacity' 'category' 'city' 'class'
 'classification' 'club' 'code' 'collection' 'command' 'company'
 'component' 'continent' 'country' 'county' 'creator' 'credit' 'currency'
 'day' 'depth' 'description' 'director' 'duration' 'education' 'elevation'
 'family' 'file Size' 'format' 'gender' 'genre' 'grades' 'industry' 'isbn'
 'jockey' 'language' 'location' 'manufacturer' 'name' 'nationality'
 'notes' 'operator' 'order' 'organisation' 'origin' 'owner' 'person'
 'plays' 'position' 'product' 'publisher' 'range' 'rank' 'ranking'
 'region' 'religion' 'requirement' 'result' 'sales' 'service' 'sex'
 'species' 'state' 'status' 'symbol' 'team' 'team Name' 'type' 'weight'
 'year']


In [7]:
# filepaths
# types

In [8]:
columns = []
col_true_types = []
col_ids = []
curr_id = 0
start = datetime.now()
print(f'Started at {start}')

for fp in filepaths:
    try:
        table = pd.read_parquet(fp)
        table.rename(columns=sherlock_case,inplace=True)
    except:
        print(f'Error converting table in {fp}')
        continue
    for idx, col_name in enumerate(table.columns):
        if len(col_name) == 0: #no type defined, skip col
            continue
        if col_name not in types:
            continue
        col = table.iloc[:, idx]
        if col.count() == 0: # 0.35952832125142853
                continue
        try: #TODO fix 2 columns in table with same name
            columns.append(list(map(str, col.to_list())))
            col_true_types.append(col_name)
            col_ids.append(curr_id)
            curr_id+=1
        except:
            print(f'error in table {fp}')
        
        
print(f'Finished loading tables at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2022-03-29 11:00:43.438422
Error converting table in ../data/data/gittables/whole_tables/00_beastman_1.parquet
Error converting table in ../data/data/gittables/whole_tables/f_marq_x.parquet
Error converting table in ../data/data/gittables/whole_tables/f_InnaRuiz.parquet
Error converting table in ../data/data/gittables/whole_tables/comp.sys.ibm.pc.hardware2.parquet
Error converting table in ../data/data/gittables/whole_tables/f_jimimc_.parquet
Error converting table in ../data/data/gittables/whole_tables/f_katipose.parquet
Error converting table in ../data/data/gittables/whole_tables/permit-2011-2552_formatted.parquet
Error converting table in ../data/data/gittables/whole_tables/f_whoistorial.parquet
Error converting table in ../data/data/gittables/whole_tables/summary_cTAS_43.parquet
Error converting table in ../data/data/gittables/whole_tables/talk.religion.misc.parquet
Error converting table in ../data/data/gittables/whole_tables/f_HasselmannL.parquet
Error converting tabl

Error converting table in ../data/data/gittables/whole_tables/Aneri_2.parquet
Error converting table in ../data/data/gittables/whole_tables/f_meemsdreams.parquet
Error converting table in ../data/data/gittables/whole_tables/splits_TK.parquet
Error converting table in ../data/data/gittables/whole_tables/f_davepennock.parquet
Error converting table in ../data/data/gittables/whole_tables/Magic(clean).parquet
Error converting table in ../data/data/gittables/whole_tables/melon_5.parquet
Error converting table in ../data/data/gittables/whole_tables/permit-2017-2172_formatted.parquet
Error converting table in ../data/data/gittables/whole_tables/meta_20.parquet
Error converting table in ../data/data/gittables/whole_tables/permit-2012-198_formatted.parquet
Error converting table in ../data/data/gittables/whole_tables/provogue_4.parquet
Error converting table in ../data/data/gittables/whole_tables/permit-2005-1801_formatted_1.parquet
Error converting table in ../data/data/gittables/whole_tables/

Error converting table in ../data/data/gittables/whole_tables/f_elloitsella.parquet
Error converting table in ../data/data/gittables/whole_tables/2119_Elfstone_3.parquet
Error converting table in ../data/data/gittables/whole_tables/f_shopbestmovies.parquet
Error converting table in ../data/data/gittables/whole_tables/L_2128.parquet
Error converting table in ../data/data/gittables/whole_tables/f_joe_trrohman.parquet
Error converting table in ../data/data/gittables/whole_tables/Post771.parquet
Error converting table in ../data/data/gittables/whole_tables/00_nehekharan.parquet
Error converting table in ../data/data/gittables/whole_tables/rec.sport.baseball.parquet
Error converting table in ../data/data/gittables/whole_tables/ETERNAL_CLEMENTINE.parquet
Error converting table in ../data/data/gittables/whole_tables/911_12.parquet
Error converting table in ../data/data/gittables/whole_tables/Misanthreville.parquet
Error converting table in ../data/data/gittables/whole_tables/2020-11-01.parque

Error converting table in ../data/data/gittables/whole_tables/f_son_like_father.parquet
Error converting table in ../data/data/gittables/whole_tables/f_e_sprad.parquet
Error converting table in ../data/data/gittables/whole_tables/summary_cTAS_73.parquet
Error converting table in ../data/data/gittables/whole_tables/f_SaraRtweetsEd.parquet
Error converting table in ../data/data/gittables/whole_tables/reviews_375.parquet
Error converting table in ../data/data/gittables/whole_tables/911_10.parquet
Error converting table in ../data/data/gittables/whole_tables/XxVirtueIsRelativexX.parquet
Error converting table in ../data/data/gittables/whole_tables/f_rohmontgomery.parquet
Error converting table in ../data/data/gittables/whole_tables/2017-03-04-10-20-37-survey_export_data_20160712054321.parquet
Error converting table in ../data/data/gittables/whole_tables/f_ButtlerLaura.parquet
Error converting table in ../data/data/gittables/whole_tables/comp.sys.mac.hardware2.parquet
Error converting table

Error converting table in ../data/data/gittables/whole_tables/dog%20eat%20dog.parquet
Error converting table in ../data/data/gittables/whole_tables/f_RuroniSai.parquet
Error converting table in ../data/data/gittables/whole_tables/f_CodyMJohns.parquet
Error converting table in ../data/data/gittables/whole_tables/Escherichia_fergusonii_summary_4.parquet
Error converting table in ../data/data/gittables/whole_tables/sweden.parquet
Error converting table in ../data/data/gittables/whole_tables/permit-2011-2541_formatted.parquet
Error converting table in ../data/data/gittables/whole_tables/K.parquet
Error converting table in ../data/data/gittables/whole_tables/Serratia_plymuthica_summary_2.parquet
Error converting table in ../data/data/gittables/whole_tables/f_dachelrenaro.parquet
Error converting table in ../data/data/gittables/whole_tables/Enterobacter_cloacae_summary_1.parquet
Error converting table in ../data/data/gittables/whole_tables/summary_cTAS_72.parquet
Error converting table in ..

Error converting table in ../data/data/gittables/whole_tables/srhkhavari.parquet
Error converting table in ../data/data/gittables/whole_tables/travel_history_4.parquet
Error converting table in ../data/data/gittables/whole_tables/splits_MOM.parquet
Error converting table in ../data/data/gittables/whole_tables/913_12.parquet
Error converting table in ../data/data/gittables/whole_tables/Salmonella_enterica_summary.parquet
Error converting table in ../data/data/gittables/whole_tables/summary_cTAS_122.parquet
Error converting table in ../data/data/gittables/whole_tables/test_stream8.parquet
Error converting table in ../data/data/gittables/whole_tables/f_MorganPrewitt.parquet
Error converting table in ../data/data/gittables/whole_tables/talk.religion.misc2.parquet
Error converting table in ../data/data/gittables/whole_tables/K_830.parquet
Error converting table in ../data/data/gittables/whole_tables/Shigella_flexneri_summary_1.parquet
Error converting table in ../data/data/gittables/whole_t

Error converting table in ../data/data/gittables/whole_tables/jokes.aiml.parquet
Error converting table in ../data/data/gittables/whole_tables/f_tomsandpie.parquet
Error converting table in ../data/data/gittables/whole_tables/18_210.parquet
Error converting table in ../data/data/gittables/whole_tables/f_MattjewH.parquet
Error converting table in ../data/data/gittables/whole_tables/f_fredipersico.parquet
Error converting table in ../data/data/gittables/whole_tables/2827049638.audi_1.parquet
Error converting table in ../data/data/gittables/whole_tables/L%20CrooksAndLiars%20John%20Amato%20Traitor%20Trump%20Demands%20National%20Guard%20During%20AZs%20Third%20Recount.parquet
Error converting table in ../data/data/gittables/whole_tables/Yepme.parquet
Error converting table in ../data/data/gittables/whole_tables/916_14.parquet
Error converting table in ../data/data/gittables/whole_tables/travel_history_6.parquet
Error converting table in ../data/data/gittables/whole_tables/D_2905.parquet
Erro

Error converting table in ../data/data/gittables/whole_tables/extend_summary_37.parquet
Error converting table in ../data/data/gittables/whole_tables/f_BrittanyB_16.parquet
Error converting table in ../data/data/gittables/whole_tables/f_whorelie.parquet
Error converting table in ../data/data/gittables/whole_tables/227_Should%20I%20book%20_.parquet
Error converting table in ../data/data/gittables/whole_tables/f_AbaddonRampant.parquet
Error converting table in ../data/data/gittables/whole_tables/summary_cTAS_124.parquet
Error converting table in ../data/data/gittables/whole_tables/rec.sport.baseball2.parquet
Error converting table in ../data/data/gittables/whole_tables/f_SunflowahSoul.parquet
Error converting table in ../data/data/gittables/whole_tables/102_19.parquet
Error converting table in ../data/data/gittables/whole_tables/L%20DailyKos%20NotNowNotEverforGood%20News%20May%20the%20Farce%20Be%20with%20You%20-%20Good%20News%20Roundup%20for%20Bluesday%20May%20the%20Fourth.parquet
Error 

Error converting table in ../data/data/gittables/whole_tables/wholeDesign_pad.parquet
Error converting table in ../data/data/gittables/whole_tables/searchtext.parquet
Error converting table in ../data/data/gittables/whole_tables/measures.parquet
Error converting table in ../data/data/gittables/whole_tables/f_glihtterie.parquet
Error converting table in ../data/data/gittables/whole_tables/2997903160.ford_1.parquet
Error converting table in ../data/data/gittables/whole_tables/extend_summary_30.parquet
Error converting table in ../data/data/gittables/whole_tables/ttttt.parquet
Error converting table in ../data/data/gittables/whole_tables/913_10.parquet
Error converting table in ../data/data/gittables/whole_tables/summary_cTAS_13.parquet
Error converting table in ../data/data/gittables/whole_tables/908_17.parquet
Error converting table in ../data/data/gittables/whole_tables/f_um_joanna.parquet
Error converting table in ../data/data/gittables/whole_tables/Dickeya_zeae_summary_1.parquet
Erro

Finished loading tables at 2022-03-29 11:29:53.144600, took 0:29:09.706360 seconds


In [9]:
print(sorted(set(col_true_types)))
len(set(col_true_types))
classes = np.load(f"../model_files/classes_sherlock.npy", allow_pickle=True)
print(classes)

['address', 'affiliate', 'affiliation', 'age', 'album', 'area', 'artist', 'birth Date', 'brand', 'capacity', 'category', 'city', 'class', 'classification', 'club', 'code', 'collection', 'command', 'company', 'component', 'continent', 'country', 'county', 'creator', 'credit', 'currency', 'day', 'depth', 'description', 'director', 'duration', 'education', 'elevation', 'family', 'file Size', 'format', 'gender', 'genre', 'grades', 'industry', 'isbn', 'jockey', 'language', 'location', 'manufacturer', 'name', 'nationality', 'notes', 'operator', 'order', 'organisation', 'origin', 'owner', 'person', 'plays', 'position', 'product', 'publisher', 'range', 'rank', 'ranking', 'region', 'religion', 'requirement', 'result', 'sales', 'service', 'sex', 'species', 'state', 'status', 'symbol', 'team', 'team Name', 'type', 'weight', 'year']
['address' 'affiliate' 'affiliation' 'age' 'album' 'area' 'artist'
 'birth Date' 'birth Place' 'brand' 'capacity' 'category' 'city' 'class'
 'classification' 'club' 'c

In [10]:
len(col_true_types)

94324

## Train, test, validate split

In [11]:
true_types_train, true_types_test, cols_train, cols_test = train_test_split(col_true_types, columns, test_size=0.4)
true_types_test, true_types_validate, cols_test, cols_validate = train_test_split(true_types_test, cols_test, test_size=0.5)

true_types_train = np.array([x.lower() for x in true_types_train])
true_types_test = np.array([x.lower() for x in true_types_test])
true_types_validate = np.array([x.lower() for x in true_types_validate])

In [12]:
print(len(cols_train))
print(len(cols_test))
print(len(cols_validate))

56594
18865
18865


## Extract features

### Init feature extraction models

In [13]:
prepare_feature_extraction()
initialise_word_embeddings()
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:27.561204 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:47.700024 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.620850 seconds.


[nltk_data] Downloading package punkt to /home/senn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/senn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
data_train = pd.Series(
    [
        *cols_train
    ],
    name="values"
)
data_test = pd.Series(
    [
        *cols_test
    ],
    name="values"
)
data_validate = pd.Series(
    [
        *cols_validate
    ],
    name="values"
)

In [None]:
#TODO? makedirs(join(path, 'processed'), exist_ok=True)
extract_features(
    join(path, 'processed/temporary_train.csv'),
    data_train
)
feature_vectors_train = pd.read_csv(join(path, 'processed/temporary_train.csv'), dtype=np.float32)
extract_features(
    join(path, 'processed/temporary_test.csv'),
    data_test
)
feature_vectors_test = pd.read_csv(join(path, 'processed/temporary_test.csv'), dtype=np.float32)
extract_features(
    join(path, 'processed/temporary_validate.csv'),
    data_validate
)
feature_vectors_validate = pd.read_csv(join(path, 'processed/temporary_validate.csv'), dtype=np.float32)

Extracting Features:   0%|                                                     | 8/56594 [00:00<14:01, 67.26it/s]

Exporting 1588 column features


Extracting Features: 100%|█████████████████████████████████████████████████| 56594/56594 [16:30<00:00, 57.16it/s]
Extracting Features:   0%|                                                    | 12/18865 [00:00<03:11, 98.67it/s]

Exporting 1588 column features


Extracting Features: 100%|█████████████████████████████████████████████████| 18865/18865 [05:38<00:00, 55.74it/s]
Extracting Features:   0%|                                                    | 14/18865 [00:00<03:16, 96.10it/s]

Exporting 1588 column features


Extracting Features:  99%|████████████████████████████████████████████████▎| 18621/18865 [05:20<00:05, 42.50it/s]

In [None]:
# feature_vectors.head()

## Init/train sherlock

In [None]:
model_id = "retrained_sherlock_gittables"
start = datetime.now()
print(f'Started at {start}')

model = SherlockModel()
# Model will be stored with ID `model_id`, IF TRAINED ONCE: COMMENT 4 NEXT LINES AND UNCOMMENT LAST LINE
model.fit(feature_vectors_train,true_types_train, feature_vectors_validate, true_types_validate, model_id=model_id)
print('Trained and saved new model.')
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')
model.store_weights(model_id=model_id)
# model.initialize_model_from_json(with_weights=True, model_id=model_id)

In [None]:
predicted_labels_test = model.predict(feature_vectors_test, model_id)

In [None]:
print(classification_report(true_types_test, predicted_labels_test))

### Analysing results

In [None]:
# size = len(predicted_labels)
# print(f'prediction count {size}')


# # Should be fully deterministic too.
# f1_score(col_types[:size], predicted_labels[:size], average="weighted")

### All Scores

In [None]:
# print(classification_report(col_types[:size], predicted_labels[:size], digits=4))

In [None]:
# model_id = "sherlock"

# classes = np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)

# report = classification_report(col_types[:size], predicted_labels[:size], output_dict=True)

# class_scores = list(filter(lambda x: isinstance(x, tuple) and isinstance(x[1], dict) and 'f1-score' in x[1] and x[0] in classes, list(report.items())))

# class_scores = sorted(class_scores, key=lambda item: item[1]['f1-score'], reverse=True)

### Top 5 types

In [None]:
# print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

# for key, value in class_scores[0:5]:
#     if len(key) >= 8:
#         tabs = '\t' * 1
#     else:
#         tabs = '\t' * 2

#     print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

In [None]:
### Bottom 5 types

In [None]:
# print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

# for key, value in class_scores[len(class_scores)-5:len(class_scores)]:
#     if len(key) >= 8:
#         tabs = '\t' * 1
#     else:
#         tabs = '\t' * 2

#     print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

In [None]:
# type_freq_df = pd.DataFrame(col_types, columns=['type'])
# type_freq_df = pd.DataFrame(type_freq_df['type'].value_counts())
# type_freq_df.columns = ['count']
# type_freq_df.index.name = 'type'


In [None]:
# type_freq_df

In [None]:
# alt.Chart(type_freq_df.reset_index()).mark_bar(size=8).encode(
#     x = alt.X('type:O',
#               title = 'Semantic Types',
#               sort=alt.EncodingSortField(
#                 field="count",  
#                 order="descending")),
#     y = alt.Y('count', title='Number of Samples')    
# ).properties(width=800,height=200)