In [1]:
#fourth meeting sherlock

In [1]:
from datetime import datetime
from os.path import join
from os import listdir
import re #for camel case conversion

import collections
import numpy as np
import pandas as pd
from pyarrow.parquet import ParquetFile
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from sherlock import helpers
from sherlock.deploy.model import SherlockModel
# from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
#     convert_string_lists_to_lists,
    prepare_feature_extraction,
#     load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

import altair as alt
alt.renderers.enable('default')

RendererRegistry.enable('default')

### Utils

In [2]:
def camel_case(s):
    """Convert a snake_case or kebab-case string to camelCase.

    Runs of underscores/hyphens become word separators, every word is
    title-cased via ``str.title`` (which also lowers any capitals inside a
    word), all spaces are removed and the first character is lower-cased.

    Args:
        s: The string to convert.

    Returns:
        The camelCase form of ``s``. An empty string is returned when ``s``
        is empty or consists only of separators.
    """
    s = re.sub(r"(_|-)+", " ", s).title().replace(" ", "")
    # Slicing instead of s[0] so that empty input does not raise IndexError.
    return s[:1].lower() + s[1:]

def sherlock_case(s):
    """Convert a snake_case or kebab-case string to the spaced camel form.

    The string is first camel-cased (e.g. ``birth_date`` -> ``birthDate``) and
    then a space is inserted before every character that is not a lowercase
    letter, giving e.g. ``birth Date``.

    Args:
        s: The string to convert.

    Returns:
        The converted string. An empty string is returned when ``s`` is empty
        or consists only of separators.
    """
    s = re.sub(r"(_|-)+", " ", s).title().replace(" ", "")
    # Slicing instead of s[0] so that empty input does not raise IndexError.
    s = s[:1].lower() + s[1:]
    # str.islower() is False for digits and punctuation too, so those also
    # get a leading space, not only uppercase letters.
    return ''.join(ch if ch.islower() else " " + ch for ch in s)

In [3]:
def pretty_print_classification_report(inp_dict):
    """Print a classification report dictionary as a tab-aligned table.

    Args:
        inp_dict: Mapping from a label (or summary row name) to either a dict
            with the keys ``'precision'``, ``'recall'``, ``'f1-score'`` and
            ``'support'``, or a single number (such as an ``'accuracy'``
            entry), which is printed in the f1-score column.

    Returns:
        None. The table is written to standard output.
    """
    print("\t\tprecision\trecall\t\tf1-score\tsupport")
    # .items() is required: iterating the dict itself yields only the keys.
    for key, value in inp_dict.items():
        # Long labels get one tab less so the metric columns stay aligned.
        tabs = '\t' if len(key) >= 8 else '\t\t'

        if not isinstance(value, dict):
            # Scalar row: skip the precision and recall columns.
            print(f"{key}{tabs}\t\t\t\t{value:.3f}")
            continue

        print(f"{key}{tabs}{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['f1-score']:.3f}\t\t{value['support']}")

### Loading the dataset

In [25]:
# Pick the directory of the benchmark variant whose types you want to predict.
# path = '../data/data/gittables_benchmark/non_reannotated'
path = '../data/data/gittables_benchmark/reannotated'
# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects, so only
# load class files that come from a trusted source.
types = np.load("../model_files/classes_sherlock.npy", allow_pickle=True)
print(types)

['address' 'affiliate' 'affiliation' 'age' 'album' 'area' 'artist'
 'birth Date' 'birth Place' 'brand' 'capacity' 'category' 'city' 'class'
 'classification' 'club' 'code' 'collection' 'command' 'company'
 'component' 'continent' 'country' 'county' 'creator' 'credit' 'currency'
 'day' 'depth' 'description' 'director' 'duration' 'education' 'elevation'
 'family' 'file Size' 'format' 'gender' 'genre' 'grades' 'industry' 'isbn'
 'jockey' 'language' 'location' 'manufacturer' 'name' 'nationality'
 'notes' 'operator' 'order' 'organisation' 'origin' 'owner' 'person'
 'plays' 'position' 'product' 'publisher' 'range' 'rank' 'ranking'
 'region' 'religion' 'requirement' 'result' 'sales' 'service' 'sex'
 'species' 'state' 'status' 'symbol' 'team' 'team Name' 'type' 'weight'
 'year']


In [26]:
columns = []         # one list of string cell values per benchmark column
col_true_types = []  # header value (annotated type) for each entry of `columns`

# NOTE(review): listdir returns entries in arbitrary order, so the order of
# `columns` (and of anything saved from it by position) depends on the
# filesystem. Consider sorting if results are joined with other files.
filepaths = [join(path, f) for f in listdir(path) if f.endswith('.csv')]
print(len(filepaths))

# Go over the tables in the dataset
for fp in filepaths:
    # The first CSV row holds the column types; its first field is dropped
    # (presumably the label of the index column read with index_col=0 below
    # -- TODO confirm).
    table_header = pd.read_csv(fp, header=None, nrows=1).values[0][1:]
    table_df = pd.read_csv(fp, index_col=0)

    # A dedicated index name avoids shadowing any outer loop variable.
    for col_idx, col_type in enumerate(table_header):
        # Select by position because the same type can occur twice in a table.
        col = table_df.iloc[:, col_idx].convert_dtypes().astype(object).fillna('')
        # NOTE(review): missing values were already replaced by '' above, so
        # count() equals the column length and this only skips zero-row
        # columns; columns consisting solely of missing values are kept.
        if (col.count() == 0) or col.empty:
            print(fp)
            print(table_header)
            print(col)
            continue

        # Columns need to be lists of strings for the preprocessing later on.
        columns.append(list(map(str, col.to_list())))
        col_true_types.append(col_type)

477


In [7]:
# print(columns)

In [27]:
# Amount of columns in the benchmark
print(len(col_true_types))

# How often each annotated type occurs
type_frequency_dict = dict(collections.Counter(col_true_types))
print(type_frequency_dict)

df_dict = {
    'type': list(type_frequency_dict),
    'freq': list(type_frequency_dict.values()),
}
print(df_dict)
type_frequency_df = pd.DataFrame(df_dict)

# Bar chart of the type frequencies, most frequent type first
sort_by_freq = alt.EncodingSortField(field="freq", order="descending")
x_encoding = alt.X('type', type='ordinal', title='Type', sort=sort_by_freq)
y_encoding = alt.Y('freq', title='Frequency')
(
    alt.Chart(type_frequency_df.reset_index())
    .mark_bar(size=20)
    .encode(x=x_encoding, y=y_encoding)
    .configure_axis(labelFontSize=18, titleFontSize=18)
    .properties(width=800, height=200)
)

801
{'type': 146, 'species': 121, 'rank': 100, 'year': 102, 'class': 64, 'description': 34, 'name': 105, 'code': 15, 'category': 6, 'state': 21, 'component': 3, 'product': 3, 'status': 14, 'depth': 13, 'address': 2, 'country': 5, 'duration': 11, 'language': 3, 'order': 1, 'weight': 3, 'classification': 1, 'gender': 1, 'notes': 6, 'location': 2, 'range': 1, 'city': 5, 'team': 2, 'company': 4, 'manufacturer': 2, 'capacity': 1, 'county': 3, 'family': 1}
{'type': ['type', 'species', 'rank', 'year', 'class', 'description', 'name', 'code', 'category', 'state', 'component', 'product', 'status', 'depth', 'address', 'country', 'duration', 'language', 'order', 'weight', 'classification', 'gender', 'notes', 'location', 'range', 'city', 'team', 'company', 'manufacturer', 'capacity', 'county', 'family'], 'freq': [146, 121, 100, 102, 64, 34, 105, 15, 6, 21, 3, 3, 14, 13, 2, 5, 11, 3, 1, 3, 1, 1, 6, 2, 1, 5, 2, 4, 2, 1, 3, 1]}


### Init feature extraction models

In [9]:
# Set up the resources used by feature extraction. According to the log output
# of this cell, this checks that the embedding files are present and then
# loads the word embeddings, the paragraph-vector model and NLTK.
prepare_feature_extraction()
initialise_word_embeddings()
initialise_pretrained_model(400)  # the log reports a "400 dim" Doc2Vec model for this argument
initialise_nltk()

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:09.801877 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:07.918878 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.189373 seconds.


[nltk_data] Downloading package punkt to /home/senn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/senn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# One entry per benchmark column, stored in a Series named "values".
data = pd.Series(list(columns), name="values")

In [11]:
# Extract the features into a temporary CSV and read them back as float32.
features_csv = join(path, "processed/temporary.csv")
extract_features(features_csv, data)
feature_vectors = pd.read_csv(features_csv, dtype=np.float32)

Extracting Features:   0%|▏                                                             | 2/801 [00:00<00:43, 18.55it/s]

Exporting 1588 column features


Extracting Features: 100%|████████████████████████████████████████████████████████████| 801/801 [00:11<00:00, 69.03it/s]


In [12]:
# Create the model and initialise it, including weights, from the files stored
# under the model id "sherlock". The trailing semicolons only suppress the
# notebook's display of the statement results.
model = SherlockModel();
model.initialize_model_from_json(with_weights=True, model_id="sherlock");

W0419 12:03:12.877515 140244213270336 deprecation.py:506] From /home/senn/virtualenvs/sherlock/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0419 12:03:12.881202 140244213270336 deprecation.py:506] From /home/senn/virtualenvs/sherlock/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0419 12:03:12.892777 140244213270336 deprecation.py:506] From /home/senn/virtualenvs/sherlock/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Varia

In [13]:
# Run the "sherlock" model on the extracted feature vectors; the resulting
# labels are compared against col_true_types in the report cell.
predicted_labels = model.predict(feature_vectors, "sherlock")

In [14]:
# Per-type precision / recall / F1 of the predictions against the annotated types.
# zero_division=0 leaves the reported scores unchanged (undefined metrics are
# already reported as 0 by default) but stops the repeated undefined-metric
# warnings for types that never occur or are never predicted.
print(classification_report(col_true_types, predicted_labels, zero_division=0))

                precision    recall  f1-score   support

       address       0.01      1.00      0.01         1
           age       0.00      0.00      0.00         1
         album       0.00      0.00      0.00         0
          area       0.00      0.00      0.00         0
        artist       0.00      0.00      0.00         0
    birth Date       0.00      0.00      0.00         0
      capacity       0.00      0.00      0.00         1
      category       0.05      0.20      0.08         5
          city       0.83      1.00      0.91         5
         class       0.24      0.06      0.10        64
classification       0.25      1.00      0.40         1
          code       0.80      0.24      0.36        17
       command       0.00      0.00      0.00         0
       company       0.33      1.00      0.50         2
     component       0.00      0.00      0.00         3
       country       0.75      0.75      0.75         4
        county       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Write out the results for further investigation outside this notebook.
path_out_true_types = '../../combined/results/true_types'
path_out_predictions = '../../combined/results/predictions'

# Uncomment to write the true types as well:
# tt_df = pd.DataFrame(columns=['type'], data=col_true_types)
# tt_df.to_parquet(join(path_out_true_types, 'gittables_benchmark.parquet'))

# Use the file name that matches the dataset you chose to predict:
#   original benchmark    -> 'sherlock_gittables_benchmark.parquet'
#   reannotated benchmark -> 'sherlock_gittables_benchmark_reannotated.parquet'
predictions_file = join(path_out_predictions, 'sherlock_gittables_benchmark_reannotated.parquet')
pred_df = pd.DataFrame(data=predicted_labels, columns=['type'])
pred_df.to_parquet(predictions_file)

In [2]:
# Comparison of the original and the re-annotated benchmark headers.
# A dedicated name is used for the original directory so that the `path`
# selected for prediction earlier in the notebook is not overwritten.
path_orig = '../data/data/gittables_benchmark/non_reannotated'
path_re = '../data/data/gittables_benchmark/reannotated'
species_related_re_count = 0
other_re_count = 0
files = [f for f in listdir(path_orig) if f.endswith('.csv')]
print(len(files))

# Compare the tables in the datasets
for fp in files:
    # Header row of the same table in both variants; the first field is skipped.
    table_header = pd.read_csv(join(path_orig, fp), header=None, nrows=1).values[0][1:]
    table_header_re = pd.read_csv(join(path_re, fp), header=None, nrows=1).values[0][1:]
    # Counter comparison ignores column order but still respects duplicates.
    if collections.Counter(table_header) != collections.Counter(table_header_re):
        print(fp)
        print(table_header)
        print(table_header_re)
        if 'species' in table_header_re:
            species_related_re_count += 1
        else:
            other_re_count += 1

# Look at percentage of re-annotations on species columns
total_re_count = species_related_re_count + other_re_count
if total_re_count:
    print(species_related_re_count / total_re_count)
else:
    # Avoid ZeroDivisionError when both variants have identical headers.
    print("No re-annotated tables found.")
print(total_re_count)

477
GitTables_2803.csv
['name' 'rank' 'year']
['species' 'rank' 'year']
GitTables_2928.csv
['code' 'name']
['code' 'category']
GitTables_2004.csv
['name' 'rank' 'species' 'year']
['species' 'rank' 'species' 'year']
GitTables_2853.csv
['name' 'rank' 'species' 'year']
['species' 'rank' 'species' 'year']
GitTables_2309.csv
['name' 'rank' 'year']
['species' 'rank' 'year']
GitTables_2637.csv
['name' 'rank' 'species' 'year']
['species' 'rank' 'species' 'year']
GitTables_2663.csv
['name' 'rank' 'species' 'year']
['species' 'rank' 'species' 'year']
GitTables_2508.csv
['origin' 'type']
['address' 'type']
GitTables_2891.csv
['name' 'rank' 'species' 'year']
['species' 'rank' 'species' 'year']
GitTables_2752.csv
['name' 'rank' 'year']
['species' 'rank' 'year']
GitTables_2347.csv
['name' 'rank' 'year']
['species' 'rank' 'year']
GitTables_2658.csv
['name' 'rank' 'year']
['species' 'rank' 'year']
GitTables_2213.csv
['code' 'language' 'name']
['type' 'language' 'name']
GitTables_2962.csv
['name' 'rank

## Stop here: the retraining cells below do not work because the benchmark contains too little data to train on

In [16]:
# true_types_train, true_types_test, cols_train, cols_test = train_test_split(col_true_types, columns, test_size=0.3)
# true_types_test, true_types_validate, cols_test, cols_validate = train_test_split(true_types_test, cols_test, test_size=0.5)

# true_types_train = np.array([x.lower() for x in true_types_train])
# true_types_test = np.array([x.lower() for x in true_types_test])
# true_types_validate = np.array([x.lower() for x in true_types_validate])

# print(cols_train)

In [17]:
# print(len(cols_train))
# print(len(cols_test))
# print(len(cols_validate))

### Extract features

In [18]:
# data_train = pd.Series(
#     [
#         *cols_train
#     ],
#     name="values"
# )
# data_test = pd.Series(
#     [
#         *cols_test
#     ],
#     name="values"
# )
# data_validate = pd.Series(
#     [
#         *cols_validate
#     ],
#     name="values"
# )

In [19]:
# extract_features(
#     join(path, 'processed/temporary_train.csv'),
#     data_train
# )
# feature_vectors_train = pd.read_csv(join(path, 'processed/temporary_train.csv'), dtype=np.float32)
# extract_features(
#     join(path, 'processed/temporary_test.csv'),
#     data_test
# )
# feature_vectors_test = pd.read_csv(join(path, 'processed/temporary_test.csv'), dtype=np.float32)
# extract_features(
#     join(path, 'processed/temporary_validate.csv'),
#     data_validate
# )
# feature_vectors_validate = pd.read_csv(join(path, 'processed/temporary_validate.csv'), dtype=np.float32)

In [20]:
# model_id = "retrained_sherlock_gittables_benchmark"
# start = datetime.now()
# print(f'Started at {start}')

# model = SherlockModel()
# # Model will be stored with ID `model_id`, COMMENT 4 NEXT LINES IF TRAINED ONCE AND UNCOMMENT LAST LINE
# model.fit(feature_vectors_train,true_types_train, feature_vectors_validate, true_types_validate, model_id=model_id)
# print('Trained and saved new model.')
# print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')
# model.store_weights(model_id=model_id)
# # model.initialize_model_from_json(with_weights=True, model_id=model_id)

In [21]:
# predicted_labels_test = model.predict(feature_vectors_test, model_id)
# print(predicted_labels_test)

In [22]:
# print(classification_report(true_types_test, predicted_labels_test))