# This notebook enables training and testing of Sherlock.
The procedure is:
- Load train, val, test datasets (should be preprocessed)
- Initialize the model, either by loading the pretrained weights or by training one from scratch.
- Evaluate and analyse the model predictions.

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# ID used for the retrained model (see "Option 2" below).
# Predictions can also be made with the original model, whose ID is "sherlock".
model_id = 'retrained_sherlock_test'

In [3]:
from ast import literal_eval
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, classification_report

from sherlock.deploy.model import SherlockModel

## Load datasets for training, validation, testing

In [4]:
# Load the processed training features and the matching raw labels,
# reporting how long the whole step takes.
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('../data/data/processed/train.parquet')

# Flatten the label frame into a 1-D sequence and lowercase every label.
y_train = np.array([
    label.lower()
    for label in pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()
])

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2022-03-28 19:41:48.448525
Load data (train) process took 0:00:07.321858 seconds.


In [5]:
# Sanity check: show the set of distinct column dtypes of the training features.
print(
    'Distinct types for columns in the Dataframe (should be all float32):',
    set(X_train.dtypes),
    sep='\n',
)

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [6]:
start = datetime.now()
print(f'Started at {start}')

# Processed validation features.
X_validation = pd.read_parquet('../data/data/processed/validation.parquet')

# Flatten the label frame into a 1-D sequence and lowercase every label.
y_validation = np.array([
    label.lower()
    for label in pd.read_parquet('../data/data/raw/val_labels.parquet').values.flatten()
])

# `start` is set at the top of this cell, so the duration covers both reads.
print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2022-03-28 19:41:56.040150
Load data (validation) process took 0:00:01.001728 seconds.


In [7]:
start = datetime.now()
print(f'Started at {start}')

# Processed test features.
X_test = pd.read_parquet('../data/data/processed/test.parquet')

# Flatten the label frame into a 1-D sequence and lowercase every label.
y_test = np.array([
    label.lower()
    for label in pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()
])

# `start` is set at the top of this cell, so the duration covers both reads.
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2022-03-28 19:41:57.074744
Finished at 2022-03-28 19:41:58.048018, took 0:00:00.973293 seconds


## Initialize the model
Two options:
- Load Sherlock model with pretrained weights
- Fit Sherlock model from scratch

### Option 1: load Sherlock with pretrained weights

In [8]:
# Build a SherlockModel and initialize it from the stored JSON definition,
# including weights, of the model with ID "sherlock"; time the step.
start = datetime.now()
print(f'Started at {start}')

model = SherlockModel()
model.initialize_model_from_json(model_id="sherlock", with_weights=True)

print('Initialized model.')
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

W0328 19:41:58.084947 140316294813504 deprecation.py:506] From /home/senn/virtualenvs/sherlock/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0328 19:41:58.085757 140316294813504 deprecation.py:506] From /home/senn/virtualenvs/sherlock/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0328 19:41:58.088379 140316294813504 deprecation.py:506] From /home/senn/virtualenvs/sherlock/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Varia

Started at 2022-03-28 19:41:58.071903


2022-03-28 19:41:58.466867: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-03-28 19:41:58.490058: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3193915000 Hz
2022-03-28 19:41:58.490786: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55ff43287950 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-03-28 19:41:58.490808: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


Initialized model.
Finished at 2022-03-28 19:41:58.694472, took 0:00:00.622580 seconds


### Option 2: fit Sherlock from scratch (and save for later use)

In [9]:
# Uncomment to (re)set the ID used for the retrained model.
# model_id = "retrained_sherlock_test"

In [10]:
# Disabled by default: uncomment this cell to fit a new SherlockModel on the
# train/validation data loaded above instead of using the pretrained weights.

# start = datetime.now()
# print(f'Started at {start}')

# model = SherlockModel()
# # Model will be stored with ID `model_id`
# model.fit(X_train, y_train, X_validation, y_validation, model_id=model_id)

# print('Trained and saved new model.')
# print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

In [11]:
# Disabled by default: uncomment after retraining to store the model weights under `model_id`.
# model.store_weights(model_id=model_id)

### Make prediction

In [12]:
# Predict a label for every test sample and lowercase the results so they
# compare directly against the lowercased ground-truth labels.
predicted_labels = np.array([label.lower() for label in model.predict(X_test)])

In [13]:
# Number of leading test samples to score; here the complete test set.
size = len(y_test)

print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

# Should be fully deterministic too.
weighted_f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
weighted_f1

prediction count 137353, type = <class 'numpy.ndarray'>


0.8951410029373902

In [14]:
# If using the original model, model_id should be replaced with "sherlock"
# model_id = "sherlock"
# NOTE(review): `model_id` is set near the top of the notebook; confirm it matches
# the model that produced `predicted_labels`.
# allow_pickle=True makes np.load unpickle stored objects - only load trusted files.
classes = np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)

report = classification_report(y_test, predicted_labels, output_dict=True)

# Keep only the entries whose value is a metrics dict containing 'f1-score'
# and whose key is one of the model's classes.
class_scores = [
    (label, metrics)
    for label, metrics in report.items()
    if isinstance(metrics, dict) and 'f1-score' in metrics and label in classes
]

# Best-scoring classes first.
class_scores.sort(key=lambda entry: entry[1]['f1-score'], reverse=True)

### Top 5 Types

In [15]:
# Table header, followed by the first five entries of `class_scores`.
print("\t\tf1-score\tprecision\trecall\t\tsupport")

for label, metrics in class_scores[:5]:
    # Short names get an extra tab so the columns line up
    # (presumably relies on 8-column tab stops).
    padding = '\t' if len(label) >= 8 else '\t\t'

    print(f"{label}{padding}{metrics['f1-score']:.3f}\t\t{metrics['precision']:.3f}\t\t{metrics['recall']:.3f}\t\t{metrics['support']}")

		f1-score	precision	recall		support
grades		0.993		0.993		0.993		1765
isbn		0.991		0.993		0.988		1430
jockey		0.985		0.982		0.988		2819
industry	0.984		0.983		0.985		2958
birth date	0.977		0.985		0.969		479


### Bottom 5 Types

In [16]:
# Table header, followed by the last five entries of `class_scores`.
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

# A plain negative slice is used on purpose: with fewer than five entries,
# `len(class_scores) - 5` would be negative and silently drop some of them,
# whereas `[-5:]` then simply yields every entry.
for key, value in class_scores[-5:]:
    # Short names get an extra tab so the columns line up
    # (presumably relies on 8-column tab stops).
    tabs = '\t' if len(key) >= 8 else '\t\t'

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
rank		0.693		0.625		0.778		2983
person		0.664		0.717		0.618		579
director	0.568		0.591		0.547		225
sales		0.556		0.586		0.528		322
ranking		0.441		0.753		0.312		439


### All Scores

In [17]:
# Print the full text classification report for the test set, with three decimal digits.
print(classification_report(y_test, predicted_labels, digits=3))

                precision    recall  f1-score   support

       address      0.931     0.943     0.937      3003
     affiliate      0.943     0.809     0.871       204
   affiliation      0.973     0.957     0.965      1768
           age      0.866     0.950     0.906      3033
         album      0.892     0.889     0.890      3035
          area      0.870     0.820     0.844      1987
        artist      0.816     0.873     0.844      3043
    birth date      0.985     0.969     0.977       479
   birth place      0.934     0.921     0.928       418
         brand      0.830     0.671     0.742       574
      capacity      0.793     0.721     0.755       362
      category      0.924     0.890     0.906      3087
          city      0.864     0.904     0.883      2966
         class      0.901     0.915     0.908      2971
classification      0.927     0.862     0.893       587
          club      0.974     0.955     0.964      2977
          code      0.916     0.907     0.912  

## Review errors

In [18]:
# Collect the expected label of every misclassified test sample, print the
# mismatches for selected labels, and count the mismatches per expected label.
size = len(y_test)
mismatches = list()

# Expected labels to zoom in on. This must be a real tuple (note the trailing
# comma): a bare ('state') is just the string 'state', which would turn the
# membership test below into a substring check.
labels_of_interest = ('state',)

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        mismatches.append(k1)

        # zoom in to specific errors. Use the index in the next step
        if k1 in labels_of_interest:
            print(f'[{idx}] expected "{k1}" but predicted "{k2}"')

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

[440] expected "state" but predicted "sales"
[596] expected "state" but predicted "age"
[1486] expected "state" but predicted "genre"
[2917] expected "state" but predicted "class"
[4028] expected "state" but predicted "classification"
[5153] expected "state" but predicted "country"
[6938] expected "state" but predicted "city"
[7375] expected "state" but predicted "owner"
[7766] expected "state" but predicted "creator"
[8789] expected "state" but predicted "origin"
[8997] expected "state" but predicted "type"
[10567] expected "state" but predicted "team"
[10660] expected "state" but predicted "team name"
[11128] expected "state" but predicted "city"
[13568] expected "state" but predicted "region"
[14893] expected "state" but predicted "origin"
[19061] expected "state" but predicted "ranking"
[19412] expected "state" but predicted "region"
[19896] expected "state" but predicted "year"
[21010] expected "state" but predicted "country"
[21514] expected "state" but predicted "country"
[24489

[('name', 727),
 ('rank', 663),
 ('region', 521),
 ('location', 509),
 ('position', 491),
 ('description', 400),
 ('team', 390),
 ('artist', 385),
 ('notes', 364),
 ('type', 363),
 ('area', 357),
 ('category', 341),
 ('company', 340),
 ('album', 338),
 ('day', 329),
 ('product', 322),
 ('ranking', 302),
 ('gender', 287),
 ('city', 286),
 ('team name', 283),
 ('code', 274),
 ('class', 253),
 ('person', 221),
 ('owner', 219),
 ('weight', 203),
 ('status', 197),
 ('brand', 189),
 ('year', 189),
 ('credit', 176),
 ('result', 174),
 ('manufacturer', 171),
 ('address', 171),
 ('service', 167),
 ('order', 165),
 ('sex', 164),
 ('duration', 155),
 ('age', 153),
 ('sales', 152),
 ('country', 152),
 ('plays', 147),
 ('component', 147),
 ('origin', 144),
 ('range', 139),
 ('club', 133),
 ('nationality', 131),
 ('state', 129),
 ('county', 127),
 ('format', 120),
 ('director', 102),
 ('capacity', 101),
 ('command', 100),
 ('symbol', 94),
 ('publisher', 89),
 ('classification', 81),
 ('depth', 80),


In [19]:
# Load the raw test values so individual samples can be inspected by index.
test_samples = pd.read_parquet('../data/data/raw/test_values.parquet')

In [20]:
# Position of the test sample to inspect.
idx = 1001

# Each cell of the selected row holds a Python literal serialized as text;
# parse every cell back into a Python object.
original = test_samples.iloc[idx]
converted = [literal_eval(cell) for cell in original]

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

print(f'Finished at {datetime.now()}')

Predicted "address", actual label "address". Actual values:
[['Cabot House', 'Cabot House', '5 Hill Rd.', '5 Hill Rd.', '9 Cabot Rd.', '9 Cabot Rd.', 'Cabot House', '22 Bank Rd.', '22 Bank Rd.', 'Cabot House', '31 Bank Rd.', '31 Bank Rd.', 'Bairds Hotel', '11 Cabot Rd.', '11 Cabot Rd.', '10 Hill Rd.', '10 Hill Rd.', '10 Hill Rd.', '10 Hill Rd.', '7A Church Rd.', '1 Cabot Rd.', '1 Cabot Rd.', '1 Cabot Rd.', '1 Cabot Rd.', '2 Coronation St.', '2 Coronation St.', '7A Church Rd.', '12 Hill Rd.', '12 Hill Rd.', '12 Hill Rd.', 'Cabot House', '19 Bank Rd.', '19 Bank Rd.', '19 Bank Rd.', '19 Bank Rd.', '19 Bank Rd.', '7A Church Rd.', '18 Mill Rd.', '17 Hill Rd.', '17 Hill Rd.', 'Cabot House', 'Cabot House', '25 Bank Rd.', '10 Coronation St.', '6 Cabot Rd.', '6 Cabot Rd.', '8 Hill Rd.', '8 Hill Rd.', '4 Mill Rd.', '4 Mill Rd.', '12 Sulva Rd.', '4 Haig Rd.', '13 Botwood Rd.', '13 Botwood Rd.', '8 Botwood Rd.', '8 Botwood Rd.', '16 Botwood Rd.', '16 Botwood Rd.', '16 Botwood Rd.', '16 Botwood Rd.