# This notebook enables training and testing of Sherlock.
The procedure is:
- Load the train, validation, and test datasets (these should already be preprocessed)
- Initialize model using the "pretrained" model or by training one from scratch.
- Evaluate and analyse the model predictions.

In [1]:
# Put a fixed hash seed into the process environment.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter startup, so setting it from a
# running kernel presumably only affects subprocesses started afterwards — confirm.
%env PYTHONHASHSEED=13
# Load the autoreload extension and use mode 2 so that edited modules are re-imported
# before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

env: PYTHONHASHSEED=13


In [2]:
# ID under which the retrained model is loaded/stored.
# Further down, predictions can also be made with the original model by using
# the ID "sherlock" instead.
model_id = 'retrained_sherlock'

In [3]:
from ast import literal_eval
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, classification_report

from sherlock.deploy.model import SherlockModel

## Load datasets for training, validation, testing

In [4]:
# Time the load of the training split so its cost shows up in the cell output.
start = datetime.now()
print(f'Started at {start}')

# Features come from the processed parquet file, labels from the raw labels file.
X_train = pd.read_parquet('../data/data/processed/train.parquet')
train_labels = pd.read_parquet('../data/data/raw/train_labels.parquet')

# Flatten the label frame to one dimension and lowercase every label.
y_train = np.array([label.lower() for label in train_labels.values.flatten()])

elapsed = datetime.now() - start
print(f'Load data (train) process took {elapsed} seconds.')

Started at 2022-12-07 16:27:46.475810
Load data (train) process took 0:00:01.770314 seconds.


In [5]:
# Number of distinct (lowercased) labels in the training split.
np.unique(y_train).size

42

In [6]:
# Collapse the per-column dtypes into a set so a single glance shows whether
# anything other than float32 slipped in.
distinct_dtypes = set(X_train.dtypes)

print('Distinct types for columns in the Dataframe (should be all float32):')
print(distinct_dtypes)

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [7]:
# Time the load of the validation split so its cost shows up in the cell output.
start = datetime.now()
print(f'Started at {start}')

# Features come from the processed parquet file, labels from the raw labels file.
X_validation = pd.read_parquet('../data/data/processed/validation.parquet')
validation_labels = pd.read_parquet('../data/data/raw/val_labels.parquet')

# Flatten the label frame to one dimension and lowercase every label.
y_validation = np.array([label.lower() for label in validation_labels.values.flatten()])

elapsed = datetime.now() - start
print(f'Load data (validation) process took {elapsed} seconds.')

Started at 2022-12-07 16:27:48.446367
Load data (validation) process took 0:00:00.410649 seconds.


In [8]:
# Load the test split: features from the processed parquet file, labels from
# the raw labels file (flattened to one dimension and lowercased).
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('../data/data/processed/test.parquet')
y_test = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()

y_test = np.array([x.lower() for x in y_test])

# Read the clock once so the reported finish time and the elapsed time agree
# (two separate datetime.now() calls give slightly inconsistent values).
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} seconds')

Started at 2022-12-07 16:27:48.901753
Finished at 2022-12-07 16:27:49.319021, took 0:00:00.417285 seconds


## Initialize the model
Two options:
- Load Sherlock model with pretrained weights
- Fit Sherlock model from scratch

In [9]:
# Same value as the `model_id` assigned near the top of the notebook.
# NOTE(review): presumably repeated so this section can be run on its own — confirm
# whether the duplicate assignment is intentional.
model_id = "retrained_sherlock"

In [10]:
# Initialize the model: try to load the stored model for `model_id` (with weights);
# if that fails, train a new one on the train/validation splits and store its weights.
model = SherlockModel()
try:
    model.initialize_model_from_json(with_weights=True, model_id=model_id)
except Exception as load_error:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, so interrupting the load does not start a training run.
    # The reason is reported because the fallback is expensive.
    # NOTE(review): the exception type raised for a missing model file is not visible
    # here; narrow this further once confirmed.
    print(f'Could not load model "{model_id}" ({load_error!r}); training a new one.')

    start = datetime.now()
    print(f'Started at {start}')
    # Model will be stored with ID `model_id`
    model.fit(X_train, y_train, X_validation, y_validation, model_id=model_id)

    # Store the weights before announcing success, so the message below is only
    # printed once the store call has returned without raising.
    model.store_weights(model_id=model_id)

    # Single clock read keeps the finish time and the elapsed time consistent.
    finished = datetime.now()
    print('Trained and saved new model.')
    print(f'Finished at {finished}, took {finished - start} seconds')

Started at 2022-12-07 16:27:49.401567


2022-12-07 16:27:50.277268: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-07 16:27:50.297034: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10000


W1207 16:27:51.540367 46912499975424 ag_logging.py:142] AutoGraph could not transform <function Model.make_train_function.<locals>.train_function at 0x2aab3f42e3a0> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'

W1207 16:28:08.724171 46912499975424 ag_logging.py:142] AutoGraph could not transform <function Model.make_test_function.<locals>.test_function at 0x2aad15a85670> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000


Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoch 68/10000
Epoch 69/10000
Epoch 70/10000
Epoch 71/10000
Epoch 72/10000
Epoch 73/10000
Epoch 74/10000
Epoch 75/10000
Epoch 76/10000
Epoch 77/10000
Epoch 78/10000
Epoch 79/10000
Epoch 80/10000
Epoch 81/10000
Epoch 82/10000
Epoch 83/10000
Epoch 84/10000
Trained and saved new model.
Finished at 2022-12-07 16:51:15.475127, took 0:23:26.073575 seconds


In [11]:
# Start from an empty list; the prediction cell below rebinds this name.
predicted_labels = list()

### Make predictions

In [12]:
# Predict a label for every test column, then lowercase the predictions so they
# compare directly against the lowercased `y_test`.
raw_predictions = model.predict(X_test, model_id)
predicted_labels = np.array([label.lower() for label in raw_predictions])

W1207 16:51:20.375893 46912499975424 ag_logging.py:142] AutoGraph could not transform <function Model.make_predict_function.<locals>.predict_function at 0x2aab57bbb0d0> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


In [13]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

# Score over the first len(y_test) entries of both arrays.
size = len(y_test)
expected_labels = y_test[:size]
scored_predictions = predicted_labels[:size]

# Should be fully deterministic too.
f1_score(expected_labels, scored_predictions, average="weighted")

prediction count 67959, type = <class 'numpy.ndarray'>


0.9199339349133125

In [14]:
# If using the original model, model_id should be replaced with "sherlock"
#model_id = "sherlock"
# NOTE(review): allow_pickle=True unpickles the file contents, which can execute
# arbitrary code; only load class files from a trusted source.
classes = np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)

report = classification_report(y_test, predicted_labels, output_dict=True)

# Keep only the per-class entries: the metrics must be a dict carrying an 'f1-score'
# and the label must be one of the model's classes.
class_scores = [
    (label, metrics)
    for label, metrics in report.items()
    if isinstance(metrics, dict) and 'f1-score' in metrics and label in classes
]

# Best-scoring classes first.
class_scores.sort(key=lambda entry: entry[1]['f1-score'], reverse=True)

### Top 5 Types

In [15]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

# `class_scores` is sorted by descending f1-score, so the first five are the best.
for label, metrics in class_scores[:5]:
    # Labels of 8+ characters get one tab, shorter ones two — presumably to keep the
    # first numeric column aligned at 8-column tab stops.
    tabs = '\t' if len(label) >= 8 else '\t\t'

    print(f"{label}{tabs}{metrics['f1-score']:.3f}\t\t{metrics['precision']:.3f}\t\t{metrics['recall']:.3f}\t\t{metrics['support']}")

		f1-score	precision	recall		support
lei		1.000		1.000		1.000		110
credit card account numbers	0.998		0.997		1.000		585
guuid		0.996		0.992		1.000		120
industry	0.987		0.986		0.987		2958
phone number	0.985		0.971		1.000		300


### Bottom 5 Types

In [16]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

# `class_scores` is sorted by descending f1-score, so the last five are the worst.
# `[-5:]` returns every entry when there are fewer than five; the previous
# `[len(class_scores)-5:len(class_scores)]` produced a negative start index in that
# case and silently dropped entries.
for key, value in class_scores[-5:]:
    # Labels of 8+ characters get one tab, shorter ones two — presumably to keep the
    # first numeric column aligned at 8-column tab stops.
    tabs = '\t' if len(key) >= 8 else '\t\t'

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
person		0.707		0.748		0.670		579
sales		0.660		0.753		0.587		322
fips code	0.640		0.718		0.577		97
naic		0.395		0.773		0.266		64
mcc code	0.000		0.000		0.000		29


### All Scores

In [17]:
# Plain-text report for all classes, with three decimal places.
full_report = classification_report(y_test, predicted_labels, digits=3)
print(full_report)

                             precision    recall  f1-score   support

                    address      0.944     0.947     0.946      3003
                        age      0.919     0.972     0.945      3033
                       area      0.900     0.852     0.875      1987
                 birth date      0.975     0.975     0.975       479
                birth place      0.977     0.909     0.942       418
                      brand      0.830     0.749     0.788       574
                       city      0.884     0.910     0.897      2966
                  continent      0.872     0.903     0.887       227
                    country      0.936     0.951     0.943      3038
                     county      0.956     0.955     0.955      2959
credit card account numbers      0.997     1.000     0.998       585
                   currency      0.978     0.973     0.975       405
                        day      0.936     0.917     0.926      3038
                   duration      

## Review errors

In [18]:
# Expected labels whose individual mismatches are printed for closer inspection.
# This must be a real tuple: `('address')` is just the string 'address', and `in`
# against a string is a substring test rather than a membership test.
labels_to_inspect = ('address',)

size = len(y_test)
mismatches = list()

# Collect the expected label of every wrong prediction.
for idx, expected in enumerate(y_test[:size]):
    predicted = predicted_labels[idx]

    if expected != predicted:
        mismatches.append(expected)

        # zoom in to specific errors. Use the index in the next step
        if expected in labels_to_inspect:
            print(f'[{idx}] expected "{expected}" but predicted "{predicted}"')

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Mismatch count per expected label, most frequent first.
data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

[478] expected "address" but predicted "name"
[1111] expected "address" but predicted "location"
[1201] expected "address" but predicted "location"
[1585] expected "address" but predicted "city"
[2041] expected "address" but predicted "county"
[2059] expected "address" but predicted "city"
[2393] expected "address" but predicted "location"
[2609] expected "address" but predicted "name"
[3342] expected "address" but predicted "location"
[4134] expected "address" but predicted "location"
[4428] expected "address" but predicted "location"
[4537] expected "address" but predicted "location"
[4640] expected "address" but predicted "city"
[4787] expected "address" but predicted "rank"
[5261] expected "address" but predicted "county"
[5685] expected "address" but predicted "location"
[6033] expected "address" but predicted "location"
[7008] expected "address" but predicted "location"
[7281] expected "address" but predicted "city"
[7390] expected "address" but predicted "location"
[7404] expect

[('location', 430),
 ('region', 408),
 ('rank', 387),
 ('name', 373),
 ('area', 294),
 ('city', 266),
 ('day', 252),
 ('type', 247),
 ('order', 246),
 ('product', 216),
 ('person', 191),
 ('address', 158),
 ('status', 155),
 ('country', 150),
 ('brand', 144),
 ('duration', 141),
 ('manufacturer', 138),
 ('year', 137),
 ('county', 134),
 ('sales', 133),
 ('state', 114),
 ('range', 101),
 ('nationality', 85),
 ('age', 85),
 ('sex', 65),
 ('naic', 47),
 ('language', 41),
 ('symbol', 41),
 ('fips code', 41),
 ('birth place', 38),
 ('industry', 38),
 ('mcc code', 29),
 ('zip code', 23),
 ('continent', 22),
 ('birth date', 12),
 ('currency', 11),
 ('social_security', 9),
 ('tax_id', 1)]

In [19]:
# Load the raw test values so individual samples can be inspected by index below.
test_samples = pd.read_parquet('../data/data/raw/test_values.parquet')

In [20]:
# Row position of the sample to inspect (e.g. an index printed by the mismatch review).
idx = 57

# Each entry of the selected row is passed through literal_eval, i.e. it is expected
# to hold the textual form of a Python literal.
original = test_samples.iloc[idx]
converted = [literal_eval(cell) for cell in original]

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "symbol", actual label "symbol". Actual values:
[['VASC', 'TNK', 'NAT', 'GPRO', 'MANH']]


In [21]:
# Display the complete list of per-class scores (sorted by descending f1-score).
class_scores

[('lei', {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 110}),
 ('credit card account numbers',
  {'precision': 0.9965928449744463,
   'recall': 1.0,
   'f1-score': 0.9982935153583617,
   'support': 585}),
 ('guuid',
  {'precision': 0.9917355371900827,
   'recall': 1.0,
   'f1-score': 0.995850622406639,
   'support': 120}),
 ('industry',
  {'precision': 0.9861533265788585,
   'recall': 0.9871534820824882,
   'f1-score': 0.9866531508700793,
   'support': 2958}),
 ('phone number',
  {'precision': 0.970873786407767,
   'recall': 1.0,
   'f1-score': 0.9852216748768473,
   'support': 300}),
 ('tax_id',
  {'precision': 1.0,
   'recall': 0.9666666666666667,
   'f1-score': 0.983050847457627,
   'support': 30}),
 ('sex',
  {'precision': 0.9842228935884525,
   'recall': 0.9783116449783117,
   'f1-score': 0.9812583668005355,
   'support': 2997}),
 ('currency',
  {'precision': 0.9776674937965261,
   'recall': 0.9728395061728395,
   'f1-score': 0.9752475247524752,
   'support': 405})