In [1]:
from src.dataset import get_raw_training_data, get_raw_testing_data, get_labels
from src.spacy_helper import format_data_as_spacy, get_spacy_model, train, predict

from src.metrics import score
from src.visualization import plot_confusion_matrix

import numpy as np
import time

In [None]:
# Paths and training settings gathered in one place so they are easy to tune.
TRAINING_SET_PATH = './data/training_set.json'
TESTING_SET_PATH = './data/testing_set.json'
MODEL_DIR = './model'
N_ITERATIONS = 10  # iteration count handed to train()

# Load the raw records for both splits.
raw_training_data = get_raw_training_data(TRAINING_SET_PATH)
raw_testing_data = get_raw_testing_data(TESTING_SET_PATH)

# The label set is derived from the training split only.
labels = get_labels(raw_training_data)

# Ground-truth intents for each split, kept aside for scoring later on.
y_true = [record['intent'] for record in raw_testing_data]
y_true_t = [record['intent'] for record in raw_training_data]

# Convert both splits with the project's spaCy formatting helper.
X_train, y_train = format_data_as_spacy(raw_training_data)
X_test, y_test = format_data_as_spacy(raw_testing_data)

model = get_spacy_model(labels)

# NOTE(review): the test split is passed to train() as `test=`; presumably it
# is only used to report a score per iteration -- confirm in src.spacy_helper.
model = train(model, X_train, y_train, N_ITERATIONS, test=(X_test, y_test))

# Persist the trained model (plain string: there was nothing to interpolate).
model.to_disk(MODEL_DIR)

Training the model...
Iteration 0/10. train_loss: 16.647540825113538 test score:32.78635657227641%
Iteration 1/10. train_loss: 1.0961708325776272 test score:57.02361955408364%
Iteration 2/10. train_loss: 0.23071542207617313 test score:69.20552676043323%
Iteration 3/10. train_loss: 0.07479136149595433 test score:69.62322283852386%
Iteration 4/10. train_loss: 0.03675457484132494 test score:71.44087816568046%
Iteration 5/10. train_loss: 0.02437766979164735 test score:73.36817028440805%
Iteration 6/10. train_loss: 0.019655390253774385 test score:74.26034219550272%


In [None]:
# Predict on the test split first, then on the training split, so that both
# can be scored and compared (a large gap between the two hints at overfitting).
y_pred, y_pred_t = (predict(model, split) for split in (X_test, X_train))

In [None]:
# Score each split against its ground-truth intents (training first, then test).
# The resulting objects are indexed with 'report' and 'cm' further down.
training_results, testing_results = (
    score(y_true_t, y_pred_t),
    score(y_true, y_pred),
)

In [None]:
# Show the report computed for the test split.
# NOTE(review): presumably a text classification report -- confirm in src.metrics.score.
testing_report = testing_results['report']
print(testing_report)

In [None]:
# Row-normalize the test confusion matrix so that every non-empty row sums to 1.
# NOTE(review): assumes rows correspond to the true labels (the plot title says
# "normalized for recall") -- confirm against src.metrics.score.
cm_testing_metrics = np.array(testing_results['cm'], dtype='float64')
test_row_totals = cm_testing_metrics.sum(axis=1, keepdims=True)
# A row whose total is 0 would otherwise become NaN (0/0) and emit a
# RuntimeWarning; such rows are left at 0 instead.
cm_testing_metrics = np.divide(
    cm_testing_metrics,
    test_row_totals,
    out=np.zeros_like(cm_testing_metrics),
    where=test_row_totals != 0,
)
plot_confusion_matrix(cm_testing_metrics, labels, title="Test dataset Confusion Matrix (Normalized for recall)", fmt=".2f")

In [None]:
# Show the report computed for the training split.
# NOTE(review): presumably a text classification report -- confirm in src.metrics.score.
training_report = training_results['report']
print(training_report)

In [None]:
# Row-normalize the training confusion matrix so that every non-empty row sums to 1.
# Stored under its own name so the test-split matrix is not silently overwritten.
# NOTE(review): assumes rows correspond to the true labels (the plot title says
# "normalized for recall") -- confirm against src.metrics.score.
cm_training_metrics = np.array(training_results['cm'], dtype='float64')
train_row_totals = cm_training_metrics.sum(axis=1, keepdims=True)
# A row whose total is 0 would otherwise become NaN (0/0) and emit a
# RuntimeWarning; such rows are left at 0 instead.
cm_training_metrics = np.divide(
    cm_training_metrics,
    train_row_totals,
    out=np.zeros_like(cm_training_metrics),
    where=train_row_totals != 0,
)
plot_confusion_matrix(cm_training_metrics, labels, title="Training dataset Confusion Matrix (Normalized for recall)", fmt=".2f")