In [1]:
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from spacy_lefff import LefffLemmatizer, POSTagger
import json

import numpy as np


from spacy.util import minibatch, compounding
import random

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import seaborn as sn
import matplotlib.pyplot as plt

2020-11-12 17:00:52,609 - matplotlib - DEBUG - (private) matplotlib data path: /home/otavio/anaconda3/envs/engsw/lib/python3.8/site-packages/matplotlib/mpl-data
2020-11-12 17:00:52,610 - matplotlib - DEBUG - matplotlib data path: /home/otavio/anaconda3/envs/engsw/lib/python3.8/site-packages/matplotlib/mpl-data
2020-11-12 17:00:52,612 - matplotlib - DEBUG - CONFIGDIR=/home/otavio/.config/matplotlib
2020-11-12 17:00:52,614 - matplotlib - DEBUG - matplotlib version 3.3.2
2020-11-12 17:00:52,615 - matplotlib - DEBUG - interactive is False
2020-11-12 17:00:52,615 - matplotlib - DEBUG - platform is linux


2020-11-12 17:00:52,767 - matplotlib - DEBUG - CACHEDIR=/home/otavio/.cache/matplotlib
2020-11-12 17:00:52,769 - matplotlib.font_manager - DEBUG - Using fontManager instance from /home/otavio/.cache/matplotlib/fontlist-v330.json
2020-11-12 17:00:52,840 - matplotlib.pyplot - DEBUG - Loaded backend module://ipykernel.pylab.backend_inline version unknown.
2020-11-12 17:00:52,844 - matplotlib.pyplot - DEBUG - Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [2]:
def load_json_data(filename: str) -> dict:
    """
    Load the content of the json file and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed even when
    the JSON is malformed and ``json.load`` raises. It is read as UTF-8 (the
    standard encoding for JSON) so accented characters decode the same way
    regardless of the platform's default locale.

    :param filename: path of the JSON file to read.
    :return: whatever the top-level JSON value decodes to. NOTE(review): the
        annotation says ``dict``, but the rest of this notebook iterates over
        the result as a list of records.
    :raises OSError: if the file cannot be opened.
    :raises json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)


def get_raw_training_data(path: str = './training_set.json'):
    """
    Read the raw training examples from the JSON file at `path`.

    Delegates to `load_json_data` and returns its result unchanged.
    """
    training_examples = load_json_data(path)
    return training_examples


def get_raw_testing_data(path: str = './testing_set.json'):
    """
    Read the raw testing examples from the JSON file at `path`.

    Delegates to `load_json_data` and returns its result unchanged.
    """
    testing_examples = load_json_data(path)
    return testing_examples

In [3]:
# Load the training examples and collect the gold intent of each one.
raw_training_data = get_raw_training_data()
y = [example['intent'] for example in raw_training_data]

In [4]:
# Start from the French pipeline and append a text-categorizer component.
nlp = spacy.load('fr_core_news_sm')

textcat = nlp.create_pipe(
    "textcat",
    config={"exclusive_classes": True, "architecture": "simple_cnn"},
)
nlp.add_pipe(textcat, last=True)
nlp.pipe_names

['tagger', 'parser', 'ner', 'textcat']

In [5]:
# Register every distinct training intent as a textcat label.
# dict.fromkeys() de-duplicates while keeping first-appearance order, so the
# labels are added in the same order on every run. Iterating a set of strings
# instead follows hash order, which changes between kernel restarts because
# string hashing is randomized per process.
for label in dict.fromkeys(y):
    textcat.add_label(label)

In [6]:
def format_data(data, intents=None):
    """
    Split raw examples into texts and per-example category dicts.

    Each element of `data` must be a mapping with a 'sentence' key (the text)
    and an 'intent' key (the gold label). For every example a dict is built
    that maps each known intent to True when it equals the example's intent
    and to False otherwise.

    `data` is traversed exactly once, so one-shot iterables (generators,
    iterators) are handled correctly.

    :param data: iterable of examples with 'sentence' and 'intent' keys.
    :param intents: optional iterable of intent names to use as category
        keys. Defaults to the eight intents this notebook was written for.
    :return: tuple ``(texts, cats)`` of two lists of equal length.
    :raises KeyError: if an example lacks 'sentence' or 'intent'.
    """
    if intents is None:
        intents = ('find-around-me',
                   'purchase',
                   'find-hotel',
                   'provide-showtimes',
                   'irrelevant',
                   'find-train',
                   'find-flight',
                   'find-restaurant')
    else:
        # Materialize once so a generator of intents can be reused per example.
        intents = tuple(intents)

    texts = []
    cats = []
    for example in data:
        texts.append(example['sentence'])
        gold_intent = example['intent']
        cats.append({intent: gold_intent == intent for intent in intents})

    return texts, cats

In [7]:
# Split the raw training examples into texts and per-example category dicts.
train_texts, train_cats = format_data(raw_training_data)


In [8]:
# Pair each text with its annotation dict: (text, {"cats": {...}}).
train_data = [
    (text, {"cats": cats})
    for text, cats in zip(train_texts, train_cats)
]

In [9]:
def evaluate(tokenizer, textcat, texts, cats):
    """
    Compute precision, recall and F-score of `textcat` over `texts`.

    Every text is passed through `tokenizer`, the resulting docs are streamed
    through `textcat.pipe`, and each (label, score) pair in `doc.cats` is
    compared against the gold dict at the same position in `cats`, using 0.5
    as the decision threshold on both sides. Labels absent from the gold dict
    and the label "NEGATIVE" are ignored.

    :param tokenizer: callable turning a text into a doc.
    :param textcat: object whose `pipe(docs)` yields docs with a `cats` dict.
    :param texts: iterable of input texts.
    :param cats: sequence of gold dicts, indexable by position.
    :return: dict with keys "textcat_p", "textcat_r" and "textcat_f".
    """
    # False positives/negatives start at a tiny epsilon so the divisions
    # below never divide by zero.
    true_pos, false_pos, false_neg, true_neg = 0.0, 1e-8, 1e-8, 0.0

    tokenized = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(tokenized)):
        gold = cats[index]
        for label, score in doc.cats.items():
            if label not in gold or label == "NEGATIVE":
                continue
            expected = gold[label]
            if score >= 0.5:
                if expected >= 0.5:
                    true_pos += 1.0
                elif expected < 0.5:
                    false_pos += 1.0
            elif score < 0.5:
                if expected < 0.5:
                    true_neg += 1
                elif expected >= 0.5:
                    false_neg += 1

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    denominator = precision + recall
    f_score = 0.0 if denominator == 0 else 2 * (precision * recall) / denominator
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [10]:
# Train the textcat component on `train_data` for `n_iter` passes.
# Number of full passes (epochs) over train_data.
n_iter=10

# get names of other pipes to disable them during training
# NOTE(review): the "trf_*" names are presumably transformer components that
# must stay enabled when present; they are not in this pipeline — confirm.
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    # The triple-quoted block below is a bare string expression, i.e. disabled
    # code: it is evaluated as a string and discarded, nothing in it runs.
    '''
    if init_tok2vec is not None:
        with init_tok2vec.open("rb") as file_:
            textcat.model.tok2vec.from_bytes(file_.read())
    '''
    print("Training the model...")
    
    # print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
    # Created once, outside the epoch loop, so the same batch-size generator
    # is shared by (and keeps advancing across) all epochs.
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(n_iter):
        # Progress indicator: prints the epoch index only.
        print(i)
        # Reset every epoch; filled in by nlp.update but not read by any
        # active code in this cell.
        losses = {}
        # batch up the examples using spaCy's minibatch
        # random.shuffle reorders train_data in place; no seed is set in this cell.
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            # Unzip the (text, annotation) pairs of the batch into two tuples.
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        # Disabled per-epoch evaluation (string literal, never executed). It
        # refers to dev_texts / dev_cats, which are only defined in a later cell.
        '''
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print(
            "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
        )
        '''

Training the model...
0
1
2
3
4
5
6
7
8
9


In [11]:
# Load the held-out examples; keep the plain intent strings (y_true) as well
# as the texts and per-example category dicts.
raw_testing_data = get_raw_testing_data()
y_true = [record['intent'] for record in raw_testing_data]
dev_texts, dev_cats = format_data(raw_testing_data)

In [12]:
# Run the full pipeline on each held-out text and keep its category scores.
predict = [nlp(text).cats for text in dev_texts]

In [13]:
# For each score dict, the predicted intent is the label with the top score.
y_pred = [max(scores, key=scores.get) for scores in predict]

In [14]:
# Confusion matrix over the intents seen in training, plus the text report.
labels = list(set(y))
cm = confusion_matrix(y_true, y_pred, labels=labels)
# NOTE: `report` is a 1-tuple wrapping the report text (kept as-is because
# later code reads the text via report[0]).
report = (classification_report(y_true, y_pred, zero_division=0),)

In [15]:
def plot_confusion_matrix(cm, title='Confusion Matrix', fmt="d", labels=None):
    """
    Plot the specified confusion matrix with seaborn.

    The matrix is expected in scikit-learn's layout: row i holds the examples
    whose true label is i, column j those predicted as j. The heatmap's
    y axis (rows) is therefore titled "True label" and its x axis (columns)
    "Predicted label".

    :param cm: square confusion matrix, e.g. from
        `sklearn.metrics.confusion_matrix`.
    :param title: figure title.
    :param fmt: format string for the cell annotations ("d" for counts).
    :param labels: tick labels, in the same order as the rows/columns of
        `cm`. Pass the exact list given to `confusion_matrix(labels=...)`.
        When omitted, the labels are derived from the intents found in the
        testing set, which only lines up with `cm` if that happens to yield
        the same labels in the same order.
    """
    if labels is None:
        # Fallback kept for existing callers: intents present in the test set.
        labels = list(set([v['intent'] for v in get_raw_testing_data()]))

    plt.figure(figsize=(10, 7))

    ax = sn.heatmap(cm,
                    annot=True,
                    fmt=fmt,
                    cmap="Blues_r",
                    xticklabels=labels,
                    yticklabels=labels)

    ax.yaxis.set_ticklabels(ax.yaxis.get_ticklabels(), rotation=0, ha='right')
    ax.xaxis.set_ticklabels(ax.xaxis.get_ticklabels(), rotation=45, ha='right')

    # Rows of a scikit-learn confusion matrix are true labels, columns are
    # predictions, so the y axis is "True" and the x axis is "Predicted".
    ax.set(title=title,
           ylabel='True label',
           xlabel='Predicted label')

    plt.show()

In [21]:
# `report` is a 1-tuple wrapping the classification report text; index 0 is
# the text itself.
print(report[0])

                   precision    recall  f1-score   support

   find-around-me       0.46      0.76      0.58        67
      find-flight       0.80      0.50      0.62        24
       find-hotel       0.67      0.65      0.66        55
  find-restaurant       0.88      0.77      0.82        93
       find-train       0.72      0.86      0.78        21
       irrelevant       0.91      0.88      0.90       677
provide-showtimes       0.89      0.57      0.70        14
         purchase       0.74      0.71      0.73       114

         accuracy                           0.82      1065
        macro avg       0.76      0.71      0.72      1065
     weighted avg       0.84      0.82      0.83      1065

