## Embedding des séquences avec CamemBERT puis classification par le TextClassifier de FLAIR

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import flair
from flair.data import Sentence
from flair.embeddings import CamembertEmbeddings
from flair.embeddings import TransformerWordEmbeddings
import pathlib
import os
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus

In [2]:
import torch

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced again in the visible cells.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
# Load the data sets. `test` is the final evaluation set for flair; it was
# annotated without active learning, because AL could introduce a bias.

def _load_split(csv_name):
    """Read one ';'-separated UTF-8 CSV and keep 'exemple' and 'label', indexed by 'index'."""
    raw = pd.read_csv(csv_name, sep=';', encoding='utf-8')
    return raw[['index', 'exemple', 'label']].set_index('index')


train_set = _load_split('train.csv')
dev_set = _load_split('dev.csv')
test_set = _load_split('test.csv')

# Embedding

In [4]:
# Embed the examples with CamemBERT
# (this step is somewhat slow)
# NOTE(review): the output under this cell looks like the tail of a warning —
# possibly a deprecation notice for CamembertEmbeddings; confirm against the
# installed flair version (TransformerWordEmbeddings is imported above but unused).

embedding = CamembertEmbeddings()

  after removing the cwd from sys.path.


In [5]:
# Corpus is the tool FLAIR provides for loading data sets.

# 1. get the corpus
# The train, test and dev files live in the current working directory.
path = os.getcwd()
data_folder = path

# column format indicating which CSV columns hold the text and the label(s)
# NOTE(review): assumes columns 2 and 3 of the CSV files are the text and the label — confirm against the files.
column_name_map = {2: "text", 3: "label_topic"}

# load the corpus (training, test and dev data), skipping the header row of each CSV
corpus: Corpus = CSVClassificationCorpus(
    data_folder,
    column_name_map,
    skip_header=True,
    delimiter=';',
)

# Print corpus statistics to get an idea of what the data sets look like; this
# also helps check that test and dev have not been mixed up.

import flair.datasets
stats = corpus.obtain_statistics()
print(stats)

2020-07-09 10:16:33,614 Reading data from /home/sophie/Documents/Classification_Flair_Camembert
2020-07-09 10:16:33,614 Train: /home/sophie/Documents/Classification_Flair_Camembert/train.csv
2020-07-09 10:16:33,614 Dev: /home/sophie/Documents/Classification_Flair_Camembert/dev.csv
2020-07-09 10:16:33,614 Test: /home/sophie/Documents/Classification_Flair_Camembert/test.csv
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 1781,
        "number_of_documents_per_class": {
            "0": 1416,
            "1": 365
        },
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 55574,
            "min": 1,
            "max": 56,
            "avg": 31.20381807973049
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 624,
        "number_of_documents_per_class": {
            "1": 34,
            "0": 590
        },
        "number_of_tokens_per_tag": {},
        "number_of_toke

In [6]:
# 2. create the label dictionary
# Built from the corpus loaded above; the log printed under this cell lists the labels found.
label_dict = corpus.make_label_dictionary()

2020-07-09 10:17:34,898 Computing label dictionary. Progress:


100%|██████████| 2405/2405 [00:00<00:00, 4541.95it/s]

2020-07-09 10:17:35,548 [b'0', b'1']





In [7]:
# 3. make a list of word embeddings
# Single-element list wrapping the CamemBERT embedding created earlier.
word_embeddings = [embedding]

In [8]:
from flair.embeddings import DocumentRNNEmbeddings

# 4. Build the document-level embedding on top of the list of word embeddings.
# Several RNN types are available (GRU by default; use the rnn_type parameter to change it).
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# TextClassifier

In [9]:
from flair.models import TextClassifier

In [10]:
# 5. create the text classifier
# Combines the document embedding with the label dictionary built from the corpus.
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

In [11]:
from flair.trainers import ModelTrainer

In [12]:
# 6. initialize the text classifier trainer
# The trainer is given the classifier and the corpus it will train and evaluate on.
trainer = ModelTrainer(classifier, corpus)

In [13]:
# 7. start the training
# The first argument is the output directory: the training log reports it as the
# "Model training base path", and a later cell loads final-model.pt from it.
# NOTE(review): the directory name 'ag_news' does not describe this corpus —
# presumably left over from a tutorial; renaming it also requires updating the load path.
trainer.train('resources/taggers/ag_news',
              learning_rate=0.2,
              mini_batch_size=16,
              anneal_factor=0.5,
              patience=5,
              max_epochs=10, # increase the number of epochs
              embeddings_storage_mode='gpu')

2020-07-09 10:19:16,511 ----------------------------------------------------------------------------------------------------
2020-07-09 10:19:16,513 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): CamembertEmbeddings(
        (model): CamembertModel(
          (embeddings): RobertaEmbeddings(
            (word_embeddings): Embedding(32005, 768, padding_idx=1)
            (position_embeddings): Embedding(514, 768, padding_idx=1)
            (token_type_embeddings): Embedding(1, 768)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (encoder): BertEncoder(
            (layer): ModuleList(
              (0): BertLayer(
                (attention): BertAttention(
                  (self): BertSelfAttention(
                    (query): Linear(in_features=768, out_features=768, bias=True)
              

2020-07-09 10:19:16,513 ----------------------------------------------------------------------------------------------------
2020-07-09 10:19:16,514 Corpus: "Corpus: 1781 train + 764 dev + 624 test sentences"
2020-07-09 10:19:16,514 ----------------------------------------------------------------------------------------------------
2020-07-09 10:19:16,514 Parameters:
2020-07-09 10:19:16,515  - learning_rate: "0.2"
2020-07-09 10:19:16,515  - mini_batch_size: "16"
2020-07-09 10:19:16,515  - patience: "5"
2020-07-09 10:19:16,515  - anneal_factor: "0.5"
2020-07-09 10:19:16,516  - max_epochs: "10"
2020-07-09 10:19:16,516  - shuffle: "True"
2020-07-09 10:19:16,516  - train_with_dev: "False"
2020-07-09 10:19:16,517  - batch_growth_annealing: "False"
2020-07-09 10:19:16,517 ----------------------------------------------------------------------------------------------------
2020-07-09 10:19:16,517 Model training base path: "resources/taggers/ag_news"
2020-07-09 10:19:16,518 --------------------

2020-07-09 10:21:38,979 epoch 6 - iter 44/112 - loss 0.50774995 - samples/sec: 105.82
2020-07-09 10:21:40,831 epoch 6 - iter 55/112 - loss 0.49121912 - samples/sec: 100.67
2020-07-09 10:21:42,659 epoch 6 - iter 66/112 - loss 0.49028140 - samples/sec: 105.27
2020-07-09 10:21:44,480 epoch 6 - iter 77/112 - loss 0.49536724 - samples/sec: 102.32
2020-07-09 10:21:46,416 epoch 6 - iter 88/112 - loss 0.49131831 - samples/sec: 99.70
2020-07-09 10:21:48,310 epoch 6 - iter 99/112 - loss 0.48387189 - samples/sec: 97.47
2020-07-09 10:21:50,144 epoch 6 - iter 110/112 - loss 0.48473317 - samples/sec: 101.45
2020-07-09 10:21:50,468 ----------------------------------------------------------------------------------------------------
2020-07-09 10:21:50,469 EPOCH 6 done: loss 0.4843 - lr 0.2000000
2020-07-09 10:21:58,225 DEV : loss 0.4095252454280853 - score 0.8207
2020-07-09 10:21:58,456 BAD EPOCHS (no improvement): 0
saving best model
2020-07-09 10:21:59,396 -------------------------------------------

{'test_score': 0.9455128205128205,
 'dev_score_history': [0.7630890052356021,
  0.8206806282722513,
  0.8206806282722513,
  0.8206806282722513,
  0.20157068062827224,
  0.8206806282722513,
  0.8219895287958116,
  0.837696335078534,
  0.8206806282722513,
  0.8259162303664922],
 'train_loss_history': [0.5402974475707326,
  0.5038986390988741,
  0.49641720338591505,
  0.47775933645399554,
  0.48482047686619417,
  0.48428113672084044,
  0.46529376287279384,
  0.452166155845459,
  0.42398965052728144,
  0.41737897374800276],
 'dev_loss_history': [0.6084395051002502,
  0.4857785701751709,
  0.4684421718120575,
  0.5038332343101501,
  1.449820876121521,
  0.4095252454280853,
  0.4399270713329315,
  0.3870841860771179,
  0.49985700845718384,
  0.38226747512817383]}

In [None]:
## To fine-tune the model, Flair offers a hyperparameter search
## NOTE(review): the original comment calls this a 'grid search'; the options are
## drawn through hyperopt's hp.choice — confirm whether the search is exhaustive or sampled.

# NOTE(review): NLPTaskDataFetcher, Path and StackedEmbeddings are imported here
# but not used in the visible cells.
from flair.hyperparameter.param_selection import TextClassifierParamSelector, OptimizationValue
from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter
from flair.data_fetcher import NLPTaskDataFetcher
from pathlib import Path
from flair.embeddings import StackedEmbeddings

# define your search space
search_space = SearchSpace()

# One embedding option (the CamemBERT embedding), four learning rates, three batch sizes.
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[[ embedding ]])
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32])

In [None]:
# Run the hyperparameter selector
# NOTE(review): confirm that 'rnn' is a value the installed flair version
# recognizes for document_embedding_type (some versions appear to check for 'lstm').

param_selector = TextClassifierParamSelector(
    corpus=corpus, 
    multi_label=False, 
    base_path='resources/results',
    document_embedding_type='rnn',
    max_epochs=10, 
    training_runs=1,
    optimization_value=OptimizationValue.DEV_SCORE
)

# NOTE(review): the search space defined in the previous cell has 1 x 4 x 3 = 12
# distinct combinations — confirm that 100 evaluations are intended.
param_selector.optimize(search_space, max_evals=100)

In [14]:
# Once the model is trained you can load it to predict the class of new sentences.

# Just call the predict method of the model.

# Reload the model file written under the training base path used above.
# NOTE(review): the training log mentions 'saving best model'; confirm that
# final-model.pt (rather than a best-model checkpoint) is the one intended.
classifier = TextClassifier.load('resources/taggers/ag_news/final-model.pt')


# create example sentence

sentence = Sentence('complication precoce : ACR en salle de catheterimse reanime par adrenaline et massage cardiaque no flow 1 min low flow 5 min')

# predict class and print; the prediction is read back from sentence.labels

classifier.predict(sentence)

print(sentence.labels)

2020-07-09 10:24:15,429 loading file resources/taggers/ag_news/final-model.pt
[0 (0.5260)]


# Evaluation

## Résultats pour le test set

In [25]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sn
import math

In [17]:
# Predict the label of every example in the test set

predicted_labels = []
for text in test_set['exemple']:
    sentence = Sentence(text)
    classifier.predict(sentence)
    # sentence.labels holds the prediction(s) attached by predict()
    predicted_labels.append(sentence.labels)

# One row per example, in test_set order. The column is renamed with a plain
# assignment: the previous chained form `df = y_pred.columns = [...]` also
# rebound `df` to the list ['result'] by accident.
y_pred = pd.DataFrame(predicted_labels)
y_pred.columns = ['result']

In [18]:
# Convert each predicted label into its integer class id.
# Reads the label's `value` attribute instead of slicing the first character of
# its string form, so class names longer than one character are handled too.
df = y_pred['result'].map(lambda label: int(label.value))

y_pred = df.tolist()

In [22]:
# True labels of the test set, as a plain Python list.
y_true = test_set["label"].tolist()

In [23]:
# Metrics on the test set
# NOTE(review): with average='micro' on single-label predictions, precision,
# recall and F1 coincide with accuracy (the four printed "global" values are
# identical) — the per-class values are the informative ones here.

F1 = f1_score(y_true, y_pred, average='micro')
F2 = f1_score(y_true, y_pred, average=None)
RC1 = recall_score(y_true, y_pred, average='micro')
RC2 = recall_score(y_true, y_pred, average=None)
PC1 = precision_score(y_true, y_pred, average='micro')
PC2 = precision_score(y_true, y_pred, average=None)
AC = accuracy_score(y_true, y_pred)

# Print each metric as a heading line followed by its value.
for heading, value in [
    ('global F1-score', F1),
    ('F1-score par classe', F2),
    ('Recall global', RC1),
    ('Recall par classe', RC2),
    ('Precision global', PC1),
    ('Precision par classe', PC2),
    ('Accuracy', AC),
]:
    print(heading)
    print(value)

# NOTE(review): average_precision_score is documented for continuous scores;
# here it receives hard 0/1 predictions — confirm this is intended.
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_true, y_pred)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))

global F1-score
0.9150641025641025
F1-score par classe
[0.95473954 0.31168831]
Recall global
0.9150641025641025
Recall par classe
[0.94745763 0.35294118]
Precision global
0.9150641025641025
Precision par classe
[0.96213425 0.27906977]
Accuracy
0.9150641025641025
Average precision-recall score: 0.13


In [None]:
# Confusion matrix for the test set (rows: true label, columns: predicted label)

labels = [1, 0]
CM = confusion_matrix(y_true, y_pred, labels=labels)

array = CM

print(CM)

df_cm = pd.DataFrame(array, index=labels, columns=labels)
sn.set(font_scale=1.4)
# fmt='d' prints the integer counts in full; the heatmap's default annotation
# format ('.2g') would render counts of 100 or more in scientific notation.
sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16}, cmap="YlGnBu")
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

## Résultats pour le dev set

In [27]:
# Get the list of predictions for the dev set

dev_labels = []
for text in dev_set['exemple']:
    sentence = Sentence(text)
    classifier.predict(sentence)
    # sentence.labels holds the prediction(s) attached by predict()
    dev_labels.append(sentence.labels)

# One row per example, in dev_set order. Plain assignment for the column name:
# the previous chained form `df = dev_pred.columns = [...]` also rebound `df`
# to a list by accident.
dev_pred = pd.DataFrame(dev_labels)
dev_pred.columns = ['result']

# Read the label's `value` attribute instead of slicing the first character of
# its string form, so class names longer than one character are handled too.
df = dev_pred['result'].map(lambda label: int(label.value))
dev_pred = df.tolist()

In [29]:
# True labels of the dev set, as a plain Python list.
dev_true = dev_set["label"].tolist()

In [31]:
# Metrics on the dev set
# NOTE(review): with average='micro' on single-label predictions, precision,
# recall and F1 coincide with accuracy (the four printed "global" values are
# identical) — the per-class values are the informative ones here.

F1 = f1_score(dev_true, dev_pred, average='micro')
F2 = f1_score(dev_true, dev_pred, average=None)
RC1 = recall_score(dev_true, dev_pred, average='micro')
RC2 = recall_score(dev_true, dev_pred, average=None)
PC1 = precision_score(dev_true, dev_pred, average='micro')
PC2 = precision_score(dev_true, dev_pred, average=None)
AC = accuracy_score(dev_true, dev_pred)
CM = confusion_matrix(dev_true, dev_pred, labels=[1, 0])

# Print each metric as a heading line followed by its value.
for heading, value in [
    ('global F1-score', F1),
    ('F1-score par classe', F2),
    ('Confusion matrix', CM),
    ('Recall global', RC1),
    ('Recall par classe', RC2),
    ('Precision global', PC1),
    ('Precision par classe', PC2),
    ('Accuracy', AC),
]:
    print(heading)
    print(value)

# NOTE(review): average_precision_score is documented for continuous scores;
# here it receives hard 0/1 predictions — confirm this is intended.
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(dev_true, dev_pred)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))

global F1-score
0.912303664921466
F1-score par classe
[0.94678316 0.75092937]
Confusion matrix
[[101  36]
 [ 31 596]]
Recall global
0.912303664921466
Recall par classe
[0.95055821 0.73722628]
Precision global
0.912303664921466
Precision par classe
[0.94303797 0.76515152]
Accuracy
0.912303664921466
Average precision-recall score: 0.61


In [None]:
# Confusion matrix for the dev set (rows: true label, columns: predicted label)

labels = [1, 0]
CM = confusion_matrix(dev_true, dev_pred, labels=labels)

array = CM

df_cm = pd.DataFrame(array, index=labels, columns=labels)
sn.set(font_scale=1.4)
# fmt='d' prints the integer counts in full; the heatmap's default annotation
# format ('.2g') would render counts of 100 or more in scientific notation.
sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16}, cmap="YlGn")
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# CLASSIFICATION DES SEQUENCES

In [27]:
# Load the unlabeled sequences; the column named 'Unnamed: 0' becomes the index.
# NOTE(review): presumably that column is a saved row index without a header — confirm.
unlab_set = pd.read_csv('unlab.csv', sep =';', encoding = "utf-8").set_index('Unnamed: 0')

In [None]:
# Classify the unlabeled sequences

unlab_labels = []
for text in unlab_set['texte_complication']:
    sentence = Sentence(text)
    classifier.predict(sentence)
    # sentence.labels holds the prediction(s) attached by predict()
    unlab_labels.append(sentence.labels)

# One row per sequence, in unlab_set order. Plain assignment for the column
# name: the previous chained form `df = unlab.columns = [...]` also rebound
# `df` to a list by accident.
unlab = pd.DataFrame(unlab_labels)
unlab.columns = ['result']

In [93]:
# Write the predictions for the unlabeled set to a ';'-separated UTF-8 CSV file.
unlab.to_csv("unlab_classifier.csv", sep=';', encoding = 'utf-8')