In [6]:
#%pip install lime

In [21]:
import utils_data as utils
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torch.optim import Adam

import math

from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import gc

#from lime import lime_text
from numpy import random

device = "cuda" if torch.cuda.is_available() else "cpu"

### Importation du dataset

In [8]:
text_train, text_val = utils.get_data_split()
label_train, label_val = utils.get_labels_split()
text_test = utils.get_test()

merged_train = utils.merge_data_labels(text_train, label_train)
merged_val = utils.merge_data_labels(text_val, label_val)

In [9]:
# # Solution pour randomiser la séparation train / val
text_all = utils.get_data()
labels_all = utils.get_labels()
limit = int(len(labels_all) * 0.8) + 1

merged_all = utils.merge_data_labels(text_all, labels_all)
random.shuffle(merged_all)

merged_train, merged_val = merged_all[:limit], merged_all[limit:] # Séparation entre les données de train et de validation (80% - 20%)
print(merged_train[0])

('"hello tomorrow, my stomach hurts "\n', 0)


Note importante : toute la suite est inspiré de : https://coderzcolumn.com/tutorials/artificial-intelligence/pytorch-rnn-for-text-classification-tasks, ainsi que https://n8henrie.com/2021/08/writing-a-transformer-classifier-in-pytorch/ et du cours IFT 6135-A2022.

### Création de tokens à partir des phrases

In [10]:
tokenizer = get_tokenizer("basic_english")

def build_vocabulary(datasets):
    for dataset in datasets:
        for text in dataset:
            yield tokenizer(text)

vocab = build_vocab_from_iterator(build_vocabulary([text_train, text_val, text_test]), min_freq=1, specials=["<UNK>"])
vocab.set_default_index(vocab["<UNK>"])
len(vocab)

781347

### Préparation des données

In [11]:
data_train, data_val = to_map_style_dataset(merged_train), to_map_style_dataset(merged_val)
data_test = to_map_style_dataset(text_test)

target_classes = ["negative", "neutral", "positive"]
max_words = 25

def vectorize_batch(batch):
    X, Y = list(zip(*batch))
    X = [vocab(tokenizer(text)) for text in X]
    X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.

    return torch.tensor(X, dtype=torch.int32), torch.tensor(Y, dtype = torch.long)


def vectorize_test_batch(batch):
    X = list(batch)
    X = [vocab(tokenizer(text)) for text in X]
    X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.

    return torch.tensor(X, dtype=torch.int32)

train_loader = DataLoader(data_train, batch_size=1024, collate_fn=vectorize_batch, shuffle=True)
val_loader = DataLoader(data_val, batch_size=1024, collate_fn=vectorize_batch)
test_loader = DataLoader(data_test, batch_size=1024, collate_fn=vectorize_test_batch)

### Définition du Transformer

In [12]:
class PositionalEncoding(nn.Module):
    """
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    """

    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(vocab_size, d_model)
        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)

In [25]:
embed_len = 50
hidden_dim = 50
n_layers = 4
p_dropout = 0.5

class Transformer(nn.Module):
  def __init__(self, nhead=5, dim_feedforward=2048, num_layers=6, dropout=0.1, activation="relu", classifier_dropout=0.1):

    super().__init__()
    self.d_model = embed_len

    assert self.d_model % nhead == 0, "nheads must divide evenly into d_model"
    self.emb = nn.Embedding(num_embeddings=len(vocab), embedding_dim=self.d_model)
    self.pos_encoder = PositionalEncoding(d_model=self.d_model, dropout=dropout, vocab_size=len(vocab))
    encoder_layer = nn.TransformerEncoderLayer(d_model=self.d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
    self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
    self.classifier = nn.Linear(self.d_model, len(target_classes))
    

  def forward(self, x):
    x = self.emb(x) * math.sqrt(self.d_model)
    x = self.pos_encoder(x)
    x = self.transformer_encoder(x)
    x = x.mean(dim=1)
    x = self.classifier(x)

    return x

### Entraînement du Transformer

In [18]:
def evaluate(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_true, Y_preds, losses = [],[],[]
        for X, Y in val_loader:
            Y_true.append(Y)
            Y = torch.tensor(utils.to_one_hot(Y.numpy(), len(target_classes)))
            X, Y = X.to(device), Y.to(device)
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())
            Y_preds.append(preds.argmax(dim=-1))

        Y_true = torch.cat(Y_true)
        Y_preds = torch.cat(Y_preds)

        print("Valid Loss : {:.3f} | Valid Acc : {:.3f}".format(torch.tensor(losses).mean(), accuracy_score(Y_true.detach().cpu().numpy(), Y_preds.detach().cpu().numpy())))


def train(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    for i in range(1, epochs+1):
        Y_true, Y_preds, losses = [],[],[]
        for X, Y in tqdm(train_loader):
            Y_true.append(Y)
            Y = torch.tensor(utils.to_one_hot(Y.numpy(), len(target_classes)))
            X, Y = X.to(device), Y.to(device)
            optimizer.zero_grad()
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())
            Y_preds.append(preds.argmax(dim=-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

        Y_true = torch.cat(Y_true)
        Y_preds = torch.cat(Y_preds)
            
        print("Epoch {} | Train Loss : {:.3f} | Train acc : {:.3f}".format(i, torch.tensor(losses).mean(), accuracy_score(Y_true.detach().cpu().numpy(), Y_preds.detach().cpu().numpy())))
        evaluate(model, loss_fn, val_loader)
        model.train()

In [29]:
epochs = 5
learning_rate = 0.001

loss_fn = nn.CrossEntropyLoss()
transformer = Transformer(nhead=5, dim_feedforward=50, dropout=0.0, classifier_dropout=0.0).to(device)
optimizer = Adam(transformer.parameters(), lr=learning_rate)

train(transformer, loss_fn, optimizer, train_loader, val_loader, epochs)

100%|██████████| 813/813 [04:00<00:00,  3.38it/s]


Epoch 1 | Train Loss : 0.573 | Train acc : 0.705
Valid Loss : 0.522 | Valid Acc : 0.756


100%|██████████| 813/813 [04:00<00:00,  3.38it/s]


Epoch 2 | Train Loss : 0.500 | Train acc : 0.767
Valid Loss : 0.491 | Valid Acc : 0.774


100%|██████████| 813/813 [04:00<00:00,  3.38it/s]


Epoch 3 | Train Loss : 0.474 | Train acc : 0.785
Valid Loss : 0.483 | Valid Acc : 0.779


100%|██████████| 813/813 [04:00<00:00,  3.38it/s]


Epoch 4 | Train Loss : 0.453 | Train acc : 0.797
Valid Loss : 0.481 | Valid Acc : 0.780


100%|██████████| 813/813 [04:00<00:00,  3.38it/s]


Epoch 5 | Train Loss : 0.432 | Train acc : 0.810
Valid Loss : 0.482 | Valid Acc : 0.781


### Prédictions sur l'ensemble de test

In [31]:
def predict(model, loader):
    Y_preds = []
    for X in tqdm(loader):
        X = X.to(device)
        preds = model(X)
        Y_preds.append(preds.detach().cpu())
    gc.collect()
    Y_preds = torch.cat(Y_preds)

    return F.softmax(Y_preds, dim=-1).argmax(dim=-1).numpy()

Y_preds = predict(transformer, test_loader)

100%|██████████| 548/548 [01:02<00:00,  8.83it/s]


### Enregistrement des résultats

In [32]:
utils.save_results(Y_preds, "Transformer")

### Explicabilité : Matrice de confusion et LIME

In [None]:
# val_loader_test = DataLoader(text_val, batch_size=1024, collate_fn=vectorize_test_batch)
# Y_preds_val = predict(transformer, val_loader_test)
# Y_actual_val = label_val[:, 1]

# print("Test Accuracy : {}".format(accuracy_score(Y_actual_val, Y_preds_val)))
# print("\nClassification Report : ")
# print(classification_report(Y_actual_val, Y_preds_val, target_names=target_classes))
# print("\nConfusion Matrix : ")
# print(confusion_matrix(Y_actual_val, Y_preds_val))

In [None]:
# X_test_text, Y_test = [], []
# for X, Y in merged_val:
#     X_test_text.append(X)
#     Y_test.append(Y)

# explainer = lime_text.LimeTextExplainer(class_names=target_classes, verbose=True)

# def make_predictions_lime(X_batch_text):
#     X = [vocab(tokenizer(text)) for text in X_batch_text]
#     X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.
#     logits = transformer(torch.tensor(X, dtype=torch.int32, device = device))
#     preds = F.softmax(logits, dim=-1)
#     return preds.detach().cpu().numpy()

# idx = int(random.uniform(0, len(Y_test), 1))
# X = [vocab(tokenizer(text)) for text in X_test_text[idx:idx+1]]
# X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.
# preds = transformer(torch.tensor(X, dtype=torch.int32, device = device))
# preds = F.softmax(preds, dim=-1)

# explanation = explainer.explain_instance(X_test_text[idx], classifier_fn=make_predictions_lime,
#                                          labels=Y_test[idx:idx+1])
# explanation.show_in_notebook()
# print("Prediction : ", target_classes[preds.argmax()])
# print("Actual :     ", target_classes[Y_test[idx]])