In [3]:
#%pip install lime

In [4]:
import utils_data as utils
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torch.optim import Adam

from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import gc

#from lime import lime_text
from numpy import random

device = "cuda" if torch.cuda.is_available() else "cpu"

Mounted at /content/drive


### Importation du dataset

In [5]:
text_train, text_val = utils.get_data_split()
label_train, label_val = utils.get_labels_split()
text_test = utils.get_test()

merged_train = utils.merge_data_labels(text_train, label_train)
merged_val = utils.merge_data_labels(text_val, label_val)

In [6]:
# Solution pour randomiser la séparation train / val
text_all = utils.get_data()
labels_all = utils.get_labels()
limit = int(len(labels_all) * 0.85) + 1

merged_all = utils.merge_data_labels(text_all, labels_all)
random.shuffle(merged_all)

merged_train, merged_val = merged_all[:limit], merged_all[limit:] # Séparation entre les données de train et de validation (85% - 15%)

### Séparation des données pour chaque expert

In [7]:
nb_experts = 5
limit_expert = int(len(labels_all) * 0.8) + 1

merged_train_set = []
for i in range(nb_experts):
  random.shuffle(merged_all)
  merged_train_set.append(merged_all[:limit_expert])

Note importante : toute la suite est inspiré de : https://coderzcolumn.com/tutorials/artificial-intelligence/pytorch-rnn-for-text-classification-tasks, ainsi que du cours IFT 6135-A2022.

### Création de tokens à partir des phrases

In [8]:
tokenizer = get_tokenizer("basic_english")

def build_vocabulary(datasets):
    for dataset in datasets:
        for text in dataset:
            yield tokenizer(text)

vocab = build_vocab_from_iterator(build_vocabulary([text_train, text_val, text_test]), min_freq=1, specials=["<UNK>"])
vocab.set_default_index(vocab["<UNK>"])
len(vocab)

781347

### Préparation des données

In [9]:
data_train_experts = []
data_val = to_map_style_dataset(merged_val)
data_test = to_map_style_dataset(text_test)
for i in range(nb_experts):
  data_train_experts.append(to_map_style_dataset(merged_train_set[i]))

target_classes = ["negative", "neutral", "positive"]
max_words = 25

def vectorize_batch(batch):
    X, Y = list(zip(*batch))
    X = [vocab(tokenizer(text)) for text in X]
    X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.

    return torch.tensor(X, dtype=torch.int32), torch.tensor(Y, dtype = torch.long)


def vectorize_test_batch(batch):
    X = list(batch)
    X = [vocab(tokenizer(text)) for text in X]
    X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.

    return torch.tensor(X, dtype=torch.int32)

train_loader_experts = []
val_loader = DataLoader(data_val, batch_size=1024, collate_fn=vectorize_batch)
test_loader = DataLoader(data_test, batch_size=1024, collate_fn=vectorize_test_batch)

for i in range(nb_experts):
  train_loader_experts.append(DataLoader(data_train_experts[i], batch_size=1024, collate_fn=vectorize_batch, shuffle=True))

### Définition du RNN (LSTM)

In [10]:
embed_len = 50
hidden_dim = 50
n_layers = 4
p_dropout = 0.5

class RNNClassifier(nn.Module):
    def __init__(self):
        super(RNNClassifier, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_len)
        self.rnn = nn.LSTM(input_size=embed_len, hidden_size=hidden_dim, num_layers=n_layers, dropout=p_dropout, batch_first=True)
        self.linear = nn.Linear(hidden_dim, len(target_classes))

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch).to(device)
        output, hidden = self.rnn(embeddings)
        return self.linear(output[:,-1])

### Entraînement du RNN

In [11]:
def evaluate(model, loss_fn, val_loader):
    with torch.no_grad():
        Y_losses_experts, Y_preds_experts = [], []
        for i in range(nb_experts):
          model[i].eval()
          Y_true, Y_preds, losses = [],[],[]
          for X, Y in val_loader:
              Y_true.append(Y)
              Y = torch.tensor(utils.to_one_hot(Y.numpy(), len(target_classes)))
              X, Y = X.to(device), Y.to(device)
              preds = model[i](X)
              loss = loss_fn(preds, Y)
              losses.append(loss.item())
              Y_preds.append(preds)

          Y_true = torch.cat(Y_true)
          Y_preds = torch.cat(Y_preds)

          Y_losses_experts.append(torch.tensor(losses).mean())
          Y_preds_experts.append(Y_preds)

          model[i].train()

        Y_losses_experts = torch.tensor(Y_losses_experts).mean()
        Y_preds_experts = F.softmax(torch.stack(Y_preds_experts, dim = -1).mean(dim = -1), dim = -1).argmax(dim = -1)

        print("Valid Loss : {:.3f} | Valid Acc : {:.3f}".format(Y_losses_experts, accuracy_score(Y_true.detach().cpu().numpy(), Y_preds_experts.detach().cpu().numpy())))
        
def train(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    for epoch in range(1, epochs+1):
      Y_losses_experts, Y_acc_experts = [], []
      for i in range(nb_experts):
        Y_true, Y_preds, losses = [],[],[]
        for X, Y in tqdm(train_loader[i]):
            Y_true.append(Y)
            Y = torch.tensor(utils.to_one_hot(Y.numpy(), len(target_classes)))
            X, Y = X.to(device), Y.to(device)
            optimizer[i].zero_grad()
            preds = model[i](X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())
            Y_preds.append(F.softmax(preds, dim = -1).argmax(dim = -1))
            loss.backward()
            optimizer[i].step()

        Y_true = torch.cat(Y_true)
        Y_preds = torch.cat(Y_preds)

        Y_losses_experts.append(torch.tensor(losses).mean())
        Y_acc_experts.append(accuracy_score(Y_true.detach().cpu().numpy(), Y_preds.detach().cpu().numpy()))

      Y_losses_experts = torch.tensor(Y_losses_experts).mean()
      Y_acc_experts = torch.tensor(Y_acc_experts).mean()
            
      print("Epoch {} | Train Loss (moy) : {:.3f} | Train acc (moy) : {:.3f}".format(epoch, Y_losses_experts, Y_acc_experts))
      evaluate(model, loss_fn, val_loader)

In [12]:
epochs = 10
learning_rate = 0.001

loss_fn = nn.CrossEntropyLoss()
rnn_classifiers, optimizers = [], []
for i in range(nb_experts):
  rnn_classifiers.append(RNNClassifier().to(device))
  optimizers.append(Adam(rnn_classifiers[i].parameters(), lr=learning_rate))

train(rnn_classifiers, loss_fn, optimizers, train_loader_experts, val_loader, epochs)

100%|██████████| 813/813 [00:29<00:00, 27.26it/s]
100%|██████████| 813/813 [00:28<00:00, 28.47it/s]
100%|██████████| 813/813 [00:27<00:00, 29.21it/s]
100%|██████████| 813/813 [00:27<00:00, 29.53it/s]
100%|██████████| 813/813 [00:27<00:00, 29.31it/s]


Epoch 1 | Train Loss (moy) : 0.610 | Train acc (moy) : 0.633
Valid Loss : 0.471 | Valid Acc : 0.795


100%|██████████| 813/813 [00:27<00:00, 29.11it/s]
100%|██████████| 813/813 [00:27<00:00, 29.05it/s]
100%|██████████| 813/813 [00:28<00:00, 28.59it/s]
100%|██████████| 813/813 [00:28<00:00, 28.16it/s]
100%|██████████| 813/813 [00:28<00:00, 28.60it/s]


Epoch 2 | Train Loss (moy) : 0.446 | Train acc (moy) : 0.792
Valid Loss : 0.415 | Valid Acc : 0.822


100%|██████████| 813/813 [00:27<00:00, 29.49it/s]
100%|██████████| 813/813 [00:27<00:00, 29.38it/s]
100%|██████████| 813/813 [00:28<00:00, 28.91it/s]
100%|██████████| 813/813 [00:27<00:00, 29.40it/s]
100%|██████████| 813/813 [00:28<00:00, 28.93it/s]


Epoch 3 | Train Loss (moy) : 0.406 | Train acc (moy) : 0.816
Valid Loss : 0.387 | Valid Acc : 0.840


100%|██████████| 813/813 [00:28<00:00, 28.81it/s]
100%|██████████| 813/813 [00:27<00:00, 29.61it/s]
100%|██████████| 813/813 [00:28<00:00, 29.03it/s]
100%|██████████| 813/813 [00:27<00:00, 29.93it/s]
100%|██████████| 813/813 [00:27<00:00, 29.15it/s]


Epoch 4 | Train Loss (moy) : 0.376 | Train acc (moy) : 0.833
Valid Loss : 0.363 | Valid Acc : 0.855


100%|██████████| 813/813 [00:27<00:00, 29.69it/s]
100%|██████████| 813/813 [00:27<00:00, 29.62it/s]
100%|██████████| 813/813 [00:27<00:00, 29.07it/s]
100%|██████████| 813/813 [00:28<00:00, 28.85it/s]
100%|██████████| 813/813 [00:27<00:00, 29.11it/s]


Epoch 5 | Train Loss (moy) : 0.349 | Train acc (moy) : 0.848
Valid Loss : 0.341 | Valid Acc : 0.870


100%|██████████| 813/813 [00:27<00:00, 29.73it/s]
100%|██████████| 813/813 [00:28<00:00, 29.01it/s]
100%|██████████| 813/813 [00:27<00:00, 29.87it/s]
100%|██████████| 813/813 [00:27<00:00, 29.85it/s]
100%|██████████| 813/813 [00:27<00:00, 29.20it/s]


Epoch 6 | Train Loss (moy) : 0.322 | Train acc (moy) : 0.863
Valid Loss : 0.321 | Valid Acc : 0.884


100%|██████████| 813/813 [00:28<00:00, 28.80it/s]
100%|██████████| 813/813 [00:27<00:00, 29.63it/s]
100%|██████████| 813/813 [00:27<00:00, 29.25it/s]
100%|██████████| 813/813 [00:27<00:00, 29.88it/s]
100%|██████████| 813/813 [00:27<00:00, 29.06it/s]


Epoch 7 | Train Loss (moy) : 0.296 | Train acc (moy) : 0.876
Valid Loss : 0.305 | Valid Acc : 0.895


100%|██████████| 813/813 [00:27<00:00, 29.53it/s]
100%|██████████| 813/813 [00:27<00:00, 29.47it/s]
100%|██████████| 813/813 [00:27<00:00, 29.04it/s]
100%|██████████| 813/813 [00:28<00:00, 28.91it/s]
100%|██████████| 813/813 [00:27<00:00, 29.04it/s]


Epoch 8 | Train Loss (moy) : 0.271 | Train acc (moy) : 0.889
Valid Loss : 0.291 | Valid Acc : 0.907


100%|██████████| 813/813 [00:27<00:00, 29.75it/s]
100%|██████████| 813/813 [00:27<00:00, 29.72it/s]
100%|██████████| 813/813 [00:28<00:00, 28.93it/s]
100%|██████████| 813/813 [00:27<00:00, 29.80it/s]
100%|██████████| 813/813 [00:27<00:00, 29.17it/s]


Epoch 9 | Train Loss (moy) : 0.249 | Train acc (moy) : 0.900
Valid Loss : 0.278 | Valid Acc : 0.914


100%|██████████| 813/813 [00:28<00:00, 28.80it/s]
100%|██████████| 813/813 [00:27<00:00, 29.86it/s]
100%|██████████| 813/813 [00:27<00:00, 29.34it/s]
100%|██████████| 813/813 [00:27<00:00, 29.85it/s]
100%|██████████| 813/813 [00:27<00:00, 29.28it/s]


Epoch 10 | Train Loss (moy) : 0.229 | Train acc (moy) : 0.909
Valid Loss : 0.269 | Valid Acc : 0.922


### Prédictions sur l'ensemble de test

In [None]:
def predict(model, loader):
    Y_preds_experts = []
    for i in range(nb_experts):
      Y_preds = []
      for X in tqdm(loader):
          X = X.to(device)
          preds = model[i](X)
          Y_preds.append(preds.detach().cpu())
      gc.collect()
      Y_preds = torch.cat(Y_preds)
    Y_preds_experts.append(Y_preds)

    return F.softmax(torch.stack(Y_preds_experts, dim = -1).mean(dim = -1), dim = -1).argmax(dim = -1).numpy()

Y_preds = predict(rnn_classifiers, test_loader)

100%|██████████| 548/548 [00:13<00:00, 42.12it/s]
100%|██████████| 548/548 [00:12<00:00, 42.71it/s]
 23%|██▎       | 128/548 [00:02<00:12, 33.63it/s]

### Enregistrement des résultats

In [14]:
utils.save_results(Y_preds, "RNN_ensemble")

### Explicabilité : Matrice de confusion et LIME

In [15]:
# val_loader_test = DataLoader(text_val, batch_size=1024, collate_fn=vectorize_test_batch)
# Y_preds_val = predict(rnn_classifier, val_loader_test)
# Y_actual_val = label_val[:, 1]

# print("Test Accuracy : {}".format(accuracy_score(Y_actual_val, Y_preds_val)))
# print("\nClassification Report : ")
# print(classification_report(Y_actual_val, Y_preds_val, target_names=target_classes))
# print("\nConfusion Matrix : ")
# print(confusion_matrix(Y_actual_val, Y_preds_val))

In [16]:
# X_test_text, Y_test = [], []
# for X, Y in merged_val:
#     X_test_text.append(X)
#     Y_test.append(Y)

# explainer = lime_text.LimeTextExplainer(class_names=target_classes, verbose=True)

# def make_predictions_lime(X_batch_text):
#     X = [vocab(tokenizer(text)) for text in X_batch_text]
#     X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.
#     logits = rnn_classifier(torch.tensor(X, dtype=torch.int32, device = device))
#     preds = F.softmax(logits, dim=-1)
#     return preds.detach().cpu().numpy()

# idx = int(random.uniform(0, len(Y_test), 1))
# X = [vocab(tokenizer(text)) for text in X_test_text[idx:idx+1]]
# X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.
# preds = rnn_classifier(torch.tensor(X, dtype=torch.int32, device = device))
# preds = F.softmax(preds, dim=-1)

# explanation = explainer.explain_instance(X_test_text[idx], classifier_fn=make_predictions_lime,
#                                          labels=Y_test[idx:idx+1])
# explanation.show_in_notebook()
# print("Prediction : ", target_classes[preds.argmax()])
# print("Actual :     ", target_classes[Y_test[idx]])