In [16]:
import utils_data as utils
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torch.optim import Adam

from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

device = "cuda" if torch.cuda.is_available() else "cpu"

### Importation du dataset

In [17]:
data_train, data_val = utils.get_data()
label_train, label_val = utils.get_labels()
data_test = utils.get_test()

merged_train = utils.merge_data_labels(data_train, label_train)
merged_val = utils.merge_data_labels(data_val, label_val)

Note importante : toute la suite est inspiré de : https://coderzcolumn.com/tutorials/artificial-intelligence/pytorch-rnn-for-text-classification-tasks, ainsi que du cours IFT 6135-A2022.

### Création de tokens à partir des phrases

In [18]:
tokenizer = get_tokenizer("basic_english")

def build_vocabulary(datasets):
    for dataset in datasets:
        for text in dataset:
            yield tokenizer(text)

vocab = build_vocab_from_iterator(build_vocabulary([data_train, data_val, data_test]), min_freq=1, specials=["<UNK>"])
vocab.set_default_index(vocab["<UNK>"])
len(vocab)

579310

### Préparation des données

In [19]:
data_train, data_val = to_map_style_dataset(merged_train), to_map_style_dataset(merged_val)
data_test = to_map_style_dataset(data_test)
target_classes = ["negative", "neutral", "positive"]
max_words = 100

def vectorize_batch(batch):
    X, Y = list(zip(*batch))
    X = [vocab(tokenizer(text)) for text in X]
    X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.

    return torch.tensor(X, dtype=torch.int32), torch.tensor(Y, dtype = torch.long)

def vectorize_test_batch(batch):
    X = list(batch)
    X = [vocab(tokenizer(text)) for text in X]
    X = [tokens+([0]* (max_words-len(tokens))) if len(tokens)<max_words else tokens[:max_words] for tokens in X] ## Bringing all samples to max_words length.

    return torch.tensor(X, dtype=torch.int32)

train_loader = DataLoader(data_train, batch_size=1024, collate_fn=vectorize_batch, shuffle=True)
val_loader = DataLoader(data_val, batch_size=1024, collate_fn=vectorize_batch)
test_loader = DataLoader(data_test, batch_size=1024, collate_fn=vectorize_test_batch)

### Définition du RNN

In [20]:
embed_len = 100
hidden_dim = 50
n_layers = 1

class RNNClassifier(nn.Module):
    def __init__(self):
        super(RNNClassifier, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_len)
        self.rnn = nn.RNN(input_size=embed_len, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, len(target_classes))

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch).to(device)
        output, hidden = self.rnn(embeddings, torch.randn(n_layers, len(X_batch), hidden_dim, device = device))
        return self.linear(output[:,-1])

rnn_classifier = RNNClassifier().to(device)
rnn_classifier

RNNClassifier(
  (embedding_layer): Embedding(579310, 100)
  (rnn): RNN(100, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=3, bias=True)
)

In [21]:
for layer in rnn_classifier.children():
    print("Layer : {}".format(layer))
    print("Parameters : ")
    for param in layer.parameters():
        print(param.shape)
    print()

Layer : Embedding(579310, 100)
Parameters : 
torch.Size([579310, 100])

Layer : RNN(100, 50, batch_first=True)
Parameters : 
torch.Size([50, 100])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50])

Layer : Linear(in_features=50, out_features=3, bias=True)
Parameters : 
torch.Size([3, 50])
torch.Size([3])



### Entraînement du RNN

In [22]:
def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    with torch.no_grad():
        Y_shuffled, Y_preds, losses = [],[],[]
        for X, Y in val_loader:
            X, Y = X.to(device), Y.to(device)
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_shuffled.append(Y)
            Y_preds.append(preds.argmax(dim=-1))

        Y_shuffled = torch.cat(Y_shuffled)
        Y_preds = torch.cat(Y_preds)

        print("Valid Loss : {:.3f}".format(torch.tensor(losses).mean()))
        print("Valid Acc  : {:.3f}".format(accuracy_score(Y_shuffled.detach().cpu().numpy(), Y_preds.detach().cpu().numpy())))


def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):
            X, Y = X.to(device), Y.to(device)
            Y_preds = model(X)

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

In [23]:
epochs = 15
learning_rate = 1e-3

rnn_classifier = RNNClassifier().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(rnn_classifier.parameters(), lr=learning_rate)

TrainModel(rnn_classifier, loss_fn, optimizer, train_loader, val_loader, epochs)

100%|██████████| 813/813 [00:27<00:00, 29.56it/s]


Train Loss : 0.699
Valid Loss : 0.694
Valid Acc  : 0.502


100%|██████████| 813/813 [00:27<00:00, 29.31it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.498


100%|██████████| 813/813 [00:27<00:00, 29.58it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.498


100%|██████████| 813/813 [00:27<00:00, 29.66it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.498


100%|██████████| 813/813 [00:27<00:00, 29.39it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.502


100%|██████████| 813/813 [00:27<00:00, 29.62it/s]


Train Loss : 0.694
Valid Loss : 0.695
Valid Acc  : 0.498


100%|██████████| 813/813 [00:27<00:00, 29.56it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.498


100%|██████████| 813/813 [00:28<00:00, 28.59it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.498


100%|██████████| 813/813 [00:27<00:00, 29.65it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.498


100%|██████████| 813/813 [00:27<00:00, 29.55it/s]


Train Loss : 0.694
Valid Loss : 0.695
Valid Acc  : 0.498


100%|██████████| 813/813 [00:27<00:00, 29.25it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.502


100%|██████████| 813/813 [00:27<00:00, 29.43it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.502


100%|██████████| 813/813 [00:27<00:00, 29.36it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.502


100%|██████████| 813/813 [00:27<00:00, 29.10it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.498


100%|██████████| 813/813 [00:27<00:00, 29.40it/s]


Train Loss : 0.694
Valid Loss : 0.694
Valid Acc  : 0.498


### Prédictions sur l'ensemble de test

In [24]:
def MakePredictions(model, loader):
    Y_preds = []
    for X in tqdm(loader):
        X = X.to(device)
        preds = model(X)
        Y_preds.append(preds.detach().cpu())
    gc.collect()
    Y_preds = torch.cat(Y_preds)

    return F.softmax(Y_preds, dim=-1).argmax(dim=-1).numpy()

Y_preds = MakePredictions(rnn_classifier, test_loader)

100%|██████████| 1016/1016 [00:21<00:00, 47.74it/s]


[2 2 2 2 2 2 2 2 2 2]


### Enregistrement des résultats

In [25]:
utils.save_results(Y_preds, "RNN")