In [1]:
# if needed in Colab run the following command:
# !pip install torch==2.1.0 torchtext==0.16.0
# !pip install portalocker>=2.0.0

import torch
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
import time

In [2]:
# Variables Globales
VAL_SIZE = 12500
TEST_SIZE = 12500
BATCH_SIZE = 8
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 1
DROPOUT = 0.2
NUM_LAYERS = 2
NUM_EPOCHS = 2
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cuda


In [3]:
# Télécharger et tokeniser les données
tokenizer = get_tokenizer("basic_english")

# Construire le vocabulaire
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

train_iter = IMDB(split='train')
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<pad>"])
vocab.set_default_index(vocab["<pad>"])  # Mot par défaut pour les mots inconnus
VOCAB_SIZE = len(vocab)
PAD_IDX = vocab["<pad>"]

In [4]:
# Fonction pour convertir le texte en indices
def text_pipeline(text):
    return vocab(tokenizer(text))

# Fonction pour préparer les étiquettes
def label_pipeline(label):
    return label - 1

# Préparer les données pour DataLoader
def collate_batch(batch):
    text_list, label_list = [], []
    for label, text in batch:
        text_list.append(torch.tensor(text_pipeline(text), dtype=torch.long))
        label_list.append(torch.tensor(label_pipeline(label), dtype=torch.float))
    text_list = pad_sequence(text_list, batch_first=True, padding_value=vocab["<pad>"])
    label_list = torch.tensor(label_list, dtype=torch.float)
    return text_list, label_list

train_iter, test_iter = IMDB()
val_datas, test_datas = random_split(list(test_iter), [VAL_SIZE, TEST_SIZE])
train_datas = list(train_iter)

train_loader = DataLoader(train_datas, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_datas, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_datas, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

# Print loaders' sizes
print(f"Taille du train loader : {len(train_loader.dataset)}")
print(f"Taille du validation loader : {len(val_loader.dataset)}")
print(f"Taille du test loader : {len(test_loader.dataset)}")

Taille du train loader : 25000
Taille du validation loader : 12500
Taille du test loader : 12500


In [5]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size=VOCAB_SIZE, pad_idx=PAD_IDX, embed_size=EMBEDDING_DIM, hidden_size=HIDDEN_DIM, output_size=OUTPUT_DIM, dropout=DROPOUT, num_layers=NUM_LAYERS):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx, sparse=False)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, _) = self.lstm(embedded)
        output = self.fc(hidden[-1])
        return self.sigmoid(output)

In [6]:
class GRUClassifier(nn.Module):
    def __init__(self, vocab_size=VOCAB_SIZE, embed_size=EMBEDDING_DIM, hidden_size=HIDDEN_DIM, output_size=OUTPUT_DIM, dropout=DROPOUT, num_layers=NUM_LAYERS):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers=num_layers,
                          dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        gru_out, _ = self.gru(embedded)
        last_hidden_state = gru_out[:, -1, :]
        output = self.fc(self.dropout(last_hidden_state))
        return self.sigmoid(output)

In [7]:
# Fonction pour entraîner le modèle
def train_epoch(model, data_loader, criterion, optimizer):
    model.train()
    total_loss, total_acc = 0, 0
    size_loader = 0
    for text, labels in tqdm(data_loader):
        text, labels = text.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        output = model(text).squeeze()
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc += ((output >= 0.5) == labels).sum().item()
        size_loader += labels.size(0)
    return total_loss / size_loader, total_acc / size_loader

# Fonction pour évaluer le modèle
def evaluate(model, data_loader, criterion):
    model.eval()
    total_loss, total_acc = 0, 0
    size_loader = 0
    with torch.no_grad():
        for text, labels in tqdm(data_loader):
            text, labels = text.to(DEVICE), labels.to(DEVICE)
            output = model(text).squeeze()
            loss = criterion(output, labels)
            total_loss += loss.item()
            total_acc += ((output >= 0.5) == labels).sum().item()
            size_loader += labels.size(0)
    return total_loss / size_loader, total_acc / size_loader

In [8]:
model = GRUClassifier()
model.to(DEVICE)
criterion = nn.BCELoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = evaluate(model, val_loader, criterion)
    end_time = time.time()

    print(f"Epoch: {epoch+1}/{NUM_EPOCHS} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Time: {end_time-start_time:.2f}s")

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch: 1/2 | Train Loss: 0.6972 | Train Acc: 0.5016 | Val Loss: 0.6933 | Val Acc: 0.5103 | Time: 36.83s


  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch: 2/2 | Train Loss: 0.4948 | Train Acc: 0.7342 | Val Loss: 0.2908 | Val Acc: 0.8809 | Time: 39.96s
