<a href="https://colab.research.google.com/github/RMoulla/BDA/blob/main/LSTM_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install portalocker>=2.0.0

In [2]:
import torch
from torchtext.datasets import AG_NEWS
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch import nn
import torch.optim as optim

# Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Charger le dataset AG News
train_iter, test_iter = AG_NEWS()

# Tokenisation et construction du vocabulaire
tokenizer = get_tokenizer('basic_english')
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

# Création du modèle LSTM
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, _ = self.lstm(embedded)
        final_out = lstm_out[:, -1, :]
        return self.fc(final_out)

# Paramètres du modèle
vocab_size = len(vocab)
embed_dim = 64
hidden_dim = 128
num_classes = 4  # Monde, Sports, Business, Science/Technologie

model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes).to(device)

# Fonction de perte et optimiseur
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Préparation des données pour l'entraînement
def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
         label_list.append(_label - 1)
         processed_text = torch.tensor(vocab(tokenizer(_text)), dtype=torch.int64)
         text_list.append(processed_text)
    return torch.tensor(label_list, dtype=torch.int64).to(device), nn.utils.rnn.pad_sequence(text_list, batch_first=True).to(device)

train_loader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)



def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, device):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        num_batches = 0
        for labels, text in train_loader:
            num_batches += 1
            labels, text = labels.to(device), text.to(device)
            optimizer.zero_grad()
            output = model(text)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        train_loss = total_loss / num_batches
        test_loss, test_accuracy = evaluate_model(model, test_loader, criterion, device)
        print(f'Epoch {epoch+1}, Train Loss: {train_loss}, Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct_preds = 0
    total_samples = 0
    with torch.no_grad():
        for labels, text in test_loader:
            labels, text = labels.to(device), text.to(device)
            output = model(text)
            total_loss += criterion(output, labels).item()
            predictions = output.argmax(1)
            correct_preds += (predictions == labels).sum().item()
            total_samples += labels.size(0)
    return total_loss / total_samples, correct_preds / total_samples

# Entraînement et évaluation du modèle
num_epochs = 5
train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, device)


Epoch 1, Train Loss: 0.5700466606092484, Test Loss: 0.04135168101148386, Test Accuracy: 0.8943421052631579
Epoch 2, Train Loss: 0.22892442001255695, Test Loss: 0.03543870934104473, Test Accuracy: 0.9105263157894737
Epoch 3, Train Loss: 0.1592835064956511, Test Loss: 0.035903294688530304, Test Accuracy: 0.9106578947368421
Epoch 4, Train Loss: 0.10874265675997885, Test Loss: 0.0384247496216803, Test Accuracy: 0.9126315789473685
Epoch 5, Train Loss: 0.07348871503821089, Test Loss: 0.042569012240595554, Test Accuracy: 0.915
