In [69]:
%%capture
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import requests
import tarfile
import os
from tqdm import tqdm

# L'architecture
Probablement la raison pourquoi vous regardez ce notebook, j'ai commenté ma démarche. Vous aller
retrouver la version anglaise des noms de plusieurs des concepts vues dans le blogue.  

In [2]:
class TransformerBlock(nn.Module):
    def __init__(self, 
                 embedding_size: int,
                 num_heads: int,
                 dropout: float,
                 forward_expansion: int=4
                 ):
        """Un bloc de transformation composé d'une couche de self-attention,
            de deux couches de normalisation, d'une couche de dropout et d'un réseau feedforward.

        Args:
            embedding_size (int): La taille des embeddings
            num_heads (int): Le nombre de tête de l'attention multi-tête
            dropout (float): Le % de dropout
            forward_expansion (int): La taille de la couche cachée du réseau feedforward par
                                    rapport à la taille des embeddings (4 dans le paper)
        """
        super(TransformerBlock, self).__init__()
        
        # La couche de self-attention
        self.multihead_attn = nn.MultiheadAttention(embed_dim=embedding_size, num_heads=num_heads)
        
        # On ajoute des couches de normalisation et de dropout (plus de performances)
        self.norm1 = nn.LayerNorm(embedding_size)
        self.norm2 = nn.LayerNorm(embedding_size)
        self.dropout = nn.Dropout(dropout)
        
        # Le réseau feedforward
        self.feed_forward = nn.Sequential(
            nn.Linear(embedding_size, forward_expansion * embedding_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embedding_size, embedding_size)
        )
    
    def forward(self, value, key, query, mask=None):
        # On passe les embeddings dans la couche de self-attention
        attn_output, attn_scores = self.multihead_attn(query, key, value)
        # Note: attn_scores permet de visualiser l'attention (si désiré)
        
        # On normalise et on dropout la sortie de la couche de self-attention avec la connexion résiduelle
        normalized_attn_out = self.dropout(self.norm1(attn_output + query)) 
        
        # On passe x dans le réseau feedforward
        forward_output = self.feed_forward(normalized_attn_out)
        
        # On normalise et on dropout la sortie du réseau feedforward avec la connexion résiduelle
        return self.dropout(self.norm2(forward_output + normalized_attn_out)), attn_scores

In [17]:
class TransformerClassifier(nn.Module):
    def __init__(self, 
                 vocab_size: int, 
                 seq_length: int,
                 embedding_size: int,
                 num_classes: int, 
                 num_layers: int, 
                 num_heads: int, 
                 dropout: float,
                 forward_expansion: int=4, 
                 ):
        """Un classifieur transformer.

        Args:
            vocab_size (int): La taille du vocabulaire, 
                            c'est-à-dire le nombre de tokens différents (sort du cadre de ce blogue)
            seq_length (int): La taille des séquences d'entrée
            embedding_size (int): La taille de nos embeddings
            num_classes (int): Le nombre de classes à prédire
            num_layers (int): Le nombre de blocs de transformation
            num_heads (int): Le nombre de têtes d'attention dans chaque bloc de transformation
            dropout (float): Le % de dropout à utiliser
            forward_expansion (int): La taille de la couche cachée du réseau feedforward par
                            rapport à la taille des embeddings (4 dans le paper)
        """
        super(TransformerClassifier, self).__init__()
        # Les embeddings des tokens
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        
        # Les embeddings de position
        self.position_embedding = nn.Embedding(seq_length, embedding_size)
        
        # Il y a num_layers blocs de transformation
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(embedding_size, num_heads, dropout, forward_expansion) for _ in range(num_layers)
        ])
        
        # Dropout pour éviter le sur-apprentissage (plus de performances)
        self.dropout = nn.Dropout(dropout)
        
        # On ajoute une couche linéaire pour la classification (sortir 2 classes)
        self.fc = nn.Linear(embedding_size, num_classes)
    
    def forward(self, x, mask=None):
        batch_size, seq_length = x.shape
        batch_positions = torch.arange(0, seq_length).expand(batch_size, seq_length).to(x.device)
        positioned_x = self.position_embedding(batch_positions) + self.embedding(x)
        out = self.dropout(positioned_x)
        
        for block in self.transformer_blocks:
            out = block(value=out, key=out, query=out, mask=mask)
        out = out.mean(dim=1)
        out = self.fc(out)
        return out

# Le jeux de données

https://paperswithcode.com/dataset/imdb-movie-reviews

In [4]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
r = requests.get(url)
with open("aclImdb_v1.tar.gz", "wb") as f:
    f.write(r.content)
    
with tarfile.open("aclImdb_v1.tar.gz", "r:gz") as tar:
    tar.extractall()

In [25]:
class ImdbDataset(torch.utils.data.Dataset):
    def __init__(self, phase, embedding_size):
        super().__init__()
        
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.phase = phase
        self.comments = []
        self.attention_masks = []
        self.labels = []
        for label in ["pos", "neg"]:
            folder_path = os.path.join("aclImdb", phase, label)
            for file_name in os.listdir(folder_path):
                with open(os.path.join(folder_path, file_name), "r") as f:
                    comment = f.read().lower()
                    tokenized_comment = self.tokenizer(comment, padding="max_length", truncation=True, max_length=embedding_size)
                    self.comments.append(tokenized_comment["input_ids"])
                    self.attention_masks.append(tokenized_comment["attention_mask"])
                    if label == "neg":
                        self.labels.append([0])
                    else:
                        self.labels.append([1])
                        
        assert len(self.comments) == len(self.labels) == len(self.attention_masks)
        
    def __len__(self):
        return len(self.comments)
    
    def __getitem__(self, index):
        return torch.LongTensor(self.comments[index]), torch.LongTensor(self.attention_masks[index]), torch.LongTensor(self.labels[index])

# Entraînement d'un model

Ces hyperparamètres sont arbitraires.

Sauf vocab_size, qui est le nombre de tokens dans le BertTokenizer de HuggingFace 

In [32]:
# Define hyperparameters
lr = 1e-3
batch_size = 500
num_epochs = 10
vocab_size = 30522
embedding_size = 200
num_classes = 2
num_layers = 2
heads = 8
forward_expansion = 4
dropout = 0.2

In [33]:
# Define data loaders
train_dataset = ImdbDataset("train", embedding_size)
test_dataset = ImdbDataset("test", embedding_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


print(f"{len(train_dataset)} donnees d'entrainement")
print(f"{len(test_dataset)} donnees de test")

data = next(iter(train_loader))
print(data[0].shape)
print(data[1].shape)
print(data[2].shape)

25000 donnees d'entrainement
25000 donnees de test
torch.Size([500, 200])
torch.Size([500, 200])
torch.Size([500, 1])


In [34]:
# Define the model and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerClassifier(vocab_size=vocab_size, 
                 seq_length=embedding_size,
                 embedding_size=embedding_size,
                 num_classes=num_classes, 
                 num_layers=num_layers, 
                 num_heads=heads, 
                 dropout=dropout,
                 forward_expansion=forward_expansion).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

output = model(data[0].to(device))
print(output)


tensor([[ 0.1777, -0.1561],
        [ 0.2121, -0.2174],
        [ 0.1872, -0.1525],
        [ 0.2167, -0.2095],
        [ 0.1825, -0.1686],
        [ 0.2439, -0.1700],
        [ 0.2166, -0.2331],
        [ 0.2802, -0.1709],
        [ 0.2227, -0.2128],
        [ 0.1927, -0.1581],
        [ 0.2472, -0.2224],
        [ 0.1911, -0.1556],
        [ 0.3271, -0.2087],
        [ 0.1938, -0.2079],
        [ 0.2350, -0.1938],
        [ 0.2579, -0.2449],
        [ 0.1463, -0.2378],
        [ 0.2679, -0.2249],
        [ 0.1996, -0.1564],
        [ 0.2239, -0.2625],
        [ 0.2328, -0.1913],
        [ 0.2347, -0.1004],
        [ 0.1794, -0.1617],
        [ 0.2817, -0.2058],
        [ 0.1961, -0.1419],
        [ 0.1938, -0.2175],
        [ 0.2726, -0.2031],
        [ 0.2253, -0.1753],
        [ 0.2078, -0.2520],
        [ 0.2527, -0.1760],
        [ 0.2486, -0.1848],
        [ 0.2186, -0.0845],
        [ 0.2229, -0.1824],
        [ 0.2435, -0.1611],
        [ 0.2106, -0.2147],
        [ 0.2114, -0

In [35]:
# Define the training loop
for _ in range(num_epochs):
    model.train()
    train_loss, train_acc = 0, 0
    for x, _, y in tqdm(train_loader):
        x, y = x.to(device), y.squeeze(1).to(device)
        optimizer.zero_grad()
        y_pred, _ = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * x.size(0)
        train_acc += (y_pred.argmax(dim=1) == y).sum().item()
    train_loss /= len(train_dataset)
    train_acc /= len(train_dataset)
    print(f"Train loss: {train_loss:.4f}, Train acc: {train_acc:.4f}")

    model.eval()
    test_loss, test_acc = 0, 0
    with torch.no_grad():
        for x, _, y in tqdm(test_loader):
            x, y = x.to(device), y.squeeze(1).to(device)
            y_pred, _ = model(x)
            loss = loss_fn(y_pred, y)
            test_loss += loss.item() * x.size(0)
            test_acc += (y_pred.argmax(dim=1) == y).sum().item()
    test_loss /= len(test_dataset)
    test_acc /= len(test_dataset)
    print(f"Test loss: {test_loss:.4f}, Test acc: {test_acc:.4f}")

100%|██████████| 50/50 [06:43<00:00,  8.08s/it]


Train loss: 0.7371, Train acc: 0.5056


100%|██████████| 50/50 [02:54<00:00,  3.49s/it]


Test loss: 0.6930, Test acc: 0.5206


100%|██████████| 50/50 [06:46<00:00,  8.13s/it]


Train loss: 0.6774, Train acc: 0.5741


100%|██████████| 50/50 [02:47<00:00,  3.35s/it]


Test loss: 0.6473, Test acc: 0.6212


100%|██████████| 50/50 [06:36<00:00,  7.92s/it]


Train loss: 0.5920, Train acc: 0.6890


100%|██████████| 50/50 [02:53<00:00,  3.47s/it]


Test loss: 0.6156, Test acc: 0.6675


100%|██████████| 50/50 [06:45<00:00,  8.11s/it]


Train loss: 0.5421, Train acc: 0.7309


100%|██████████| 50/50 [02:56<00:00,  3.53s/it]


Test loss: 0.5768, Test acc: 0.6987


100%|██████████| 50/50 [06:40<00:00,  8.02s/it]


Train loss: 0.5105, Train acc: 0.7524


100%|██████████| 50/50 [02:54<00:00,  3.50s/it]


Test loss: 0.5684, Test acc: 0.7098


100%|██████████| 50/50 [06:42<00:00,  8.05s/it]


Train loss: 0.4868, Train acc: 0.7682


100%|██████████| 50/50 [02:55<00:00,  3.51s/it]


Test loss: 0.5586, Test acc: 0.7237


100%|██████████| 50/50 [06:43<00:00,  8.07s/it]


Train loss: 0.4650, Train acc: 0.7820


100%|██████████| 50/50 [02:55<00:00,  3.51s/it]


Test loss: 0.5487, Test acc: 0.7306


100%|██████████| 50/50 [06:42<00:00,  8.06s/it]


Train loss: 0.4411, Train acc: 0.7958


100%|██████████| 50/50 [02:56<00:00,  3.53s/it]


Test loss: 0.5542, Test acc: 0.7328


100%|██████████| 50/50 [06:43<00:00,  8.06s/it]


Train loss: 0.4327, Train acc: 0.8020


100%|██████████| 50/50 [02:56<00:00,  3.53s/it]


Test loss: 0.5627, Test acc: 0.7309


100%|██████████| 50/50 [06:55<00:00,  8.32s/it]


Train loss: 0.4075, Train acc: 0.8182


100%|██████████| 50/50 [03:05<00:00,  3.70s/it]

Test loss: 0.5463, Test acc: 0.7418





On atteint environ 75% d'accuracy en test. On est loin de l'état de l'art (96%):
https://paperswithcode.com/sota/sentiment-analysis-on-imdb

# Inférence

In [66]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model.eval()

with torch.no_grad():
    easy = ("This film is terrible. You don't really need to read this review further."
        "If you are planning on watching it, suffice to say - don't (unless you are studying how not to make a good movie)."
        "The acting is horrendous... serious amateur hour. Even by the standard of Hollywood action flicks, this is a terrible movie."
        "Don't watch it!!! Go for a jog instead - at least you won't feel like killing yourself.")
    easy_tokens = torch.LongTensor(tokenizer(easy, padding="max_length", truncation=True, max_length=embedding_size)["input_ids"]).to(device)
    easy_output = torch.softmax(model(easy_tokens.unsqueeze(0)).squeeze(0), dim=0)
    easy_decision = "positif" if easy_output.argmax().item() == 1 else "négatif"
    conf_score = easy_output[easy_output.argmax().item()].item() * 100
    print(f"Le modèle pense avec {conf_score:.2f}% de certitude que commentaire:\n\n\t {easy}\n\n est: {easy_decision}.", "-"*50, sep="\n")

    

    medium = "This movie was not bad."
    medium_tokens = torch.LongTensor(tokenizer(medium, padding="max_length", truncation=True, max_length=embedding_size)["input_ids"]).to(device)
    medium_output = torch.softmax(model(medium_tokens.unsqueeze(0)).squeeze(0), dim=0)
    medium_decision = "positif" if medium_output.argmax().item() == 1 else "négatif"
    conf_score = medium_output[medium_output.argmax().item()].item() * 100
    print(f"Le modèle pense avec {conf_score:.2f}% de certitude que commentaire:\n\n\t {medium}\n\n est: {medium_decision}.", "-"*50, sep="\n")

    hard = "This movie was okay. I didn't absolutely love it, but I didn't hate it either."
    hard_tokens = torch.LongTensor(tokenizer(hard, padding="max_length", truncation=True, max_length=embedding_size)["input_ids"]).to(device)
    hard_output = torch.softmax(model(hard_tokens.unsqueeze(0)).squeeze(0), dim=0)
    hard_decision = "positif" if hard_output.argmax().item() == 1 else "négatif"
    conf_score = hard_output[hard_output.argmax().item()].item() * 100
    print(f"Le modèle pense avec {conf_score:.2f}% de certitude que commentaire:\n\n\t {hard}\n\n est: {hard_decision}.", "-"*50, sep="\n")


Le modèle pense avec 60.45% de certitude que commentaire:

	 This film is terrible. You don't really need to read this review further.If you are planning on watching it, suffice to say - don't (unless you are studying how not to make a good movie).The acting is horrendous... serious amateur hour. Even by the standard of Hollywood action flicks, this is a terrible movie.Don't watch it!!! Go for a jog instead - at least you won't feel like killing yourself.

 est: négatif.
--------------------------------------------------
Le modèle pense avec 80.03% de certitude que commentaire:

	 This movie was not bad.

 est: positif.
--------------------------------------------------
Le modèle pense avec 75.65% de certitude que commentaire:

	 This movie was okay. I didn't absolutely love it, but I didn't hate it either.

 est: positif.
--------------------------------------------------
