## DOCHERTY Ronan et HOUNGUEVOU Thomas
## Rapport réseaux de neurones
### Sujet: Génération de synopsis d'anime
Réseau utilisé: **Transformer**  
[Dataset](https://www.kaggle.com/datasets/marlesson/myanimelist-dataset-animes-profiles-reviews/data)

In [1]:
import torch
import torch.nn as nn


class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected with learned linear layers, split along the
    embedding dimension into ``heads`` chunks of ``head_dim`` features,
    attended independently per head, concatenated again and passed through a
    final linear layer.

    Args:
        embed_size: Size of the embedding dimension of every input.
        heads: Number of attention heads; must divide ``embed_size``.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask):
        """Attend from ``query`` positions to ``keys``/``values`` positions.

        Args:
            values: Tensor of shape (N, value_len, embed_size).
            keys: Tensor of shape (N, key_len, embed_size); ``key_len`` must
                equal ``value_len`` for the final contraction to be valid.
            query: Tensor of shape (N, query_len, embed_size).
            mask: ``None``, or a tensor broadcastable to
                (N, heads, query_len, key_len). Positions where
                ``mask == 0`` are excluded from the attention.

        Returns:
            Tensor of shape (N, query_len, embed_size).
        """
        # Batch size
        N = query.shape[0]

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = self.values(values)  # (N, value_len, embed_size)
        keys = self.keys(keys)  # (N, key_len, embed_size)
        queries = self.queries(query)  # (N, query_len, embed_size)

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        # For every example n and head h, dot each query position q with each
        # key position k over the head features d. Nothing is mixed across
        # examples or across heads.
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # energy: (N, heads, query_len, key_len)

        # Scale by sqrt(d_k). Each dot product above sums over head_dim
        # features, so head_dim (not embed_size) is the relevant size.
        energy = energy / (self.head_dim ** 0.5)

        # Hide masked positions with the most negative finite value of the
        # score dtype; a fixed -1e20 is not representable in half precision.
        if mask is not None:
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        # Normalise over the key axis so the weights of each query sum to 1
        attention = torch.softmax(energy, dim=3)
        # attention shape: (N, heads, query_len, key_len)

        # Weighted sum of the values per head, then concatenate the heads
        # back into a single embedding dimension.
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )

        # The output projection keeps the shape (N, query_len, embed_size)
        out = self.fc_out(out)

        return out


class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a position-wise feed-forward sub-layer.

    Each sub-layer adds a residual connection, applies layer normalisation
    and then dropout.

    Args:
        embed_size: Embedding dimension of the inputs and outputs.
        heads: Number of attention heads passed to ``SelfAttention``.
        dropout: Dropout probability used after both normalisations.
        forward_expansion: Multiplier giving the hidden width of the
            feed-forward network (``forward_expansion * embed_size``).
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        hidden_size = forward_expansion * embed_size

        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Run attention and the feed-forward network on ``query``.

        ``value``, ``key``, ``query`` and ``mask`` are handed to the attention
        module unchanged; the result has the same shape as ``query``.
        """
        attended = self.attention(value, key, query, mask)

        # First sub-layer: residual on the query, normalise, then dropout
        x = self.norm1(attended + query)
        x = self.dropout(x)

        # Second sub-layer: feed-forward with its own residual connection
        expanded = self.feed_forward(x)
        return self.dropout(self.norm2(expanded + x))


class Encoder(nn.Module):
    """Stack of ``TransformerBlock`` layers over token + position embeddings.

    Args:
        src_vocab_size: Number of rows in the token embedding table.
        embed_size: Embedding dimension.
        num_layers: Number of ``TransformerBlock`` layers.
        heads: Number of attention heads per layer.
        device: Kept as ``self.device`` for backward compatibility; the
            forward pass follows the device of its input instead.
        forward_expansion: Feed-forward width multiplier of each layer.
        dropout: Dropout probability.
        max_length: Number of rows in the position embedding table, i.e. the
            longest sequence the encoder can embed.
    """

    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):

        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode a batch of token ids.

        Args:
            x: Integer tensor of shape (N, seq_length); ``seq_length`` must
                not exceed ``max_length``.
            mask: Attention mask forwarded to every layer (may be ``None``).

        Returns:
            Tensor of shape (N, seq_length, embed_size).
        """
        N, seq_length = x.shape

        # Build the position ids directly on the input's device. Relying on
        # the device stored at construction time fails as soon as the module
        # is moved with .to(...) or built with the default "cpu" value.
        positions = torch.arange(seq_length, device=x.device).expand(N, seq_length)
        out = self.dropout(
            self.word_embedding(x) + self.position_embedding(positions)
        )

        # Self-attention: query, key and value are all the running output
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out


class DecoderBlock(nn.Module):
    """Self-attention over the target followed by a ``TransformerBlock``.

    Args:
        embed_size: Embedding dimension.
        heads: Number of attention heads.
        forward_expansion: Feed-forward width multiplier of the inner block.
        dropout: Dropout probability.
        device: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super().__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = SelfAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size=embed_size,
            heads=heads,
            dropout=dropout,
            forward_expansion=forward_expansion,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        """Apply self-attention on ``x``, then attend to ``value``/``key``.

        ``trg_mask`` is used for the self-attention over ``x``; ``src_mask``
        is passed to the inner block together with ``value`` and ``key``.
        """
        # Self-attention over the target with a residual connection
        self_attended = self.attention(x, x, x, trg_mask)
        query = self.norm(self_attended + x)
        query = self.dropout(query)

        # The normalised result is the query of the inner block
        return self.transformer_block(value, key, query, src_mask)


class Decoder(nn.Module):
    """Stack of ``DecoderBlock`` layers producing vocabulary logits.

    Args:
        trg_vocab_size: Size of the target vocabulary (embedding rows and
            output logits).
        embed_size: Embedding dimension.
        num_layers: Number of ``DecoderBlock`` layers.
        heads: Number of attention heads per layer.
        forward_expansion: Feed-forward width multiplier of each layer.
        dropout: Dropout probability.
        device: Kept as ``self.device`` and passed to each ``DecoderBlock``
            for backward compatibility; the forward pass follows the device
            of its input instead.
        max_length: Number of rows in the position embedding table, i.e. the
            longest sequence the decoder can embed.
    """

    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """Decode a batch of target token ids against the encoder output.

        Args:
            x: Integer tensor of shape (N, seq_length); ``seq_length`` must
                not exceed ``max_length``.
            enc_out: Encoder output, used as both value and key by each layer.
            src_mask: Mask forwarded to each layer for attention over
                ``enc_out``.
            trg_mask: Mask forwarded to each layer for self-attention on the
                target.

        Returns:
            Tensor of shape (N, seq_length, trg_vocab_size) with unnormalised
            scores.
        """
        N, seq_length = x.shape

        # Build the position ids directly on the input's device. Relying on
        # the device stored at construction time fails as soon as the module
        # is moved with .to(...) or built with the default "cpu" value.
        positions = torch.arange(seq_length, device=x.device).expand(N, seq_length)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)

        return out


class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source ids to target logits.

    Args:
        src_vocab_size: Size of the source vocabulary.
        trg_vocab_size: Size of the target vocabulary.
        src_pad_idx: Id of the padding token in the source; padded source
            positions are masked out of attention.
        trg_pad_idx: Id of the padding token in the target. Stored, but not
            used when building the target mask.
        embed_size: Embedding dimension.
        num_layers: Number of layers in both the encoder and the decoder.
        forward_expansion: Feed-forward width multiplier.
        heads: Number of attention heads.
        dropout: Dropout probability.
        device: Kept as ``self.device`` and forwarded to the encoder and
            decoder for backward compatibility; masks are created on the
            device of the tensors they are derived from.
        max_length: Longest sequence supported by the position embeddings.
    """

    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=256,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device="cpu",
        max_length=100,
    ):

        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Return a boolean mask that is False at padded source positions.

        Args:
            src: Integer tensor of shape (N, src_len).

        Returns:
            Boolean tensor of shape (N, 1, 1, src_len) on ``src``'s device.
        """
        # The comparison already lives on src's device, so no transfer is
        # needed and a stale self.device cannot cause a device mismatch.
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        """Return a lower-triangular (causal) mask for the target.

        Args:
            trg: Tensor of shape (N, trg_len).

        Returns:
            Float tensor of shape (N, 1, trg_len, trg_len) on ``trg``'s
            device, with ones on and below the diagonal and zeros above.
        """
        N, trg_len = trg.shape
        # Allocate directly on the target's device instead of building the
        # mask on CPU and copying it over on every forward pass.
        trg_mask = torch.tril(
            torch.ones((trg_len, trg_len), device=trg.device)
        ).expand(N, 1, trg_len, trg_len)

        return trg_mask

    def forward(self, src, trg):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: Integer tensor of source token ids, shape (N, src_len).
            trg: Integer tensor of target token ids, shape (N, trg_len).

        Returns:
            The decoder output for ``trg`` given the encoded ``src``.
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out



In [2]:
import pandas as pd

# Load the anime table from the CSV file and keep only the columns used in
# the rest of the notebook.
data = pd.read_csv('animes.csv').loc[:, ['title', 'synopsis', 'genre', 'score']]

# Show the first rows to inspect the structure of the data
print(data.head(n=5))

                              title  \
0           Haikyuu!! Second Season   
1           Shigatsu wa Kimi no Uso   
2                     Made in Abyss   
3  Fullmetal Alchemist: Brotherhood   
4  Kizumonogatari III: Reiketsu-hen   

                                            synopsis  \
0  Following their participation at the Inter-Hig...   
1  Music accompanies the path of the human metron...   
2  The Abyss—a gaping chasm stretching down into ...   
3  "In order for something to be obtained, someth...   
4  After helping revive the legendary vampire Kis...   

                                               genre  score  
0  ['Comedy', 'Sports', 'Drama', 'School', 'Shoun...   8.82  
1  ['Drama', 'Music', 'Romance', 'School', 'Shoun...   8.83  
2  ['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...   8.83  
3  ['Action', 'Military', 'Adventure', 'Comedy', ...   9.23  
4   ['Action', 'Mystery', 'Supernatural', 'Vampire']   8.83  


In [3]:
from sklearn.model_selection import train_test_split

# Split the rows into a training set (80%) and a test set (20%); the fixed
# random_state makes the split reproducible.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Report the number of rows in each subset
print("Taille de l'ensemble d'entraînement :", train_data.shape[0])
print("Taille de l'ensemble de test :", test_data.shape[0])

Taille de l'ensemble d'entraînement : 15448
Taille de l'ensemble de test : 3863


In [4]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

# Synopsis texts for each split. Missing synopses are dropped first:
# astype(str) alone would turn NaN into the literal text "nan", which would
# then be tokenised and trained on as if it were a real synopsis.
synopsis_list = train_data['synopsis'].dropna().astype(str).tolist()
test_synopsis_list = test_data['synopsis'].dropna().astype(str).tolist()

# BPE tokenizer. unk_token is set so that symbols unknown to the trained
# vocabulary are mapped to the reserved '[UNK]' id.
# NOTE(review): no pre-tokenizer is configured, so merges may span several
# words — confirm whether a whitespace pre-tokenizer was intended.
tokenizer = Tokenizer(models.BPE(unk_token='[UNK]'))

# Trainer configuration: vocabulary size and reserved special tokens
trainer = trainers.BpeTrainer(vocab_size=10000, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

# Fit the tokenizer on the training synopses only
tokenizer.train_from_iterator(synopsis_list, trainer=trainer)

# Token ids for every synopsis of each split
encoded_train_synopsis = [tokenizer.encode(synopsis).ids for synopsis in synopsis_list]
encoded_test_synopsis = [tokenizer.encode(synopsis).ids for synopsis in test_synopsis_list]

# Every sequence is cut or padded to this fixed length
max_length = 100
pad_id = tokenizer.token_to_id('[PAD]')


def pad_or_truncate(token_ids, length, pad_id):
    """Return ``token_ids`` cut to ``length`` and right-padded with ``pad_id``."""
    clipped = token_ids[:length]
    return clipped + [pad_id] * (length - len(clipped))


padded_train_synopsis = [pad_or_truncate(ids, max_length, pad_id) for ids in encoded_train_synopsis]
padded_test_synopsis = [pad_or_truncate(ids, max_length, pad_id) for ids in encoded_test_synopsis]

# Convert to PyTorch tensors, one row of max_length ids per synopsis
padded_train_tensors = torch.tensor(padded_train_synopsis)
padded_test_tensors = torch.tensor(padded_test_synopsis)







In [5]:
# Rebuild the tensors from the padded token-id lists
padded_train_tensors = torch.tensor(padded_train_synopsis)
padded_test_tensors = torch.tensor(padded_test_synopsis)

# Source and target share the same tokenizer, hence the same padding id and
# the same vocabulary size.
src_pad_idx = trg_pad_idx = tokenizer.token_to_id('[PAD]')
src_vocab_size = trg_vocab_size = len(tokenizer.get_vocab())

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the Transformer and move its parameters to the chosen device
model = Transformer(
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    trg_pad_idx,
    device=device,
)
model = model.to(device)

# Optimiser and loss; padded target positions do not contribute to the loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

# Model training
def train_model(model, optimizer, criterion, train_data, device, epochs=10, batch_size=64):
    """Train ``model`` to predict each next token of the given sequences.

    Every batch is used both as source and as target: the model output at
    position ``t`` is scored against the token at position ``t + 1``.

    Args:
        model: Callable as ``model(src, trg)`` and returning a tensor whose
            last dimension holds the class scores.
        optimizer: Optimiser stepping the model parameters.
        criterion: Loss called as ``criterion(scores, targets)`` on flattened
            scores and target ids.
        train_data: Tensor of token ids, one row per sequence.
        device: Device every batch is moved to.
        epochs: Number of passes over ``train_data``.
        batch_size: Number of rows per batch.

    Raises:
        ValueError: If ``train_data`` is empty.
    """
    num_samples = len(train_data)
    if num_samples == 0:
        raise ValueError("train_data must contain at least one sequence")
    # Number of batches per epoch (ceiling division)
    num_batches = (num_samples + batch_size - 1) // batch_size

    model.train()
    for epoch in range(epochs):
        epoch_loss = 0.0
        for i in range(0, num_samples, batch_size):
            # One transfer per batch; source and target are the same rows.
            # NOTE(review): the model receives the unshifted target as its
            # source — confirm this is intended for the model being trained.
            batch = train_data[i:i+batch_size].to(device)
            src = batch
            trg = batch

            optimizer.zero_grad()
            output = model(src, trg)
            output_dim = output.shape[-1]
            # Drop the last prediction and the first target so that position
            # t is compared with token t + 1.
            output = output[:, :-1, :].reshape(-1, output_dim)
            target = trg[:, 1:].reshape(-1)

            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            print("num" + str(i))

        # epoch_loss is a sum of per-batch losses, so it is averaged over the
        # number of batches, not the number of samples.
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss / num_batches}')

# Train the model on the padded training sequences with the default number
# of epochs and batch size.
train_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_data=padded_train_tensors,
    device=device,
)


num0
num64
num128
num192
num256
num320
num384
num448
num512
num576
num640
num704
num768
num832
num896
num960
num1024
num1088
num1152
num1216
num1280
num1344
num1408
num1472
num1536
num1600
num1664
num1728
num1792
num1856
num1920
num1984
num2048
num2112
num2176
num2240
num2304
num2368
num2432
num2496
num2560
num2624
num2688
num2752
num2816
num2880
num2944
num3008
num3072
num3136
num3200
num3264
num3328
num3392
num3456
num3520
num3584
num3648
num3712
num3776
num3840
num3904
num3968
num4032
num4096
num4160
num4224
num4288
num4352
num4416
num4480
num4544
num4608
num4672
num4736
num4800
num4864
num4928
num4992
num5056
num5120
num5184
num5248
num5312
num5376
num5440
num5504
num5568
num5632
num5696
num5760
num5824
num5888
num5952
num6016
num6080
num6144
num6208
num6272
num6336
num6400
num6464
num6528
num6592
num6656
num6720
num6784
num6848
num6912
num6976
num7040
num7104
num7168
num7232
num7296
num7360
num7424
num7488
num7552
num7616
num7680
num7744
num7808
num7872
num7936
num8000
num8064
num

In [6]:
class TransformerModel(nn.Module):
    """Token embedding, ``nn.Transformer`` and a projection to the vocabulary.

    Args:
        vocab_size: Number of token ids; also the size of the output scores.
        embedding_size: Embedding dimension, used as the Transformer d_model.
        num_layers: Number of encoder layers and of decoder layers.
        num_heads: Number of attention heads.
        hidden_size: Width of the Transformer feed-forward layers.
    """

    def __init__(self, vocab_size, embedding_size, num_layers, num_heads, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        # NOTE(review): batch_first is not set, so nn.Transformer uses its
        # default input layout — confirm callers pass tensors in that layout.
        transformer_config = dict(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_size,
        )
        self.transformer = nn.Transformer(**transformer_config)
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, src):
        """Return vocabulary scores for every position of ``src``.

        The embedded input is given to the Transformer as both its source and
        its target; no attention mask is passed.
        """
        token_vectors = self.embedding(src)
        decoded = self.transformer(token_vectors, token_vectors)
        return self.linear(decoded)

In [9]:
# Persist the padded training and test tensors to disk
torch.save(obj=padded_train_tensors, f='padded_train_tensors.pt')
torch.save(obj=padded_test_tensors, f='padded_test_tensors.pt')