In [1]:
import copy
import numpy as np
import math
import torch

######################################################################################################## Independent deep copies of a module (each clone has its own parameters)
def clones(module, n):
    """Return a ModuleList holding `n` independent deep copies of `module`.

    Each copy is produced with `copy.deepcopy`, so the clones start with the
    same values as `module` but do not share parameter objects with it or
    with each other.
    """
    copies = []
    for _ in range(n):
        copies.append(copy.deepcopy(module))
    return torch.nn.ModuleList(copies)

######################################################################################################## Scaled Dot Product Attention
def ScaledDotProductAttention(Q, K, V, mask = None):
  """
  Scaled dot-product attention: softmax(Q K^T / sqrt(d_head)) V.

  Q : Query head ; shape : batch x n_head x n_token x d_head
  K : Key head ; shape : batch x n_head x n_token x d_head
  V : Value head ; shape : batch x n_head x n_token x d_head
  mask : optional tensor broadcastable to the score matrix; positions where
         mask == 0 are excluded from the softmax.

  Returns (attended values, attention weights).
  """
  d_head = Q.size(-1)
  scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_head) # shape : batch * n_head * n_token * n_token
  if mask is not None:
    # Fill with the smallest finite value of the score dtype instead of a fixed
    # -1e4: a fixed constant stops masking as soon as a legitimate logit drops
    # below it, while the dtype minimum stays representable (also in float16).
    # The fill is out-of-place so the freshly computed scores are not mutated.
    scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

  weights = scores.softmax(-1)
  return torch.matmul(weights, V), weights

######################################################################################################## Multi-Head Attention
class Multi_Head_Attention(torch.nn.Module):
  """Multi-head attention built from four d_model x d_model linear layers.

  The first three layers project the query, key and value inputs; the last
  one projects the concatenated heads back to d_model. The attention weights
  of the most recent forward pass are kept in `self.weights`.
  """

  def __init__(self, n_head, d_model, dropout=0.1):
    super().__init__()
    # Every head receives an equal slice of the model dimension.
    assert d_model % n_head == 0
    self.n_head = n_head
    self.d_head = d_model // n_head
    self.layers = clones(torch.nn.Linear(d_model, d_model), 4)
    self.weights = None
    self.dropout = torch.nn.Dropout(dropout)

  def _split_heads(self, projected, batch_size):
    """Reshape batch x n_token x d_model into batch x n_head x n_token x d_head."""
    return projected.view(batch_size, -1, self.n_head, self.d_head).transpose(1, 2)

  def forward(self, Q, K, V, mask=None):
    """
    Q : Query embedding ; shape : batch x n_token x d_model
    K : Key embedding ; shape : batch x n_token x d_model
    V : Value embedding ; shape : batch x n_token x d_model
    mask : optional mask; a head axis is inserted at dim 1 so it broadcasts over heads
    """
    head_mask = None if mask is None else mask.unsqueeze(1)

    batch_size = Q.size(0)
    query_proj, key_proj, value_proj, output_proj = self.layers
    heads_q = self._split_heads(query_proj(Q), batch_size)
    heads_k = self._split_heads(key_proj(K), batch_size)
    heads_v = self._split_heads(value_proj(V), batch_size)

    attended, self.weights = ScaledDotProductAttention(heads_q, heads_k, heads_v, mask=head_mask)

    # Merge the heads back: batch x n_token x d_model
    merged = attended.transpose(1, 2).contiguous().view(batch_size, -1, self.n_head * self.d_head)
    return output_proj(self.dropout(merged))

######################################################################################################## Double FFN
class FFN(torch.nn.Module):
  """Position-wise feed-forward network: Linear -> SiLU -> Dropout -> Linear."""

  def __init__(self, d_model, d_ffn, dropout=0.1):
    super().__init__()
    self.layer1 = torch.nn.Linear(d_model, d_ffn)
    self.layer2 = torch.nn.Linear(d_ffn, d_model)
    self.dropout = torch.nn.Dropout(dropout)

  def forward(self, x):
    """Expand x to d_ffn, apply SiLU and dropout, then project back to d_model."""
    hidden = torch.nn.functional.silu(self.layer1(x))
    hidden = self.dropout(hidden)
    return self.layer2(hidden)

######################################################################################################## Residual connection with pre-layer normalisation (pre-norm)
class PreNormResidual(torch.nn.Module):
  """Residual wrapper that normalises the input before calling the sublayer."""

  def __init__(self, d_model, dropout=0.1):
    super().__init__()
    self.norm = torch.nn.LayerNorm(d_model)
    self.dropout = torch.nn.Dropout(dropout)

  def forward(self, x, sublayer):
    """Return x + dropout(sublayer(LayerNorm(x))); `sublayer` is any callable on tensors."""
    normed = self.norm(x)
    branch = self.dropout(sublayer(normed))
    return x + branch

######################################################################################################## Encoder block
class EncoderBlock(torch.nn.Module):
  """One encoder layer: self-attention then feed-forward, each in a pre-norm residual."""

  def __init__(self, n_head, d_model, d_ffn):
    super().__init__()
    self.MHA = Multi_Head_Attention(n_head, d_model)
    self.Norm_And_Add = clones(PreNormResidual(d_model), 2)
    self.FFN = FFN(d_model, d_ffn)

  def forward(self, x, source_mask):
    """
    x : Input ; shape : batch x n_token x d_model
    source_mask : Mask for source sentence (padding tokens are masked) ; shape : batch x n_token x n_token
    """
    attention_residual, ffn_residual = self.Norm_And_Add

    def self_attention(normed):
      # Query, key and value all come from the same normalised input.
      return self.MHA(normed, normed, normed, mask=source_mask)

    attended = attention_residual(x, self_attention)
    return ffn_residual(attended, self.FFN)

######################################################################################################## Encoder
class Encoder(torch.nn.Module):
  """Stack of N encoder blocks followed by a final LayerNorm."""

  def __init__(self, n_head, d_model, d_ffn, N):
    super().__init__()
    self.layers = clones(EncoderBlock(n_head, d_model, d_ffn), N)
    self.norm = torch.nn.LayerNorm(d_model)

  def forward(self, x, source_mask):
    """Pass x through every block in order (same mask for each), then normalise."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden, source_mask)
    return self.norm(hidden)

######################################################################################################## Decoder block
class DecoderBlock(torch.nn.Module):
  """One decoder layer: masked self-attention, cross-attention, feed-forward (all pre-norm residual)."""

  def __init__(self, n_head, d_model, d_ffn):
    super().__init__()
    self.MHA_Masked = Multi_Head_Attention(n_head, d_model)
    self.MHA_Encoder = Multi_Head_Attention(n_head, d_model)
    self.Norm_And_Add = clones(PreNormResidual(d_model), 3)
    self.FFN = FFN(d_model, d_ffn)

  def forward(self, x, encoder_output, cross_mask, target_mask):
    """
    x : Decoder input ; shape : batch x n_target_token x d_model
    encoder_output : Encoder final states ; shape : batch x n_source_token x d_model
    cross_mask : Mask for cross attention (source padding tokens are masked) ; shape : batch x n_target_token x n_source_token
    target_mask : Mask for target sentence (future and padding tokens are masked) ; shape : batch x n_target_token x n_target_token
    """
    self_residual, cross_residual, ffn_residual = self.Norm_And_Add

    def masked_self_attention(normed):
      return self.MHA_Masked(normed, normed, normed, mask=target_mask)

    def cross_attention(normed):
      # Queries come from the decoder, keys and values from the encoder output.
      return self.MHA_Encoder(normed, encoder_output, encoder_output, mask=cross_mask)

    hidden = self_residual(x, masked_self_attention)
    hidden = cross_residual(hidden, cross_attention)
    return ffn_residual(hidden, self.FFN)

######################################################################################################## Decoder
class Decoder(torch.nn.Module):
  """Stack of N decoder blocks, a final LayerNorm and a projection to target-vocabulary log-probabilities."""

  def __init__(self, target_vocab_size, n_head, d_model, d_ffn, N):
    super().__init__()
    self.layers = clones(DecoderBlock(n_head, d_model, d_ffn), N)
    self.norm = torch.nn.LayerNorm(d_model)
    self.final_layer = torch.nn.Linear(d_model, target_vocab_size)

  def forward(self, x, encoder_output, cross_mask, target_mask):
    """Run every block in order and return log-softmax scores over the target vocabulary."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden, encoder_output, cross_mask, target_mask)
    logits = self.final_layer(self.norm(hidden))
    return torch.nn.functional.log_softmax(logits, dim=-1)

######################################################################################################## Encoder + Decoder
class EncoderDecoder(torch.nn.Module):
  """Couples an Encoder and a Decoder built with the same hyper-parameters."""

  def __init__(self, target_vocab_size, n_head, d_model, d_ffn, N):
    super().__init__()
    self.encoder = Encoder(n_head, d_model, d_ffn, N)
    self.decoder = Decoder(target_vocab_size, n_head, d_model, d_ffn, N)

  def forward(self, source, target, source_mask, target_mask, cross_mask):
    """Encode `source`, then decode `target` against the encoder states."""
    memory = self.encoder(source, source_mask)
    # The decoder takes cross_mask before target_mask, unlike this method's signature.
    decoded = self.decoder(target, memory, cross_mask, target_mask)
    return decoded

######################################################################################################## Embedding
class Embeddings(torch.nn.Module):
  """Token embedding lookup whose output is multiplied by sqrt(d_model)."""

  def __init__(self, voc_size, d_model):
    super().__init__()
    self.embedding = torch.nn.Embedding(voc_size, d_model)
    self.d_model = d_model

  def forward(self, x):
    """
    x : Input (int) ; shape : batch * n_token
    Returns the scaled embeddings ; shape : batch * n_token * d_model
    """
    looked_up = self.embedding(x)
    return looked_up * math.sqrt(self.d_model)

######################################################################################################## Positional encoding
class PositionalEncoding(torch.nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings, then applies dropout.

    The table is computed once for `max_len` positions and stored as the
    non-trainable buffer `pe` with shape 1 x max_len x d_model.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one cosine column fewer than sine columns,
        # so the angle table is trimmed to fit; for an even d_model this is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        x : Embeddings ; shape : batch x n_token x d_model

        Raises RuntimeError when n_token exceeds the precomputed table length.
        """
        n_token = x.size(1)
        if n_token > self.pe.size(1):
            # Fail with an explicit message rather than an opaque broadcasting error.
            raise RuntimeError(
                f"Sequence length {n_token} exceeds the positional encoding table "
                f"(max_len={self.pe.size(1)})"
            )
        x = x + self.pe[:, :n_token]
        return self.dropout(x)

######################################################################################################## Build Transformer
class Transformer(torch.nn.Module):
  """Full translation model: source/target embeddings, shared positional encoding, encoder-decoder."""

  def __init__(self, source_vocab_size, target_vocab_size, n_head, d_model, d_ffn, N):
    super().__init__()
    self.source_embedding = Embeddings(source_vocab_size, d_model)
    self.target_embedding = Embeddings(target_vocab_size, d_model)
    self.PE = PositionalEncoding(d_model)
    self.encoder_decoder = EncoderDecoder(target_vocab_size, n_head, d_model, d_ffn, N)
    # Xavier-initialise every parameter with more than one dimension
    # (weight matrices, embedding tables); 1-D parameters keep their defaults.
    for weight in (p for p in self.parameters() if p.dim() > 1):
      torch.nn.init.xavier_uniform_(weight)

  def forward(self, source, target, source_mask, target_mask, cross_mask):
    """
    source : Input (int) ; shape : batch * n_token
    target : Output (int) ; shape : batch * n_token
    Returns what the encoder-decoder produces for the embedded sequences.
    """
    embedded_source = self.PE(self.source_embedding(source))
    embedded_target = self.PE(self.target_embedding(target))
    return self.encoder_decoder(embedded_source, embedded_target, source_mask, target_mask, cross_mask)

In [2]:
def get_key_from_id(voc, id):
  """Return the first key of `voc` whose value equals `id`, or None when no value matches."""
  for token, index in voc.items():
    if index == id:
      return token
  return None

In [3]:
import nltk
# Fetch the NLTK resources through its downloader before tokenising.
# NOTE(review): only nltk.word_tokenize is called in the visible cells; 'wordnet' and
# 'omw-1.4' do not appear to be used here — confirm before removing.
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pandas as pd
# Load the training split from the hf:// dataset path; later cells read its
# 'french' and 'english' columns.
df = pd.read_csv("hf://datasets/FrancophonIA/english_french/train.csv")


[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [4]:
# Both vocabularies start with the same four special tokens and the same ids;
# regular tokens are appended later with ids starting at 4.
source_voc = {"[PAD]": 0, "[UNK]": 1, "[START]": 2, "[END]": 3}
target_voc = {"[PAD]": 0, "[UNK]": 1, "[START]": 2, "[END]": 3}

def source_tokenize_get_voc(row):
  """Tokenise the lower-cased 'french' field of `row` and register unseen tokens in `source_voc`.

  Each new token receives the next free id (the current vocabulary size).
  Returns the list of tokens.
  """
  tokens = nltk.word_tokenize(row['french'].lower(), language='french')
  for tok in tokens:
    # setdefault leaves tokens that are already known untouched.
    source_voc.setdefault(tok, len(source_voc))
  return tokens

def target_tokenize_get_voc(row):
  """Tokenise the lower-cased 'english' field of `row` and register unseen tokens in `target_voc`.

  Each new token receives the next free id (the current vocabulary size).
  Returns the tokens wrapped between "[START]" and "[END]".
  """
  tokens = nltk.word_tokenize(row['english'].lower())
  for tok in tokens:
    # setdefault leaves tokens that are already known untouched.
    target_voc.setdefault(tok, len(target_voc))
  return ["[START]", *tokens, "[END]"]

# Tokenise every row. Each pass also fills the matching vocabulary as a side effect,
# so re-running this cell keeps the ids that were already assigned.
df['tokenized_fr'] = df.apply(source_tokenize_get_voc, axis=1)
print('Done')
df['tokenized_en'] = df.apply(target_tokenize_get_voc, axis=1)
print('Done')

Done
Done


In [5]:
def source_seq_to_int(row, max_length=100):
  """Convert the 'tokenized_fr' tokens of `row` to ids from `source_voc`.

  At most `max_length` tokens are kept (all of them when `max_length` is None);
  tokens missing from the vocabulary map to the "[UNK]" id. The row itself is
  not modified.
  """
  tokens = row['tokenized_fr']
  kept = tokens if max_length is None else tokens[:max_length]
  unknown_id = source_voc["[UNK]"]
  return [source_voc.get(tok, unknown_id) for tok in kept]

def target_seq_to_int(row, max_length=100):
  """Convert the 'tokenized_en' tokens of `row` to ids from `target_voc`.

  At most `max_length` tokens are kept (all of them when `max_length` is None);
  tokens missing from the vocabulary map to the "[UNK]" id. The row itself is
  not modified.
  """
  tokens = row['tokenized_en']
  # NOTE(review): the stored tokens end with "[END]"; cutting at max_length drops
  # that marker for longer sentences — confirm this is intended.
  kept = tokens if max_length is None else tokens[:max_length]
  unknown_id = target_voc["[UNK]"]
  return [target_voc.get(tok, unknown_id) for tok in kept]

# Map the token lists to integer id lists (default max_length of 100 tokens per sentence).
df['int_fr'] = df.apply(source_seq_to_int, axis=1)
print('Done')
df['int_en'] = df.apply(target_seq_to_int, axis=1)
print('Done')

Done
Done


In [6]:
# Keep only the raw / tokenised / integer-encoded columns, source columns first.
# Note that this rebinds `df`, so any other column of the loaded CSV is dropped from here on.
df = df[['french', 'tokenized_fr', 'int_fr', 'english', 'tokenized_en', 'int_en']]
df

Unnamed: 0,french,tokenized_fr,int_fr,english,tokenized_en,int_en
0,elle conduisait la voiture noire brillante.,"[elle, conduisait, la, voiture, noire, brillan...","[4, 5, 6, 7, 8, 9, 10]",she was driving the shiny black car.,"[[START], she, was, driving, the, shiny, black...","[2, 4, 5, 6, 7, 8, 9, 10, 11, 3]"
1,"chine est jamais calme à l'automne, mais il es...","[chine, est, jamais, calme, à, l'automne, ,, m...","[11, 12, 13, 14, 15, 16, 17, 18, 19, 12, 20, 2...","china is never quiet during autumn, but it is ...","[[START], china, is, never, quiet, during, aut...","[2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 13, 21..."
2,N'es-tu pas surprise de me voir ici ?,"[n'es-tu, pas, surprise, de, me, voir, ici, ?]","[24, 25, 26, 27, 28, 29, 30, 31]",Aren't you surprised to see me here?,"[[START], are, n't, you, surprised, to, see, m...","[2, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3]"
3,Elle divorça de son mari.,"[elle, divorça, de, son, mari, .]","[4, 32, 27, 33, 34, 10]",She divorced her husband.,"[[START], she, divorced, her, husband, ., [END]]","[2, 4, 34, 35, 36, 11, 3]"
4,"paris est parfois le gel en juin, mais il est ...","[paris, est, parfois, le, gel, en, juin, ,, ma...","[35, 12, 20, 36, 37, 22, 23, 17, 18, 19, 12, 3...","paris is sometimes freezing during june, but i...","[[START], paris, is, sometimes, freezing, duri...","[2, 37, 13, 21, 38, 16, 24, 18, 19, 20, 13, 39..."
...,...,...,...,...,...,...
320157,"chine est calme au mois de novembre, et il est...","[chine, est, calme, au, mois, de, novembre, ,,...","[11, 12, 14, 109, 116, 27, 200, 17, 44, 19, 12...","china is quiet during november, and it is neve...","[[START], china, is, quiet, during, november, ...","[2, 12, 13, 15, 16, 185, 18, 43, 20, 13, 14, 3..."
320158,pensez - vous que la traduction entre est diff...,"[pensez, -, vous, que, la, traduction, entre, ...","[2860, 406, 88, 223, 6, 1154, 298, 12, 1068, 1...",do you think translating between chinese and e...,"[[START], do, you, think, translating, between...","[2, 164, 27, 104, 902, 273, 1232, 43, 192, 13,..."
320159,Pouvez-vous supporter la manière avec laquelle...,"[pouvez-vous, supporter, la, manière, avec, la...","[1935, 536, 6, 938, 206, 1041, 19, 241, 8164, 31]",Can you put up with the way he behaves?,"[[START], can, you, put, up, with, the, way, h...","[2, 672, 27, 146, 422, 189, 7, 578, 63, 6462, ..."
320160,il va en inde en mai prochain.,"[il, va, en, inde, en, mai, prochain, .]","[19, 700, 22, 42, 22, 99, 814, 10]",he is going to india next may.,"[[START], he, is, going, to, india, next, may,...","[2, 63, 13, 369, 29, 41, 664, 103, 11, 3]"


In [7]:
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    """Dataset over a DataFrame that has 'int_fr' and 'int_en' columns of integer id lists."""

    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return (source ids, target ids) as tensors for the row at position `idx`."""
        row = self.data.iloc[idx]
        return torch.tensor(row['int_fr']), torch.tensor(row['int_en'])

# Wrap the prepared DataFrame; items are (source ids, target ids) tensor pairs.
dataset = TranslationDataset(df)

In [8]:
def create_source_mask(source_tensor):
  """Build the encoder self-attention padding mask.

  source_tensor : token ids (0 = padding) ; shape : batch x n_token
  Returns a bool tensor of shape batch x n_token x n_token where entry
  [b, i, j] is True when source token j of sentence b is not padding.
  The mask is created on the same device as `source_tensor`.
  """
  not_pad = source_tensor != 0
  # A column of True values broadcasts the key-padding row over every query position.
  # It is created as bool on the input's device, so no float intermediate and no device mismatch.
  every_query = torch.ones(source_tensor.size(-1), 1, dtype=torch.bool, device=source_tensor.device)
  return not_pad.unsqueeze(1) & every_query

def create_target_mask(target_tensor):
  """Build the decoder self-attention mask (padding and future tokens hidden).

  target_tensor : token ids (0 = padding) ; shape : batch x n_token
  Returns a bool tensor of shape batch x n_token x n_token where entry
  [b, i, j] is True when target token j is not padding and j <= i.
  The mask is created on the same device as `target_tensor`.
  """
  device = target_tensor.device
  not_pad = target_tensor != 0
  # Broadcast the key-padding row over every query position, as bool on the input's device.
  every_query = torch.ones(target_tensor.size(-1), 1, dtype=torch.bool, device=device)
  mask = not_pad.unsqueeze(1) & every_query
  # Lower-triangular matrix: a position may attend to itself and to earlier positions only.
  causal = torch.ones(mask.size(1), mask.size(1), device=device).tril().bool()
  return mask & causal.unsqueeze(0)

def create_cross_mask(source_tensor, target_tensor):
  """Build the decoder-to-encoder attention padding mask.

  source_tensor : source token ids (0 = padding) ; shape : batch x n_source_token
  target_tensor : target token ids ; only its last dimension (n_target_token) is used
  Returns a bool tensor of shape batch x n_target_token x n_source_token where
  entry [b, i, j] is True when source token j of sentence b is not padding.
  The mask is created on the same device as `source_tensor`.
  """
  not_pad = source_tensor != 0
  # One row per target (query) position, as bool on the source's device.
  every_query = torch.ones(target_tensor.size(-1), 1, dtype=torch.bool, device=source_tensor.device)
  return not_pad.unsqueeze(1) & every_query

In [9]:
from torch.utils.data import DataLoader

def collate_fn(batch):
  """Pad a list of (source ids, target ids) pairs into batch tensors and build the attention masks.

  Returns (src_input, tgt_input, tgt_output, source_mask, target_mask, cross_mask).
  `tgt_input` is each target without its last token and `tgt_output` is each
  target without its first token, i.e. the same sequence shifted by one.
  Padding uses pad_sequence's default value 0.
  """
  sources, targets = zip(*batch)
  pad = torch.nn.utils.rnn.pad_sequence

  src_input = pad(sources, batch_first=True)
  tgt_input = pad([seq[:-1] for seq in targets], batch_first=True)
  tgt_output = pad([seq[1:] for seq in targets], batch_first=True)

  masks = (
    create_source_mask(src_input),
    create_target_mask(tgt_input),
    create_cross_mask(src_input, tgt_input),
  )
  return (src_input.long(), tgt_input.long(), tgt_output.long(), *masks)

# Shuffled batches of 256 sentence pairs; padding and mask construction happen in collate_fn.
# pin_memory=True pairs with the non_blocking host-to-device copies done in train_model.
dataloader = DataLoader(dataset, batch_size=256, shuffle=True, collate_fn=collate_fn, num_workers=4, pin_memory=True)

In [10]:
import time

def train_model(model, loss_function, optimizer, data_loader, scheduler=None):
    scaler = torch.amp.GradScaler()
    start_time = time.time()
    progress_interval = len(data_loader) // 3
    
    model.train()

    current_loss = 0.0
    counter = 0

    for i, (src_input, tgt_input, tgt_output, src_mask, tgt_mask, cross_mask) in enumerate(data_loader):
        optimizer.zero_grad()
        
        src_input = src_input.to(device, non_blocking=True)
        tgt_input = tgt_input.to(device, non_blocking=True)
        tgt_output = tgt_output.to(device, non_blocking=True)
        src_mask = src_mask.to(device, non_blocking=True)
        tgt_mask = tgt_mask.to(device, non_blocking=True)
        cross_mask = cross_mask.to(device, non_blocking=True)

        with torch.amp.autocast(device_type='cuda:0', dtype=torch.float16): 
            out = model(src_input, tgt_input, src_mask, tgt_mask, cross_mask)

            loss = loss_function(out.contiguous().view(-1, out.size(-1)),
                                tgt_output.contiguous().view(-1))

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
        
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        
        current_loss += loss
        counter += 1

        if (i + 1) % progress_interval == 0 or (i + 1) == len(data_loader):
            percent_complete = (i + 1) / len(data_loader) * 100
            print(f"Progress: {percent_complete:.1f}% - Batch: {i + 1}/{len(data_loader)}; Loss: {current_loss / counter:.6f}")
            print(f"Lr: {scheduler.get_last_lr()[0]:.6f}")
            current_loss = 0.0
            counter = 0
            
    end_time = time.time()  # Arrête le chronomètre
    elapsed_time = end_time - start_time
    print(f"Terminé en {elapsed_time:.2f} secondes")

In [16]:
if __name__ == '__main__':
    # Use the first GPU when CUDA is available, otherwise the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    n_epoch = 12
    
    # Positional arguments: source vocab size, target vocab size, n_head=6, d_model=360, d_ffn=1440, N=4.
    model = Transformer(len(source_voc), len(target_voc), 6, 360, 1440, 4)
    model = torch.nn.DataParallel(model)
    model = model.to(device)

    # ignore_index=0 is the "[PAD]" id, so padded target positions do not contribute to the loss.
    # The model already returns log-softmax scores (see Decoder.forward).
    loss_function = torch.nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.AdamW(model.parameters(), lr = 5e-5, weight_decay=0.1)
    # The schedule is sized for one scheduler step per batch over all epochs,
    # which matches train_model calling scheduler.step() once per batch.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3, steps_per_epoch=len(dataloader), epochs=n_epoch)

    for i in range(n_epoch):
        print("Epoch :", i+1)
        train_model(model, loss_function, optimizer, dataloader, scheduler)

Epoch : 1
Progress: 33.3% - Batch: 417/1251; Loss: 5.579123
Lr: 0.000060
Progress: 66.7% - Batch: 834/1251; Loss: 2.491803
Lr: 0.000119
Progress: 100.0% - Batch: 1251/1251; Loss: 1.527998
Lr: 0.000212
Terminé en 184.72 secondes
Epoch : 2
Progress: 33.3% - Batch: 417/1251; Loss: 1.054403
Lr: 0.000330
Progress: 66.7% - Batch: 834/1251; Loss: 0.832548
Lr: 0.000464
Progress: 100.0% - Batch: 1251/1251; Loss: 0.706036
Lr: 0.000604
Terminé en 185.01 secondes
Epoch : 3
Progress: 33.3% - Batch: 417/1251; Loss: 0.578533
Lr: 0.000736
Progress: 66.7% - Batch: 834/1251; Loss: 0.551993
Lr: 0.000850
Progress: 100.0% - Batch: 1251/1251; Loss: 0.527901
Lr: 0.000936
Terminé en 185.22 secondes
Epoch : 4
Progress: 33.3% - Batch: 417/1251; Loss: 0.433877
Lr: 0.000987
Progress: 66.7% - Batch: 834/1251; Loss: 0.432932
Lr: 0.001000
Progress: 100.0% - Batch: 1251/1251; Loss: 0.423294
Lr: 0.000994
Terminé en 184.09 secondes
Epoch : 5
Progress: 33.3% - Batch: 417/1251; Loss: 0.341603
Lr: 0.000981
Progress: 66.7%

In [17]:
def get_input(sequence, voc):
  """Map each word of `sequence` to its id in `voc`.

  Words missing from `voc` map to voc["[UNK]"]. The fallback is read from the
  vocabulary passed in, not from a global one, so the function works with any
  vocabulary that defines "[UNK]". Returns a new list; `sequence` is not modified.
  """
  unknown_id = voc["[UNK]"]
  return [voc.get(word, unknown_id) for word in sequence]

In [18]:
def infer(sequences_fr, model, device, max_len=100):
    """Greedy-decode an English translation for each French sentence.

    Parameters
    ----------
    sequences_fr : iterable of one-element lists, each holding a French sentence string.
    model : translation model called as model(source, target, source_mask, target_mask, cross_mask).
    device : device the input tensors are moved to before calling the model.
    max_len : maximum number of decoder input tokens ("[START]" included) per sentence.
        Decoding stops there even if "[END]" was not produced, so a model that
        never emits "[END]" cannot loop forever.

    Returns a list of one-element lists containing the space-joined predicted tokens
    (without "[START]" / "[END]"), in the same order as the inputs.
    """
    start_id = target_voc["[START]"]
    end_id = target_voc["[END]"]
    unknown_id = source_voc["[UNK]"]
    # Reverse lookup built once instead of scanning the vocabulary for every token.
    id_to_token = {index: token for token, index in target_voc.items()}

    model.eval()
    res = []
    # No gradients are needed for decoding; skipping them avoids building autograd graphs.
    with torch.no_grad():
        for seq in sequences_fr:
            words = nltk.word_tokenize(seq[0].lower(), language='french')
            input_fr = torch.tensor([source_voc.get(word, unknown_id) for word in words]).unsqueeze(0).long()
            src_mask = create_source_mask(input_fr)
            input_fr_device = input_fr.to(device)
            src_mask_device = src_mask.to(device)

            generated = []
            next_token = start_id
            while next_token != end_id and len(generated) < max_len:
                generated.append(next_token)
                input_en = torch.tensor(generated).unsqueeze(0).long()
                tgt_mask = create_target_mask(input_en)
                cross_mask = create_cross_mask(input_fr, input_en)
                log_probs = model(input_fr_device, input_en.to(device), src_mask_device,
                                  tgt_mask.to(device), cross_mask.to(device))
                # Greedy choice: most likely token at the last decoded position.
                next_token = torch.argmax(log_probs[:, -1], dim=-1).item()

            # Skip the leading "[START]"; "[END]" is never appended to `generated`.
            res.append([' '.join(id_to_token[token_id] for token_id in generated[1:])])
    return res

In [53]:
# Demo sentences: infer() reads seq[0], so each entry is a one-element list holding the sentence.
# `model` and `device` come from the training cell above.
seq_fr = [
    ["Il pleut beaucoup aujourd'hui, donc nous restons à la maison."],
    ["Je parle beaucoup mieux anglais que toi."],
    ["Nous avons mangé des crêpes pour le petit déjeuner."],
    ["Le chien court dans le jardin. Il est très rapide et il aime jouer avec la balle."],
    ["Je vais à l'école. J'aime jouer avec mes amis."],
    ["Nous allons au parc. Il y a beaucoup d’arbres et nous allons jouer au football."],
    ["Le ciel devient sombre, la nuit tombe. Il est l'heure de regarder un film à la télé."]
]
infer(seq_fr, model, device)

[["it 's raining today , so we stay home ."],
 ['i speak english a lot better than you do .'],
 ['we ate pancakes for breakfast .'],
 ["the dog runs in the garden . he 's very fast and he likes to play with the ball ."],
 ['i go to school . i like to play with my friends .'],
 ['we go to the park . there are a lot of trees and we are going to play soccer .'],
 ["the sky is getting dark , the night is falling . it 's time to watch a movie on tv ."]]