In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors

df = pd.read_csv("/kaggle/input/high-quality-multilingual-translation-data/en-fr_train.csv")
df['translation'] = df['translation'].apply(eval)
en_sentences = df['translation'].apply(lambda x: x['en']).dropna().astype(str).tolist()
fr_sentences = df['translation'].apply(lambda x: x['fr']).dropna().astype(str).tolist()

with open("en.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(en_sentences))
with open("fr.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(fr_sentences))

def train_tokenizer(file, name):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=8000, special_tokens=["<pad>", "<s>", "</s>", "<unk>"])
    tokenizer.train([file], trainer)
    tokenizer.post_processor = processors.TemplateProcessing(
        single="<s> $A </s>",
        special_tokens=[("<s>", tokenizer.token_to_id("<s>")), ("</s>", tokenizer.token_to_id("</s>"))]
    )
    tokenizer.save(f"{name}_tokenizer.json")
    return tokenizer

en_tok = train_tokenizer("en.txt", "en")
fr_tok = train_tokenizer("fr.txt", "fr")

class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, src_tok, tgt_tok, max_len=64):
        self.src = src_texts
        self.tgt = tgt_texts
        self.src_tok = src_tok
        self.tgt_tok = tgt_tok
        self.max_len = max_len

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        src_ids = self.src_tok.encode(self.src[idx]).ids[:self.max_len]
        tgt_ids = self.tgt_tok.encode(self.tgt[idx]).ids[:self.max_len]
        src_ids += [self.src_tok.token_to_id("<pad>")] * (self.max_len - len(src_ids))
        tgt_ids += [self.tgt_tok.token_to_id("<pad>")] * (self.max_len - len(tgt_ids))
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

dataset = TranslationDataset(en_sentences, fr_sentences, en_tok, fr_tok)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

class TransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model=256, nhead=4, num_layers=3, dropout=0.1):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab, d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab, d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers, num_decoder_layers=num_layers, dropout=dropout)
        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def forward(self, src, tgt):
        src = self.src_embed(src).permute(1, 0, 2)
        tgt = self.tgt_embed(tgt).permute(1, 0, 2)
        output = self.transformer(src, tgt)
        return self.fc_out(output).permute(1, 0, 2)

model = TransformerModel(src_vocab=en_tok.get_vocab_size(), tgt_vocab=fr_tok.get_vocab_size()).cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss(ignore_index=en_tok.token_to_id("<pad>"), label_smoothing=0.1)

for epoch in range(5):
    model.train()
    total_loss = 0
    for src, tgt in loader:
        src, tgt = src.cuda(), tgt.cuda()
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        logits = model(src, tgt_input)
        loss = loss_fn(logits.reshape(-1, logits.size(-1)), tgt_output.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} completed. Loss: {total_loss:.4f}")

torch.save(model.state_dict(), "/kaggle/working/transformer_en_fr.pth")

def translate(sentence, max_len=64):
    model.eval()
    src_ids = en_tok.encode(sentence).ids[:max_len]
    src_ids += [en_tok.token_to_id("<pad>")] * (max_len - len(src_ids))
    src_tensor = torch.tensor(src_ids).unsqueeze(0).cuda()

    tgt_start = torch.tensor([fr_tok.token_to_id("<s>")]).unsqueeze(0).cuda()
    translated = tgt_start

    for _ in range(max_len - 1):
        output = model(src_tensor, translated)
        logits = output[:, -1, :]
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        translated = torch.cat([translated, next_token], dim=1)
        if next_token.item() == fr_tok.token_to_id("</s>"):
            break

    return fr_tok.decode(translated.squeeze().tolist(), skip_special_tokens=True)

print(translate("Hello I am french and I like baguettes"))











Epoch 1 completed. Loss: 22117.7016
Epoch 2 completed. Loss: 18716.6067
Epoch 3 completed. Loss: 17081.8680
Epoch 4 completed. Loss: 16006.9929
Epoch 5 completed. Loss: 15225.5646
chances ..., m irré m a questions m m m … oix m m m m m m m m m vir m m m m m m m m m m m m m m m m m m m m m m m m h m m m m m m m m m m m m m m m m


In [8]:
import pandas as pd

df = pd.read_csv("/kaggle/input/high-quality-multilingual-translation-data/en-fr_train.csv")
print(df.columns)
print(df.head())

Index(['id', 'translation'], dtype='object')
   id                                        translation
0   0  {'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'}
1   1   {'en': 'Alain-Fournier', 'fr': 'Alain-Fournier'}
2   2      {'en': 'First Part', 'fr': 'PREMIÈRE PARTIE'}
3   3              {'en': 'I', 'fr': 'CHAPITRE PREMIER'}
4   4     {'en': 'THE BOARDER', 'fr': 'LE PENSIONNAIRE'}
