<a href="https://colab.research.google.com/github/QaziSaim/Transformer-Text-Generation/blob/main/Biggest_Transformer_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
%pip install kagglehub

In [2]:
from google.colab import userdata
import os
os.environ['KAGGLE_API_KEY'] = userdata.get('kaggle_api_key')
os.environ['KAGGLE_USERNAME'] = userdata.get('kaggle_username')

### Downloading The Dataset

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("dhruvildave/en-fr-translation-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/dhruvildave/en-fr-translation-dataset?dataset_version_number=2...


100%|██████████| 2.54G/2.54G [00:13<00:00, 200MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/dhruvildave/en-fr-translation-dataset/versions/2


In [4]:
%ls /root/.cache/kagglehub/datasets/dhruvildave/en-fr-translation-dataset/versions/2
# %ls /kaggle/input/en-fr-translation-dataset/en-fr.csv

en-fr.csv


In [5]:
import pandas as pd
# path = '/kaggle/input/en-fr-translation-dataset/en-fr.csv'
path = '/root/.cache/kagglehub/datasets/dhruvildave/en-fr-translation-dataset/versions/2/en-fr.csv'
df = pd.read_csv('/root/.cache/kagglehub/datasets/dhruvildave/en-fr-translation-dataset/versions/2/en-fr.csv')
# df = pd.read_csv('/kaggle/input/en-fr-translation-dataset/en-fr.csv')

In [6]:
df.dropna(inplace=True)

In [7]:
# df.drop_duplicates(inplace=True)

In [8]:
# en_file = open('english.txt','w',encoding='utf-8')
# fr_file = open('french.txt','w',encoding='utf-8')

# for chunk in pd.read_csv(path, chunksize=10000):
#   for en, fr in zip(chunk['en'],chunk['fr']):
#     if isinstance(en,str) and isinstance(fr,str):
#       en_file.write(en.strip() + "\n")
#       fr_file.write(fr.strip() + "\n")
# en_file.close()
# fr_file.close()

In [9]:
%%capture
%pip install sentencepiece


### Not Good takes lots of time

In [10]:
# import sentencepiece as spm
# spm.SentencePieceTrainer.Train(
#     input='english.txt,french.txt',
#     model_prefix = 'enfr',
#     vocab_size=16000,
#     character_coverage=1.0,
#     model_type='bpe'
# )

In [11]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace

In [12]:
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = Whitespace()

In [13]:
trainer = trainers.BpeTrainer(vocab_size=32000,
                              min_frequency=2,
                              special_tokens=['<pad>','<unk>','<s>','</s>'])

In [14]:
def batch_iterator():
    for en, fr in zip(df["en"].tolist(), df["fr"].tolist()):
        yield en
        yield fr

In [15]:
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)


In [16]:
tokenizer.save('bpe_tokenizer.json')

In [17]:
import os
import torch
import torch.nn as nn


In [18]:
from torch.utils.data import DataLoader,Dataset

In [19]:
class TranslationDataset(Dataset):
  def __init__(self, df, tokenizer, max_len = 60):
    self.df = df
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.pad_id = tokenizer.token_to_id('<pad>')
    self.bos_id = tokenizer.token_to_id('<s>')
    self.eos_id = tokenizer.token_to_id('</s>')

  def encode(self, text):
    ids = [self.bos_id] + self.tokenizer.encode(text).ids + [self.eos_id]
    if len(ids) < self.max_len:
      ids += [self.pad_id] * (self.max_len - len(ids))

    return torch.tensor(ids[:self.max_len])

  def __len__(self):
    return len(self.df)


  def __getitem__(self, idx):
    en = self.df.iloc[idx]['en']
    fr = self.df.iloc[idx]['fr']
    src = self.encode(en)
    tgt = self.encode(fr)

    return src, tgt



In [37]:
dataset = TranslationDataset(df.sample(500000),tokenizer)

In [38]:
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [39]:
class TransformerTranslator(nn.Module):

  def __init__(self, vocab_size, d_model=256,nhead=8, num_layers=4, dim_dff = 512):
    super().__init__()
    self.src_emb = nn.Embedding(vocab_size,d_model)
    self.tgt_emb = nn.Embedding(vocab_size,d_model)
    self.pos_emb = nn.Embedding(5000, d_model)

    self.transformer = nn.Transformer(
        d_model=d_model, nhead=nhead,
        num_encoder_layers=num_layers,
        num_decoder_layers=num_layers,
        dim_feedforward=dim_dff,
        batch_first = True

    )

    self.fc_out = nn.Linear(d_model, vocab_size)
    self.d_model = d_model

  def forward(self, src, tgt):
    pad_id = 0
    src_mask = (src == pad_id)
    tgt_mask = (tgt == pad_id)
    src_positions = torch.arange(0, src.size(1), device=src.device).unsqueeze(0)
    tgt_positions = torch.arange(0, tgt.size(1), device=tgt.device).unsqueeze(0)
    src_emb = self.src_emb(src) + self.pos_emb(src_positions)
    tgt_emb = self.tgt_emb(tgt) + self.pos_emb(tgt_positions)

    out = self.transformer(src_emb,
                           tgt_emb,
                           src_key_padding_mask = src_mask,
                           tgt_key_padding_mask = tgt_mask)
    return self.fc_out(out)


In [40]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocab_size = tokenizer.get_vocab_size()
model = TransformerTranslator(vocab_size).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id('<pad>'))

In [41]:
print('Training Started')
for epoch in range(20):
  total_loss = 0
  for src, tgt in train_loader:

    src, tgt = src.to(device), tgt.to(device)
    optimizer.zero_grad()
    output = model(src, tgt[:,:-1])
    loss = criterion(output.reshape(-1, vocab_size), tgt[:, 1:].reshape(-1))
    # print(src,src.shape,src.size(),src.ndim)
    # print(tgt[:,:-1],tgt[:,:-1].shape,tgt[:,:-1].size,tgt[:,:-1].ndim)
    # output = model(src.transpose(0, 1), tgt[:,:-1].transpose(0,1))
    # loss = criterion(output.reshape(-1, vocab_size), tgt[:, 1:].reshape(-1))
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  print(f'Epoch {epoch+1} - Loss: {total_loss/len(train_loader):.4f}')

Training Started
Epoch 1 - Loss: 0.3241
Epoch 2 - Loss: 0.0301
Epoch 3 - Loss: 0.0280


KeyboardInterrupt: 

In [42]:
torch.save(model.state_dict(), "transformer_en_fr.pt")
print("✅ Model trained and saved as transformer_en_fr.pt")


✅ Model trained and saved as transformer_en_fr.pt


In [43]:
import torch
from tokenizers import Tokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'


tokenizer = Tokenizer.from_file('bpe_tokenizer.json')

vocab_size = tokenizer.get_vocab_size()
model = TransformerTranslator(vocab_size)
model.load_state_dict(torch.load('transformer_en_fr.pt',map_location=device))
model.to(device)
model.eval()

TransformerTranslator(
  (src_emb): Embedding(32000, 256)
  (tgt_emb): Embedding(32000, 256)
  (pos_emb): Embedding(5000, 256)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
   

In [44]:
def translate(model, sentence, tokenizer, maxlen=60):
  model.eval()

  pad_id = tokenizer.token_to_id('<pad>')
  bos_id = tokenizer.token_to_id('<s>')
  eos_id = tokenizer.token_to_id('</s>')

  src_ids = tokenizer.encode(sentence).ids
  src = torch.tensor([ [bos_id] + src_ids + [eos_id]]).to(device)


  tgt = torch.tensor([[bos_id]],device=device)

  for _ in range(maxlen):
    with torch.no_grad():

      out = model(src,tgt)
      next_token = out[:, -1:]
      next_token = torch.argmax(next_token,dim=-1).item()

    tgt = torch.cat([tgt, torch.tensor([[next_token]],device=device)],dim=1)

    if next_token == eos_id:
      break
  pred_tokens = tgt[0].tolist()[1:-1]
  translation = tokenizer.decode(pred_tokens)
  return translation



In [45]:
english_sentence = "I love learning new languages."
french_translation = translate(model, english_sentence, tokenizer)
print(f"English: {english_sentence}")
print(f"French: {french_translation}")


  output = torch._nested_tensor_from_mask(


English: I love learning new languages.
French: directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices directrices secteurs secteurs secteurs secteurs secteurs directrices directrices secteurs directrices directrices secteurs directrices directrices secteurs directrices directrices directrices directrices secteurs directrices directrices secteurs secteurs secteurs secteurs secteurs directrices secteurs secteurs secteurs secteurs secteurs directrices secteurs -


In [46]:
import torchsummary


TypeError: 'module' object is not callable