<a href="https://colab.research.google.com/github/QaziSaim/Fine-Tune-Projects/blob/main/Pytorch_encoder_decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch==2.4.0 torchtext==0.18.0


Collecting torch==2.4.0
  Using cached torch-2.4.0-cp312-cp312-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.4.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.4.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.4.0)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-man

In [None]:
import re
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# ----------------------------
# 1. Load & Clean Dataset
# ----------------------------
def load_data(path, max_pairs=50000):
    """Read tab-separated sentence pairs from a UTF-8 text file.

    Each line is split on tabs after stripping surrounding whitespace. The
    first field is treated as the English sentence and the second as the
    French one; any further fields are ignored. Lines with fewer than two
    fields (blank lines, lines without a tab) are skipped.

    Args:
        path: Location of the text file to read.
        max_pairs: Maximum number of sentence pairs to return. A falsy value
            (``None`` or ``0``) disables the limit.

    Returns:
        A tuple ``(eng_texts, fra_texts)`` of two equally long lists holding
        the lowercased sentences.

    Raises:
        OSError: If the file cannot be opened.
    """
    eng_texts, fra_texts = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Compare against the number of pairs kept so far, not the raw
            # line index, so skipped lines do not eat into the budget.
            if max_pairs and len(eng_texts) >= max_pairs:
                break
            parts = line.strip().split("\t")
            if len(parts) < 2:
                continue
            eng_texts.append(parts[0].lower())
            fra_texts.append(parts[1].lower())
    return eng_texts, fra_texts

# Read at most 50,000 pairs from the corpus file and show the first pair
# as a quick sanity check.
eng_texts, fra_texts = load_data(path="/content/fra.txt", max_pairs=50000)
print("Sample:", eng_texts[0], "->", fra_texts[0])

# ----------------------------
# 2. Tokenizer
# ----------------------------
def tokenizer(text):
    """Split ``text`` into a list of tokens.

    A token is either a maximal run of word characters or a single
    character that is neither a word character nor whitespace.
    Whitespace itself never appears in the result.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return token_pattern.findall(text)

# Tokenize every sentence; each entry is the token list for one sentence.
eng_tokens = list(map(tokenizer, eng_texts))
fra_tokens = list(map(tokenizer, fra_texts))

# ----------------------------
# 3. Build Vocab
# ----------------------------
specials = ["<pad>", "<sos>", "<eos>", "<unk>"]

# English vocabulary: built with min_freq=2, and lookups of unknown tokens
# fall back to the index of <unk>.
eng_vocab = build_vocab_from_iterator(eng_tokens, min_freq=2, specials=specials)
eng_vocab.set_default_index(eng_vocab["<unk>"])

# French vocabulary: same settings as the English side.
fra_vocab = build_vocab_from_iterator(fra_tokens, min_freq=2, specials=specials)
fra_vocab.set_default_index(fra_vocab["<unk>"])

print("English vocab size:", len(eng_vocab))
print("French vocab size:", len(fra_vocab))

# ----------------------------
# 4. Dataset & DataLoader
# ----------------------------
class TranslationDataset(Dataset):
    """Dataset of parallel, already-tokenized sentences.

    Indexing returns a ``(src, trg)`` pair of index tensors. Each tensor
    holds the vocabulary index of ``<sos>``, then one index per token, then
    the index of ``<eos>``.
    """

    def __init__(self, src, trg, src_vocab, trg_vocab):
        # src / trg: sequences of token lists, aligned by position.
        self.src, self.trg = src, trg
        # Mappings from token to integer index, one per language.
        self.src_vocab, self.trg_vocab = src_vocab, trg_vocab

    def __len__(self):
        # The dataset length follows the source side.
        return len(self.src)

    @staticmethod
    def _encode(tokens, vocab):
        """Convert one token list to a tensor framed by <sos> and <eos>."""
        ids = [vocab["<sos>"]]
        ids.extend(vocab[tok] for tok in tokens)
        ids.append(vocab["<eos>"])
        return torch.tensor(ids)

    def __getitem__(self, idx):
        src_tensor = self._encode(self.src[idx], self.src_vocab)
        trg_tensor = self._encode(self.trg[idx], self.trg_vocab)
        return src_tensor, trg_tensor

def collate_fn(batch):
    """Combine ``(src, trg)`` tensor pairs into two padded batch tensors.

    Source sequences are padded with the ``<pad>`` index of the module-level
    ``eng_vocab`` and target sequences with that of ``fra_vocab``. Both
    results are batch-first.
    """
    sources, targets = zip(*batch)
    padded_src = pad_sequence(sources, batch_first=True, padding_value=eng_vocab["<pad>"])
    padded_trg = pad_sequence(targets, batch_first=True, padding_value=fra_vocab["<pad>"])
    return padded_src, padded_trg

# Wrap the tokenized corpus and serve it in shuffled mini-batches of 32,
# padding each batch through collate_fn.
dataset = TranslationDataset(
    src=eng_tokens,
    trg=fra_tokens,
    src_vocab=eng_vocab,
    trg_vocab=fra_vocab,
)
dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=32,
)

# ----------------------------
# 5. Try one batch
# ----------------------------
# Draw a single batch and print its shapes plus the first ten indices of
# the first source and target sequence.
src_batch, trg_batch = next(iter(dataloader))
print("SRC batch shape:", src_batch.shape)
print("TRG batch shape:", trg_batch.shape)
print("Example src:", src_batch[0, :10])
print("Example trg:", trg_batch[0, :10])


OSError: /usr/local/lib/python3.12/dist-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs