In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class BilingualDataset(Dataset):
    """Map-style dataset that turns translation pairs into fixed-length tensors.

    Every item of ``ds`` is indexed as ``item['translation'][lang]`` to obtain
    the source and target sentences. Each sentence is tokenized, wrapped with
    the special tokens it needs and right-padded with ``[PAD]`` up to
    ``seq_len``.

    Args:
        ds: Indexable collection of translation records supporting ``len()``.
        tokenizer_src: Tokenizer for the source language; must provide
            ``encode(text).ids``.
        tokenizer_tgt: Tokenizer for the target language; must provide
            ``encode(text).ids`` and ``token_to_id(token)``.
        src_lang: Key of the source sentence inside ``item['translation']``.
        tgt_lang: Key of the target sentence inside ``item['translation']``.
        seq_len: Length every returned sequence is padded to.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # NOTE(review): the special-token ids are looked up in the *target*
        # tokenizer only, yet they are also used to build the encoder input.
        # This presumably relies on both tokenizers assigning the same ids to
        # [SOS]/[EOS]/[PAD] - confirm against the tokenizer-building code.
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        """Return the number of translation pairs in the wrapped dataset."""
        return len(self.ds)

    def _padding(self, count):
        """Return a 1-D int64 tensor holding ``count`` copies of the [PAD] id."""
        # torch.full avoids building a Python list of one-element tensors and
        # relying on their implicit conversion to scalars.
        return torch.full((count,), self.pad_token.item(), dtype=torch.int64)

    def __getitem__(self, idx):
        """Build the model inputs for the ``idx``-th translation pair.

        Returns:
            A dict with:
              - ``encoder_input``: ``[SOS] src tokens [EOS] [PAD]...``, shape (seq_len)
              - ``decoder_input``: ``[SOS] tgt tokens [PAD]...``, shape (seq_len)
              - ``encoder_mask``: int mask, 1 where ``encoder_input`` is not
                ``[PAD]``, shape (1, 1, seq_len)
              - ``decoder_mask``: padding mask AND-ed with the causal mask,
                shape (1, seq_len, seq_len)
              - ``label``: ``tgt tokens [EOS] [PAD]...``, shape (seq_len)
              - ``src_text`` / ``tgt_text``: the raw sentences

        Raises:
            ValueError: If either sentence does not fit into ``seq_len`` once
                its special tokens are added.
        """
        # Fetch the source and target sentence of one record.
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Turn each sentence into a list of token ids.
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder input carries both [SOS] and [EOS] (hence -2); the
        # decoder input carries only [SOS] and the label only [EOS] (hence -1).
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # A negative padding count means the sentence does not fit in seq_len.
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        enc_tokens = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_tokens = torch.tensor(dec_input_tokens, dtype=torch.int64)
        # The same padding tensor serves decoder input and label: torch.cat
        # copies its inputs, so sharing them is safe.
        dec_padding = self._padding(dec_num_padding_tokens)

        # [SOS] + source tokens + [EOS] + padding
        encoder_input = torch.cat(
            [
                self.sos_token,
                enc_tokens,
                self.eos_token,
                self._padding(enc_num_padding_tokens),
            ],
            dim=0,
        )

        # [SOS] + target tokens + padding
        decoder_input = torch.cat(
            [
                self.sos_token,
                dec_tokens,
                dec_padding,
            ],
            dim=0,
        )

        # target tokens + [EOS] + padding (the decoder input shifted by one)
        label = torch.cat(
            [
                dec_tokens,
                self.eos_token,
                dec_padding,
            ],
            dim=0,
        )

        # Sanity check: every sequence must be exactly seq_len long.
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            # 1 for real tokens, 0 for [PAD], e.g. [1, 1, 1, 1, 0, 0] when the
            # last two positions are padding.
            # NOTE(review): presumably broadcast over the attention scores so
            # that padded positions are ignored - confirm in the attention code.
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),  # (1, 1, seq_len)
            # Padding mask (1, seq_len) AND causal mask (1, seq_len, seq_len):
            # an entry is non-zero only where the column token is not [PAD]
            # and the column index does not exceed the row index.
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),  # (1, seq_len, seq_len)
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }
    
def causal_mask(size):
    """Return a boolean (1, size, size) mask that hides future positions.

    Entry ``[0, i, j]`` is True when ``j <= i`` (on or below the main
    diagonal) and False when ``j > i``.
    """
    # Ones strictly above the main diagonal mark the "future" positions.
    above_diagonal = torch.ones((1, size, size)).triu(diagonal=1).to(torch.int)
    # Everything that is not a future position is allowed.
    return above_diagonal == 0

 "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), 

 
 The comparison puts a 1 at every position whose token is not the pad token
 and a 0 at every position whose token is the pad token,
 so for a sequence with 6 real tokens followed by 4 pad tokens the encoder mask will be

 [1,1,1,1,1,1,0,0,0,0]
 

![](PicturesPart2/Billingual.PNG)

In [None]:
# Excellent question! The BilingualDataset class we've been discussing is a custom Dataset 
# class that's designed to work with PyTorch's DataLoader. Let's clarify how these 
# two components fit together.

# 1. Dataset (Like BilingualDataset):

# Purpose: The Dataset class is responsible for accessing and preparing 
# individual data samples. It's like an interface to your data source 
# (e.g., a list of sentences, a file, a database).

# Key Methods:

# __init__(self, ...): This is the constructor. It initializes the
# dataset, usually by loading data from a file or other source. In
# our case, it receives the raw dataset, tokenizers, language codes, and sequence length.

# __len__(self): This method returns the total number of samples
# in the dataset. The DataLoader uses this to know how many batches to create.

# __getitem__(self, idx): This is the most important method.
# Given an index idx, it returns the idx-th data sample as a dictionary.
# In our case, it returns a dictionary containing:

# encoder_input

# decoder_input

# encoder_mask

# decoder_mask

# label

# src_text

# tgt_text

# 2. DataLoader:

# Purpose: The DataLoader is responsible for batching, shuffling,
# and parallelizing the data loading process. It takes a Dataset object
# as input and provides an iterable that yields batches of data.

# How it Works:

# The DataLoader receives a Dataset object (e.g., our BilingualDataset).

# It uses the __len__ method of the Dataset to determine the size of the dataset.

# It uses the __getitem__ method of the Dataset to fetch individual samples.

# It groups these samples into batches. The batch_size argument
# to the DataLoader controls how many samples are in each batch.

# It can shuffle the data (if shuffle=True is passed to the DataLoader).
# This is important to prevent the model from learning spurious correlations due to the order of the data.

# It can load data in parallel using multiple worker processes (controlled by the num_workers argument). This can significantly speed up data loading, especially for large datasets.

# Example of using DataLoader:

from torch.utils.data import DataLoader

# Assuming you have a 'raw_dataset' (e.g., a list of translation pairs)
# and tokenizer_src, tokenizer_tgt defined.

# 1. Create the Dataset
bilingual_dataset = BilingualDataset(
    raw_dataset,      # the raw translation pairs
    tokenizer_src,
    tokenizer_tgt,
    src_lang="en",
    tgt_lang="fr",
    seq_len=40,       # example sequence length
)

# 2. Wrap the dataset in a DataLoader: batches of 32 samples, reshuffled
#    every epoch, fetched by 4 worker processes.
data_loader = DataLoader(bilingual_dataset, batch_size=32, shuffle=True, num_workers=4)

# 3. Consume the batches in the training loop. Each tensor is pulled out of
#    the batch dict and moved to the target device in a single step.
for batch in data_loader:
    encoder_input = batch["encoder_input"].to(device)  # (batch_size, seq_len)
    decoder_input = batch["decoder_input"].to(device)  # (batch_size, seq_len)
    encoder_mask = batch["encoder_mask"].to(device)    # (batch_size, 1, 1, seq_len)
    decoder_mask = batch["decoder_mask"].to(device)    # (batch_size, 1, seq_len, seq_len)
    label = batch["label"].to(device)                  # (batch_size, seq_len)

    # The tensors are now ready to be fed to the Transformer model.
    # ... your model forward pass here ...

# Key Relationship:

# The DataLoader uses the BilingualDataset to get data. The DataLoader
# doesn't know how the data is prepared; it just calls the __getitem__ 
# method of the BilingualDataset and handles the batching, shuffling, and parallel loading.

# In Summary:

# The BilingualDataset is responsible for accessing and preparing individual data samples.

# The DataLoader is responsible for organizing these samples into 
# batches and providing them to the training loop efficiently.

# The DataLoader provides the batched data in a format suitable for
# the Transformer model. The encoder_input, decoder_input, encoder_mask, 
# and decoder_mask tensors that the DataLoader yields are the input to the Transformer.

# Now that we've clarified the role of the DataLoader, are you ready to 
# move on to the Transformer model architecture itself (embedding, positional 
# encoding, attention, etc.)?

In [None]:
 # Pretty-print every sample: its 1-based position, its id and both sides
 # of the translation pair, followed by a separator line.
 for sample_no, sample in enumerate(samples, start=1):
     print(f"Sample {sample_no}:")
     print(f"ID: {sample['id']}")
     # Assumes 'translation' is a dict keyed by language code, with 'en' as
     # the source language and 'it' as the target language.
     pair = sample['translation']
     print(f"Source: {pair['en']}")
     print(f"Target: {pair['it']}")
     print("-" * 50)


Sample 1:
- **ID**: 0
- **Source**: Source: Project Gutenberg
- **Target**: Source: www.liberliber.it/Audiobook available here

---

Sample 2:
- **ID**: 1
- **Source**: Jane Eyre
- **Target**: Jane Eyre

---

Sample 3:
- **ID**: 2
- **Source**: Charlotte Bronte
- **Target**: Charlotte Brontë

---

Sample 4:
- **ID**: 3
- **Source**: CHAPTER I
- **Target**: PARTE PRIMA

---

Sample 5:
- **ID**: 4
- **Source**: There was no possibility of taking a walk that day.
- **Target**: I. In quel giorno era impossibile passeggiare.


### Max_Sequence_Length

![](PicturesPart2/Max_length_Size.PNG)