In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import spacy
from collections import Counter

# Load the small spaCy pipelines for English and French.
# The model packages "en_core_web_sm" and "fr_core_news_sm" must already be
# installed in the kernel's environment (see the `spacy download` commands in the
# install cell); run that cell first on a fresh environment.
spacy_en = spacy.load("en_core_web_sm")  # English pipeline; tokenize_en uses its .tokenizer
spacy_fr = spacy.load("fr_core_news_sm")  # French pipeline; tokenize_fr uses its .tokenizer

# Tokenizers
def tokenize_en(text):
    """
    Split an English sentence into its token strings.
    Args:
        text (str): English sentence to split.
    Returns:
        list: Token strings, in the order they appear in the sentence.
    """
    pieces = []
    # Call the English pipeline's tokenizer directly and keep only each token's text.
    for tok in spacy_en.tokenizer(text):
        pieces.append(tok.text)
    return pieces

def tokenize_fr(text):
    """
    Split a French sentence into its token strings.
    Args:
        text (str): French sentence to split.
    Returns:
        list: Token strings, in the order they appear in the sentence.
    """
    pieces = []
    # Call the French pipeline's tokenizer directly and keep only each token's text.
    for tok in spacy_fr.tokenizer(text):
        pieces.append(tok.text)
    return pieces

# Vocabulary class
class Vocabulary:
    """
    Vocabulary class to handle word-to-index and index-to-word mappings.

    Indices 0-3 are reserved for the special tokens "<PAD>", "<SOS>", "<EOS>"
    and "<UNK>"; corpus words are appended after them in first-seen order.
    """
    def __init__(self, freq_threshold=2):
        """
        Initialize the vocabulary with the four special tokens only.
        Args:
            freq_threshold (int): Minimum frequency for a word to be included in the vocabulary.
        """
        self.freq_threshold = freq_threshold
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}  # Index to string mapping
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}  # String to index mapping
        self.counter = Counter()  # Cumulative word frequencies across all build_vocab calls

    def __len__(self):
        """
        Returns the size of the vocabulary (special tokens included).
        """
        return len(self.itos)

    def build_vocab(self, sentence_list):
        """
        Build (or extend) the vocabulary from a list of sentences.

        Frequencies accumulate across calls, so the method may be called
        repeatedly with additional data. Words that already have an index,
        including the special tokens, keep it and are never re-registered.
        Args:
            sentence_list (list): List of tokenized sentences.
        """
        for sentence in sentence_list:
            self.counter.update(sentence)  # Update word frequencies

        for word, count in self.counter.items():
            # Skip rare words, and words that already own an index: re-adding them
            # would overwrite stoi and leave a stale duplicate entry in itos.
            if count < self.freq_threshold or word in self.stoi:
                continue
            index = len(self.itos)
            self.stoi[word] = index  # Add word to string-to-index mapping
            self.itos[index] = word  # Add word to index-to-string mapping

    def numericalize(self, text):
        """
        Convert a list of tokens to a list of indices.
        Args:
            text (list): List of tokens.
        Returns:
            list: List of indices; unknown tokens map to the "<UNK>" index.
        """
        unk_index = self.stoi["<UNK>"]
        return [self.stoi.get(word, unk_index) for word in text]

# Dataset class
class TranslationDataset(Dataset):
    """
    Dataset class for translation tasks.

    Each item is a pair of fixed-length index tensors (English, French). Every
    sequence is framed as <SOS> ... <EOS>; over-long sentences are shortened
    before framing so the <EOS> marker is kept.
    """
    def __init__(self, en_texts, fr_texts, en_vocab, fr_vocab, seq_length):
        """
        Initialize the dataset.
        Args:
            en_texts (list): List of English sentences.
            fr_texts (list): List of French sentences (parallel to en_texts).
            en_vocab (Vocabulary): English vocabulary.
            fr_vocab (Vocabulary): French vocabulary.
            seq_length (int): Maximum sequence length.
        Raises:
            ValueError: If en_texts and fr_texts do not have the same length.
        """
        # A parallel corpus must be aligned one-to-one; fail here rather than with
        # an IndexError mid-epoch (or by silently ignoring surplus targets).
        if len(en_texts) != len(fr_texts):
            raise ValueError(
                f"en_texts and fr_texts must have the same length, "
                f"got {len(en_texts)} and {len(fr_texts)}"
            )
        self.en_texts = en_texts
        self.fr_texts = fr_texts
        self.en_vocab = en_vocab
        self.fr_vocab = fr_vocab
        self.seq_length = seq_length

    def __len__(self):
        """
        Returns the number of samples in the dataset.
        """
        return len(self.en_texts)

    def __getitem__(self, idx):
        """
        Returns a sample from the dataset.
        Args:
            idx (int): Index of the sample.
        Returns:
            tuple: (source sequence, target sequence) as long tensors of length seq_length.
        """
        en_indices = self._encode(tokenize_en(self.en_texts[idx]), self.en_vocab)
        fr_indices = self._encode(tokenize_fr(self.fr_texts[idx]), self.fr_vocab)

        return torch.tensor(en_indices, dtype=torch.long), torch.tensor(fr_indices, dtype=torch.long)  # Convert to tensors

    def _encode(self, tokens, vocab):
        """
        Frame tokens with <SOS>/<EOS>, map them to indices and pad to seq_length.
        Args:
            tokens (list): Token strings of one sentence.
            vocab (Vocabulary): Vocabulary used to numericalize the tokens.
        Returns:
            list: Exactly seq_length indices.
        """
        # Reserve two slots for <SOS> and <EOS>; trimming the sentence body first
        # keeps <EOS> in place instead of letting truncation cut it off.
        body_budget = max(self.seq_length - 2, 0)
        framed = ["<SOS>"] + tokens[:body_budget] + ["<EOS>"]
        return self.pad_sequence(vocab.numericalize(framed), self.seq_length)

    def pad_sequence(self, sequence, max_len):
        """
        Pad or truncate a sequence to the specified length.
        Args:
            sequence (list): List of indices.
            max_len (int): Maximum sequence length.
        Returns:
            list: Padded or truncated sequence.
        """
        if len(sequence) < max_len:
            sequence = sequence + [0] * (max_len - len(sequence))  # Pad with zeros (the <PAD> index)
        else:
            sequence = sequence[:max_len]  # Truncate if too long
        return sequence

# Transformer model (same as before; EncoderLayer and DecoderLayer are not defined in this cell and must already be in scope)
class Transformer(nn.Module):
    """
    Transformer model for sequence-to-sequence tasks.

    Uses separate source/target embeddings, one shared learned positional
    parameter, stacks of EncoderLayer / DecoderLayer modules and a final
    projection to target-vocabulary logits. Index 0 is treated as padding.
    """
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        """
        Initialize the Transformer model.
        Args:
            src_vocab_size (int): Size of the source vocabulary.
            tgt_vocab_size (int): Size of the target vocabulary.
            d_model (int): Dimension of the model.
            num_heads (int): Number of attention heads.
            num_layers (int): Number of encoder/decoder layers.
            d_ff (int): Dimension of the feed-forward network.
            max_seq_length (int): Maximum sequence length.
            dropout (float): Dropout rate.
        """
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)  # Embedding for source tokens
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)  # Embedding for target tokens
        # Learned positional encoding, initialised to zeros and shared by encoder and decoder
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_length, d_model))

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])  # Encoder layers
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])  # Decoder layers

        self.fc_out = nn.Linear(d_model, tgt_vocab_size)  # Final linear layer
        self.dropout = nn.Dropout(dropout)  # Dropout layer

    def generate_mask(self, src, tgt):
        """
        Generate boolean masks for source and target sequences.

        True marks a position that may be attended to. For (batch, seq_len)
        inputs the source mask has shape (batch, 1, 1, src_len) and the target
        mask has shape (batch, 1, tgt_len, tgt_len).
        Args:
            src (torch.Tensor): Source sequence of token indices.
            tgt (torch.Tensor): Target sequence of token indices.
        Returns:
            tuple: (source mask, target mask)
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  # Mask for padding tokens in source
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)  # Mask for padding tokens in target
        seq_length = tgt.size(1)
        # Build the look-ahead mask on the same device as tgt; a CPU mask cannot be
        # combined with a mask that lives on an accelerator.
        upper = torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)
        nopeak_mask = (1 - upper).bool()  # True on and below the diagonal: no access to future tokens
        tgt_mask = tgt_mask & nopeak_mask  # Combine padding and future masks
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """
        Forward pass of the Transformer model.
        Args:
            src (torch.Tensor): Source sequence of token indices.
            tgt (torch.Tensor): Target sequence of token indices.
        Returns:
            torch.Tensor: Output of the final linear layer (last dimension is tgt_vocab_size).
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)  # Generate masks

        # Token embedding plus the first seq_len rows of the positional parameter, then dropout
        src_embedded = self.dropout(self.encoder_embedding(src) + self.positional_encoding[:, :src.size(1), :])
        tgt_embedded = self.dropout(self.decoder_embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :])

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)  # Pass through encoder layers

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)  # Pass through decoder layers

        output = self.fc_out(dec_output)  # Final linear layer
        return output

# Hyperparameters
vocab_size = 5000  # Nominal vocabulary size (not used below: the model is sized from the built vocabularies)
seq_length = 20  # Maximum sequence length
batch_size = 32  # Batch size
d_model = 512  # Model dimension
num_heads = 8  # Number of attention heads
num_layers = 6  # Number of encoder/decoder layers
d_ff = 2048  # Feed-forward dimension
dropout = 0.1  # Dropout rate
num_epochs = 10  # Number of epochs
learning_rate = 0.0001  # Learning rate
freq_threshold = 1  # Minimum token count to enter a vocabulary; 1 keeps every word of the tiny sample corpus (raise it for real data)
seed = 42  # RNG seed for weight initialisation, dropout and batch shuffling

# Seed before building the model and the shuffling DataLoader so reruns are reproducible
torch.manual_seed(seed)

# Sample data (replace with your dataset)
en_texts = ["I love programming.", "This is a test.", "How are you?"]  # English sentences
fr_texts = ["J'adore la programmation.", "C'est un test.", "Comment ça va?"]  # French sentences

# Build vocabularies
# With the class default threshold of 2, nearly every word of a three-sentence corpus
# would collapse to <UNK>, so the threshold is passed explicitly.
en_vocab = Vocabulary(freq_threshold=freq_threshold)  # English vocabulary
fr_vocab = Vocabulary(freq_threshold=freq_threshold)  # French vocabulary
en_vocab.build_vocab([tokenize_en(text) for text in en_texts])  # Build English vocabulary
fr_vocab.build_vocab([tokenize_fr(text) for text in fr_texts])  # Build French vocabulary

# Create dataset and data loader
dataset = TranslationDataset(en_texts, fr_texts, en_vocab, fr_vocab, seq_length)  # Create dataset
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)  # Create data loader

# Initialize the Transformer model
transformer = Transformer(len(en_vocab), len(fr_vocab), d_model, num_heads, num_layers, d_ff, seq_length, dropout)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Loss function (ignore padding tokens)
optimizer = torch.optim.Adam(transformer.parameters(), lr=learning_rate)  # Optimizer

# Training loop (teacher forcing: the decoder sees tgt[:, :-1] and is scored against tgt[:, 1:])
for epoch in range(num_epochs):
    transformer.train()  # Set model to training mode
    total_loss = 0

    for batch_idx, (src, tgt) in enumerate(data_loader):
        # Forward pass
        output = transformer(src, tgt[:, :-1])  # Exclude the last token in the target sequence
        # Flatten (batch, positions, vocab) logits and the shifted targets for the loss
        loss = criterion(output.reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))  # Compute loss

        # Backward pass and optimization
        optimizer.zero_grad()  # Clear gradients
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

        total_loss += loss.item()  # Accumulate loss

        if (batch_idx + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(data_loader)}], Loss: {loss.item():.4f}")

    avg_loss = total_loss / max(len(data_loader), 1)  # Average loss per batch (guarded against an empty loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}")

torch.Size([32, 100, 5000])


In [3]:
!pip install torch spacy
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------- -------------------------------- 2.4/12.8 MB 13.4 MB/s eta 0:00:01
     ------------------- -------------------- 6.3/12.8 MB 16.1 MB/s eta 0:00:01
     ------------------------------- ------- 10.5/12.8 MB 16.4 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 16.8 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 13.8 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.