In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout)
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 512, d_model))
        self.generator = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        src_emb = self.src_tok_emb(src) + self.positional_encoding[:,:src.size(1),:]
        tgt_emb = self.tgt_tok_emb(tgt) + self.positional_encoding[:,:tgt.size(1),:]
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

# Example of model instantiation
# Assume src_vocab_size and tgt_vocab_size are predefined with the sizes of the English and German vocabularies
src_vocab_size = 10000  # Example vocab size
tgt_vocab_size = 10000  # Example vocab size
model = TransformerModel(src_vocab_size, tgt_vocab_size)

# Move model to GPU if available
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')
model.to(device)


TransformerModel(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerDecoderLayer(
          (self_attn): MultiheadAttent

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

def train(model, data_loader, optimizer, criterion, num_epochs=10):
    model.train()  # Set the model to training mode
    for epoch in range(num_epochs):
        total_loss = 0
        for src, tgt in data_loader:
            src = src.to(device)
            tgt_input = tgt[:, :-1].to(device)
            tgt_output = tgt[:, 1:].to(device)

            # Create masks and padding masks for the src and target
            src_mask, tgt_mask = generate_square_subsequent_mask(src.size(1)).to(device), generate_square_subsequent_mask(tgt_input.size(1)).to(device)
            src_padding_mask = (src == 0).transpose(0, 1).to(device)
            tgt_padding_mask = (tgt_input == 0).transpose(0, 1).to(device)
            memory_key_padding_mask = src_padding_mask.clone()

            optimizer.zero_grad()  # Clear previous gradients
            output = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
            loss = criterion(output.view(-1, output.size(-1)), tgt_output.reshape(-1))
            loss.backward()  # Backpropagate the loss
            optimizer.step()  # Update the model parameters
            total_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {total_loss/len(data_loader)}')

# Example usage:
batch_size = 64
src_vocab_size = 10000  # Example vocab size
tgt_vocab_size = 10000  # Example vocab size
model = TransformerModel(src_vocab_size, tgt_vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming '0' is the padding index

# Dummy data for demonstration: actual data should be tensors of token indices
# These should be replaced with actual data loading part
src_data = torch.randint(1, src_vocab_size, (1000, 35))  # (batch_size, sequence_length)
tgt_data = torch.randint(1, tgt_vocab_size, (1000, 36))  # (batch_size, sequence_length)
dataset = TensorDataset(src_data, tgt_data)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

num_epochs = 10  # Number of epochs to train
train(model, data_loader, optimizer, criterion, num_epochs)


  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: The shape of the 2D attn_mask is torch.Size([35, 35]), but should be (64, 64).

In [4]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import random
from collections import Counter
from itertools import chain

# Assuming you have a file with parallel sentences
parallel_data = ["il est en train de peindre un tableau . = he is painting a picture", 
                 "elle lit un livre . = she is reading a book"]  # Extend this list with your data

class Vocabulary:
    def __init__(self, freq_threshold=2):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {v: k for k, v in self.itos.items()}
        self.freq_threshold = freq_threshold

    def build_vocabulary(self, sentence_list):
        frequencies = Counter(chain(*sentence_list))
        self.itos.update({word: idx for idx, (word, freq) in enumerate(frequencies.items(), len(self.itos)) if freq >= self.freq_threshold})
        self.stoi = {v: k for k, v in self.itos.items()}

    def numericalize(self, text):
        return [self.stoi.get(token, self.stoi["<UNK>"]) for token in text]

def tokenize_english(text):
    return [tok.lower() for tok in text.split()]

def tokenize_french(text):
    return [tok.lower() for tok in text.split()]

class ParallelDataset(Dataset):
    def __init__(self, data, src_vocab, tgt_vocab):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src_sentence, tgt_sentence = self.data[idx].split(" = ")
        src = [self.src_vocab.stoi["<SOS>"]] + self.src_vocab.numericalize(tokenize_french(src_sentence)) + [self.src_vocab.stoi["<EOS>"]]
        tgt = [self.tgt_vocab.stoi["<SOS>"]] + self.tgt_vocab.numericalize(tokenize_english(tgt_sentence)) + [self.tgt_vocab.stoi["<EOS>"]]
        return torch.tensor(src), torch.tensor(tgt)

# Build vocabularies
src_vocab = Vocabulary()
tgt_vocab = Vocabulary()
src_vocab.build_vocabulary([tokenize_french(sentence.split(" = ")[0]) for sentence in parallel_data])
tgt_vocab.build_vocabulary([tokenize_english(sentence.split(" = ")[1]) for sentence in parallel_data])

# Create dataset
dataset = ParallelDataset(parallel_data, src_vocab, tgt_vocab)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# For demonstration, let's print one batch
for src, tgt in data_loader:
    print("Source Batch:", src)
    print("Target Batch:", tgt)
    break



RuntimeError: stack expects each tensor to be equal size, but got [11] at entry 0 and [7] at entry 1