In [None]:
import torch
# Quick check of whether PyTorch can see a CUDA device; as the cell's last
# expression, the boolean result is what the notebook displays.
torch.cuda.is_available()

In [8]:
import json
import os
import torch.nn as nn
import torch
import torch.nn.functional as F
import math,copy,re
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from torch.utils.data import Dataset, DataLoader
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    For position ``pos`` and even feature index ``i`` the table holds
    ``sin(pos / 10000 ** (i / d_model))`` at column ``i`` and the matching
    cosine at column ``i + 1`` (omitted for the last column when ``d_model``
    is odd).

    Args:
        d_model: size of the feature (last) dimension of the embeddings.
        max_seq_len: number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_len):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model

        # Build the whole table with tensor ops instead of a Python double loop.
        # float64 is used for the angles to stay close to the values the
        # original math.sin/math.cos loop produced before the float32 store.
        positions = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float64)
        angles = positions / torch.pow(10000.0, even_dims / d_model)

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles).to(pe.dtype)
        # With an odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2]).to(pe.dtype)

        # Registered as a buffer: moves with the module (.to/.cuda) and is saved
        # in the state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: tensor whose dimension 1 is the sequence position and whose last
                dimension has size ``d_model``.

        Raises:
            ValueError: if the sequence is longer than ``max_seq_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {self.pe.size(1)}"
            )
        # The buffer is already a tensor on the module's device; wrapping it in
        # the legacy torch.Tensor(...) constructor is unnecessary.
        return x + self.pe[:, :seq_len]

In [None]:
#d_model = 512, num_heads = 8
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The model dimension is divided evenly across ``num_heads`` heads of width
    ``d_k = d_model // num_heads``. Each projection is a single
    ``(d_model, d_model)`` linear layer whose output is later cut into
    per-head slices.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # One projection each for queries, keys, values and the merged output.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend over ``V`` with weights ``softmax(Q K^T / sqrt(d_k))``.

        Positions where ``mask == 0`` have their score replaced by ``-1e9``
        before the softmax, so they receive (numerically) zero weight.
        """
        scale = math.sqrt(self.d_k)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = scores.softmax(dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, heads, seq, d_k)``."""
        batch, seq, _ = x.size()
        per_head = x.view(batch, seq, self.num_heads, self.d_k)
        return per_head.permute(0, 2, 1, 3)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: back to ``(batch, seq, d_model)``."""
        batch, _, seq, _ = x.size()
        merged = x.permute(0, 2, 1, 3).contiguous()
        return merged.view(batch, seq, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, then merge and project the result."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        attended = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(attended))

In [None]:
#two linear transformations and a ReLU activation
#d_ff = 2048
class PositionWiseFeedForward(nn.Module):
    """Two linear layers with a ReLU in between.

    Expands the last dimension from ``d_model`` to ``d_ff`` and projects it
    back to ``d_model``.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expansion
        self.fc2 = nn.Linear(d_ff, d_model)   # projection back
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute ``fc2(relu(fc1(x)))``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer's output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run ``x`` through the block; ``mask`` is handed to the self-attention."""
        # Sub-layer 1: self-attention (queries, keys and values are all x).
        attended = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is followed by dropout, a residual addition
    and its own layer normalisation.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Decode ``x`` against ``enc_output``.

        ``tgt_mask`` is passed to the self-attention and ``src_mask`` to the
        cross-attention, whose keys and values are both ``enc_output``.
        """
        # Sub-layer 1: self-attention over the decoder input.
        self_attended = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))

        # Sub-layer 2: queries from the decoder, keys/values from the encoder.
        cross_attended = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward network.
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer over integer token ids.

    Token id 0 is treated as padding when the attention masks are built.

    Args:
        src_vocab_size: number of rows in the source embedding table.
        tgt_vocab_size: number of rows in the target embedding table and size
            of the output layer.
        d_model: embedding / hidden width.
        num_heads: attention heads per attention module.
        num_layers: number of encoder layers and of decoder layers.
        d_ff: hidden width of the position-wise feed-forward networks.
        max_seq_length: number of positions precomputed by the positional encoding.
        dropout: dropout probability used throughout.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)  # source token embeddings
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)  # target token embeddings
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)  # projects decoder states to vocabulary scores
        self.dropout = nn.Dropout(dropout)  # applied after the positional encoding

    def generate_mask(self, src, tgt):
        """Build boolean attention masks from the token ids.

        Returns:
            ``src_mask`` of shape ``(batch, 1, 1, src_len)``, False at source
            padding positions, and ``tgt_mask`` of shape
            ``(batch, 1, tgt_len, tgt_len)``, which combines the target padding
            mask (applied per row) with a lower-triangular no-look-ahead mask.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Create the causal mask on the same device as the inputs instead of
        # relying on a notebook-global `device` variable.
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Encode ``src``, decode ``tgt`` against it and score the vocabulary.

        Returns:
            Unnormalised logits of shape ``(batch, tgt_len, tgt_vocab_size)``.
            No softmax is applied here: ``nn.CrossEntropyLoss`` expects raw
            logits and applies log-softmax itself, so returning probabilities
            would apply softmax twice during training. Use
            ``torch.softmax(output, dim=-1)`` when probabilities are needed.
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded  # run through the stack of encoder layers
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded  # run through the stack of decoder layers
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc(dec_output)

In [4]:
def create_tokenizer(file_path, lang, vocab_size=20000):
    """Train a WordPiece tokenizer on one text column of a CSV file.

    Args:
        file_path: path of the CSV file to read.
        lang: name of the column whose text is used for training.
        vocab_size: vocabulary size passed to the WordPiece trainer.

    Returns:
        The trained ``Tokenizer``. The special tokens are registered in the
        order ``[PAD]``, ``[UNK]``, ``[BOS]``, ``[EOS]``.
    """
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]"]
    )

    def batch_iterator(batch_size=1000):
        # Stream the file in chunks so the full corpus is never held in memory.
        # The context manager closes the underlying file once iteration ends.
        with pd.read_csv(file_path, chunksize=batch_size, on_bad_lines="skip", encoding="utf-8", lineterminator='\n') as reader:
            for chunk in reader:
                # Drop missing values in the requested column only: a row whose
                # other column is empty is still valid training text here.
                # astype(str) guards against cells pandas parsed as numbers.
                yield chunk[lang].dropna().astype(str).tolist()

    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
    return tokenizer

In [9]:
def get_or_create_tokenizers(filepath, base_path="tokenizers"):
    """Return the ``(source, target)`` tokenizers, training them if not cached.

    Each tokenizer is stored as a JSON file under ``base_path``. A file that
    already exists is loaded; otherwise the tokenizer is trained from
    ``filepath`` (source on the ``"de"`` column, target on the ``"en"``
    column) and saved for later runs.
    """
    os.makedirs(base_path, exist_ok=True)
    src_path = os.path.join(base_path, "src_tokenizer.json")
    tgt_path = os.path.join(base_path, "tgt_tokenizer.json")

    # Source tokenizer.
    if not os.path.exists(src_path):
        print(f"Creating new source tokenizer and saving to {src_path}")
        src_tokenizer = create_tokenizer(filepath, "de")
        src_tokenizer.save(src_path)
    else:
        print(f"Loading existing source tokenizer from {src_path}")
        src_tokenizer = Tokenizer.from_file(src_path)

    # Target tokenizer.
    if not os.path.exists(tgt_path):
        print(f"Creating new target tokenizer and saving to {tgt_path}")
        tgt_tokenizer = create_tokenizer(filepath, "en")
        tgt_tokenizer.save(tgt_path)
    else:
        print(f"Loading existing target tokenizer from {tgt_path}")
        tgt_tokenizer = Tokenizer.from_file(tgt_path)

    return src_tokenizer, tgt_tokenizer

In [10]:
# Load the cached tokenizers from the default "tokenizers" directory, or train
# them from this CSV and cache them when the JSON files do not exist yet.
src_tokenizer, tgt_tokenizer = get_or_create_tokenizers("wmt14_translate_de-en_train.csv")

Creating new source tokenizer and saving to tokenizers\src_tokenizer.json
Creating new target tokenizer and saving to tokenizers\tgt_tokenizer.json


In [None]:
# Read only the first 10 rows of the training CSV and print the head of the
# frame to inspect its column layout without loading the whole file.
df = pd.read_csv("wmt14_translate_de-en_train.csv", nrows=10, encoding="utf-8")
print(df.head())

In [None]:
class TranslationDataset(Dataset):
    """Turns flat ``{'de': ..., 'en': ...}`` records into padded id tensors.

    German (``'de'``) is the source and English (``'en'``) the target, matching
    ``get_or_create_tokenizers``, which trains the source tokenizer on the
    ``"de"`` column and the target tokenizer on the ``"en"`` column.

    NOTE: another cell in this notebook defines a class with the same name
    (for records nested under ``'translation'``); whichever cell ran last is
    the definition in effect.

    Args:
        data: indexable collection; ``data[idx]`` must support ``['de']`` and ``['en']``.
        src_tokenizer: tokenizer applied to the ``'de'`` text.
        tgt_tokenizer: tokenizer applied to the ``'en'`` text.
        max_length: exact length of every returned sequence.
    """

    def __init__(self, data, src_tokenizer, tgt_tokenizer, max_length=64):
        self.data = data
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(src_ids, tgt_ids)`` tensors, each of length ``max_length``."""
        item = self.data[idx]
        src_tokens = self._encode(item['de'], self.src_tokenizer)
        tgt_tokens = self._encode(item['en'], self.tgt_tokenizer)
        return torch.tensor(src_tokens), torch.tensor(tgt_tokens)

    def _encode(self, text, tokenizer):
        """Tokenize ``text``, wrap it in [BOS]/[EOS] and pad or truncate it."""
        bos_id = tokenizer.token_to_id("[BOS]")
        eos_id = tokenizer.token_to_id("[EOS]")
        tokens = [bos_id] + tokenizer.encode(text).ids + [eos_id]
        return self._pad_sequence(tokens, tokenizer)

    def _pad_sequence(self, tokens, tokenizer=None):
        """Pad with [PAD] or truncate (keeping a final [EOS]) to ``max_length``.

        ``tokenizer`` supplies the special-token ids; it defaults to the source
        tokenizer so existing single-argument calls behave as before.
        """
        if tokenizer is None:
            tokenizer = self.src_tokenizer
        if len(tokens) < self.max_length:
            pad_id = tokenizer.token_to_id("[PAD]")
            return tokens + [pad_id] * (self.max_length - len(tokens))
        # Too long (or exactly full): cut and make sure the last id is [EOS].
        return tokens[:self.max_length - 1] + [tokenizer.token_to_id("[EOS]")]

In [None]:
class LanguageDataset(Dataset):
    """Reads single rows of a CSV file on demand.

    Args:
        file_path: path of a CSV file whose first line is a header.
        transform: optional callable applied to each row before it is returned.

    Note:
        The length is the number of physical lines minus the header and is
        computed once, then cached. A file with newlines inside quoted fields
        would be over-counted, and changes to the file after the first
        ``len()`` call are not picked up.
    """

    def __init__(self, file_path, transform=None):
        self.file_path = file_path
        self.transform = transform
        self._num_rows = None  # filled lazily by __len__

    def __getitem__(self, idx):
        """Return data row ``idx`` as a NumPy array (or its transform).

        Raises:
            IndexError: if ``idx`` is out of range, which also lets plain
                ``for row in dataset`` iteration terminate.
        """
        num_rows = len(self)
        idx = int(idx)
        if idx < 0:
            idx += num_rows
        if not 0 <= idx < num_rows:
            raise IndexError(f"index {idx} out of range for dataset of length {num_rows}")

        # Skip the header and the idx rows before the target and read the row
        # without a header, so no data row is ever parsed as column names.
        chunk = pd.read_csv(self.file_path, header=None, skiprows=idx + 1, nrows=1)
        data = chunk.values[0]
        if self.transform:
            data = self.transform(data)
        return data

    def __len__(self):
        if getattr(self, "_num_rows", None) is None:
            # Count in binary mode (no decoding errors) and close the handle;
            # the result is cached because samplers call len() repeatedly.
            with open(self.file_path, "rb") as handle:
                self._num_rows = max(sum(1 for _ in handle) - 1, 0)
        return self._num_rows

In [2]:
class TranslationDataset(Dataset):
    """Turns ``{'translation': {'de': ..., 'en': ...}}`` records into id tensors.

    German (``'de'``) is the source and English (``'en'``) the target, matching
    ``get_or_create_tokenizers``, which trains the source tokenizer on the
    ``"de"`` column and the target tokenizer on the ``"en"`` column.

    NOTE: another cell in this notebook defines a class with the same name
    (for flat ``'de'``/``'en'`` records); whichever cell ran last is the
    definition in effect.

    Args:
        data: indexable collection; ``data[idx]['translation']`` must support
            ``['de']`` and ``['en']``.
        src_tokenizer: tokenizer applied to the ``'de'`` text.
        tgt_tokenizer: tokenizer applied to the ``'en'`` text.
        max_length: exact length of every returned sequence.
    """

    def __init__(self, data, src_tokenizer, tgt_tokenizer, max_length=64):
        self.data = data
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(src_ids, tgt_ids)`` tensors, each of length ``max_length``."""
        pair = self.data[idx]['translation']
        src_tokens = self._encode(pair['de'], self.src_tokenizer)
        tgt_tokens = self._encode(pair['en'], self.tgt_tokenizer)
        return torch.tensor(src_tokens), torch.tensor(tgt_tokens)

    def _encode(self, text, tokenizer):
        """Tokenize ``text``, wrap it in [BOS]/[EOS] and pad or truncate it."""
        bos_id = tokenizer.token_to_id("[BOS]")
        eos_id = tokenizer.token_to_id("[EOS]")
        tokens = [bos_id] + tokenizer.encode(text).ids + [eos_id]
        return self._pad_sequence(tokens, tokenizer)

    def _pad_sequence(self, tokens, tokenizer=None):
        """Pad with [PAD] or truncate (keeping a final [EOS]) to ``max_length``.

        ``tokenizer`` supplies the special-token ids; it defaults to the source
        tokenizer so existing single-argument calls behave as before.
        """
        if tokenizer is None:
            tokenizer = self.src_tokenizer
        if len(tokens) < self.max_length:
            pad_id = tokenizer.token_to_id("[PAD]")
            return tokens + [pad_id] * (self.max_length - len(tokens))
        # Too long (or exactly full): cut and make sure the last id is [EOS].
        return tokens[:self.max_length - 1] + [tokenizer.token_to_id("[EOS]")]

In [None]:
# Hyper-parameters for the smoke-test model.
src_vocab_size, tgt_vocab_size = 5000, 5000
d_model, num_heads, num_layers = 512, 8, 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

# Build the model, then move it to the selected device.
transformer = Transformer(
    src_vocab_size,
    tgt_vocab_size,
    d_model,
    num_heads,
    num_layers,
    d_ff,
    max_seq_length,
    dropout,
)
transformer = transformer.to(device)

# Random token ids of shape (batch_size=64, max_seq_length). Ids start at 1,
# so id 0 (the value the model masks as padding) never appears in this data.
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length)).to(device)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length)).to(device)

In [None]:
# Sanity check: show which device the model's first parameter is stored on.
print(next(transformer.parameters()).device)

In [None]:
# Targets equal to 0 do not contribute to the loss. nn.CrossEntropyLoss
# applies log-softmax internally, i.e. it expects unnormalised scores.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

transformer.train()

# Each "epoch" is one optimisation step on the same fixed random batch.
for epoch in range(100):
    optimizer.zero_grad()

    # Teacher forcing: the decoder sees tokens [0, T-1) and is scored against
    # tokens [1, T), i.e. the target sequence shifted left by one.
    output = transformer(src_data, tgt_data[:, :-1])
    flat_output = output.reshape(-1, tgt_vocab_size)
    flat_target = tgt_data[:, 1:].reshape(-1)

    loss = criterion(flat_output, flat_target)
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")