In [58]:
import torch 
import torch.nn as nn
import numpy as np
import pandas

In [59]:
if torch.cuda.is_available():
    device = "cuda"
    print(torch.cuda.device_count())
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device

1


'cuda'

In [60]:
data = []
filename = "/kaggle/input/eng-spa/spa.txt"
with open(filename, "r") as f:
    for line in f:
        data.append(line.strip())

In [61]:
cleaned_data = [line.replace("¡", "").replace("¿", "") for line in data]
pairs = [line.split("\t") for line in cleaned_data]

In [62]:
np.random.shuffle(pairs)
eng_sentences, es_sentences = zip(*pairs)
for i in range(3):
    print(eng_sentences[i], "==>" ,es_sentences[i])

Tom might have left his umbrella in Mary's car. ==> Puede que a Tom se le haya quedado el paraguas en el auto de Mary.
We believe that Tom killed Mary with an ice pick. ==> Creemos que Tom mató a Mary con un picahielos.
I am disappointed. ==> Estoy decepcionado.


In [63]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [64]:
max_len = 100
def encode_with_gpt2(sentence, add_sos_and_eos=False):
    
    texts = [f"<s> {s} </s>" if add_sos_and_eos else s for s in sentence]
    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )
    return encodings


In [65]:
from torch.utils.data import DataLoader, Dataset

class TranslationDataset(Dataset):
    def __init__(self, src_sentences, tgt_sentences, tokenizer, max_len=500):
        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self,idx):
        src = self.src_sentences[idx]
        tgt = f"<s> {self.tgt_sentences[idx]} </s>"

        src_enc = self.tokenizer(
            src,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt")
        tgt_enc = self.tokenizer(
            tgt,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt")

        decoder_input_ids = tgt_enc["input_ids"][:,:-1].squeeze(0)
        labels = tgt_enc["input_ids"][:,1:].squeeze(0)

        return {
            "encoder_input_ids":src_enc["input_ids"].squeeze(0),
            "encoder_attention_mask":src_enc["attention_mask"].squeeze(0),
            "decoder_input_ids":decoder_input_ids,
            "labels":labels
        } 

In [66]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
    def __init__(self, pretrained_embed, n_hidden=2, hidden_size=128,dropout=0.2):
        super().__init__()
        weights = pretrained_embed.weight.data
        self.embed = nn.Embedding.from_pretrained(weights,freeze=True)
        embed_size = weights.shape[-1]
        self.gru = nn.GRU(embed_size, hidden_size, num_layers=n_hidden,
                         batch_first=True, dropout=dropout, bidirectional=True)

    def forward(self, input_ids, attention_mask):
        embeddings = self.embed(input_ids)
        lengths = attention_mask.sum(dim=1)
        packed = pack_padded_sequence(embeddings,
                                     lengths = lengths.cpu(),
                                     batch_first=True,
                                     enforce_sorted=False)
        outputs, hidden = self.gru(packed)
        outputs, _ = pad_packed_sequence(outputs, batch_first=True, total_length=input_ids.size(1))
        return outputs, hidden    

In [67]:
class Decoder(nn.Module):
    def __init__(self, pretrained_embed, n_hidden=2, hidden_size=128):
        super().__init__()
        weights = pretrained_embed.weight.data
        self.embed = nn.Embedding.from_pretrained(weights,freeze=True)
        embed_size = weights.shape[-1]
        self.gru = nn.GRU(embed_size, hidden_size*2,num_layers=n_hidden,
                          batch_first=True)
        self.output = nn.Linear(hidden_size*2, weights.shape[0])

    def forward(self, input_ids, hidden):
        embeddings = self.embed(input_ids)
        outputs, hidden = self.gru(embeddings, hidden)
        logits = self.output(outputs)
        return logits, hidden       

In [68]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()   
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src_ids, src_mask, tgt_ids):
        enc_outputs, enc_hidden = self.encoder(src_ids, src_mask)
        logits, _ = self.decoder(tgt_ids, enc_hidden)
        return logits

In [69]:
import transformers

gpt_model = transformers.AutoModel.from_pretrained("gpt2")
vocab_size = gpt_model.get_input_embeddings().weight.data.shape[0]
vocab_size

50257

In [70]:
from sklearn.model_selection import train_test_split

eng_train, eng_valid, es_train, es_valid = train_test_split(
    eng_sentences, es_sentences, test_size = 0.20
)

batch_size = 128

train_dataset = TranslationDataset(eng_train, es_train, tokenizer)
valid_dataset = TranslationDataset(eng_valid, es_valid, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

In [71]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
    model.eval()
    metric.reset()
    with torch.no_grad():
        for batch in data_loader:
            src_ids = batch["encoder_input_ids"].to(device)
            src_mask = batch["encoder_attention_mask"].to(device)
            tgt_ids = batch["decoder_input_ids"].to(device)
            labels = batch["labels"].to(device)

            y_pred = model(src_ids, src_mask, tgt_ids)
            metric.update(y_pred.view(-1,vocab_size), labels.view(-1))
    return metric.compute
            
def train(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs):
    history = {"train_losses":[],"train_metrics":[],"valid_metrics":[]}
    for epoch in range(n_epochs):
        total_loss = 0
        metric.reset()
        model.train()
        for idx, batch in enumerate(train_loader):
            src_ids = batch["encoder_input_ids"].to(device)
            src_mask = batch["encoder_attention_mask"].to(device)
            tgt_ids = batch["decoder_input_ids"].to(device)
            labels = batch["labels"].to(device)

            y_pred = model(src_ids, src_mask, tgt_ids)
            loss = criterion(y_pred.view(-1,vocab_size), labels.view(-1))
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric.update(y_pred.view(-1,vocab_size), labels.view(-1))
            print(f"\rBatch {idx+1}/{len(train_loader)}", end="")
            print(f", loss ={total_loss/(idx+1 ):.4f} ", end="")
        mean_loss = total_loss / len(train_loader)
        history["train_losses"].append(mean_loss)
        history["train_metrics"].append(metric.compute().item())
        val_metric = evaluate_tm(model, valid_loader, metric).item()
        history["valid_metrics"].append(val_metric)
        print(f"Epoch:{epoch+1}/{n_epochs}, "
             f"Train Loss: {history['train_losses'][-1]:.4f}, "
             f"Train Metric: {history['train_metrics'][-1]:.4f}%, "
             f"Valid Metric: {history['valid_metrics'][-1]:.4f}%")
    return history

In [72]:
encoder = Encoder(gpt_model.get_input_embeddings())
decoder = Decoder(gpt_model.get_input_embeddings())

nmt_model = Seq2Seq(encoder, decoder).to(device)

optimizer = torch.optim.NAdam(nmt_model.parameters())
xentropy = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
accuracy = torchmetrics.Accuracy(task="multiclass",num_classes = vocab_size)
n_epochs=20

history = train(nmt_model, optimizer, xentropy, accuracy, train_loader, valid_loader, n_epochs)

RuntimeError: Expected hidden size (2, 128, 256), got [4, 128, 128]