In [1]:
import torch
import torch.nn as nn
import polars as pl
from collections import defaultdict

In [2]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
# The models below are moved to this device right after construction.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
# Settings for the LSTM encoder-decoder ("seq2seq") model.
# EMB_DIM, HID_DIM, N_LAYERS and DROPOUT are passed to Encoder/Decoder below, and the
# resulting shapes have to agree with the checkpoint that is loaded afterwards.
# BATCH_SIZE, LEARNING_RATE and EPOCHS are not referenced in the visible cells —
# presumably left over from the training notebook; TODO confirm.
seq2seq_BATCH_SIZE = 64
seq2seq_EMB_DIM = 128
# NOTE(review): 258 looks like a typo for 256, but the checkpoint loads cleanly with
# this value, so changing it would break load_state_dict.
seq2seq_HID_DIM = 258
seq2seq_N_LAYERS = 2
seq2seq_DROPOUT = 0.5
seq2seq_LEARNING_RATE = 0.001
seq2seq_EPOCHS = 10
# Fraction of the dataset sampled for this model, and the seed for that sampling.
# The vocabularies are rebuilt from the sample, so both values affect token ids.
seq2seq_FRAC = 0.2
seq2seq_SEED = 420

In [4]:
# Settings for the Transformer model.
# EMB_DIM, N_LAYERS, N_HEADS, FF_DIM and DROPOUT are passed to Transformer below and
# have to agree with the checkpoint that is loaded afterwards.
# BATCH_SIZE, LEARNING_RATE and EPOCHS are not referenced in the visible cells —
# presumably left over from the training notebook; TODO confirm.
transformer_BATCH_SIZE = 32
transformer_EMB_DIM = 128
# NOTE(review): HID_DIM is not used by the Transformer constructor call in this notebook.
transformer_HID_DIM = 258
transformer_N_LAYERS = 2
transformer_N_HEADS = 8
transformer_FF_DIM = 512
transformer_DROPOUT = 0.1
transformer_LEARNING_RATE = 0.001
transformer_EPOCHS = 10
# Fraction of the dataset sampled for this model, and the seed for that sampling.
# The vocabularies are rebuilt from the sample, so both values affect token ids.
transformer_FRAC = 0.5
transformer_SEED = 420

In [5]:
# Load the preprocessed parallel corpus: one row per sentence pair, with the "it" and
# "en" columns each holding a list of string tokens (see the preview below).
kaggle_path= "/kaggle/input/it-en-translation/processed.parquet"
loaded_data = pl.read_parquet(kaggle_path)
print("Data loaded successfully")
loaded_data.head()

Data loaded successfully


it,en
list[str],list[str]
"[""grazie"", ""amico""]","[""thank"", ""buddy""]"
"[""di il""]","[""say""]"
"[""trifosfare"", ""sodio"", … ""sodio""]","[""sodium"", ""triphosphate"", … ""tripolyphosphate""]"
"[""invero"", ""avidare"", … ""ricchezzo""]","[""surely"", ""ardent"", … ""wealth""]"
"[""allegare""]","[""annex""]"


In [6]:
# Seeded sample of the corpus used to rebuild the Transformer's vocabularies.
# NOTE(review): token ids come from this sample, so fraction and seed presumably have
# to match what was used when the checkpoint was trained — confirm.
transformer_input_data = loaded_data.sample(fraction = transformer_FRAC, seed = transformer_SEED)

# describe() only yields count/null_count here because both columns are lists.
transformer_input_data.describe()

statistic,it,en
str,f64,f64
"""count""",480021.0,480021.0
"""null_count""",0.0,0.0
"""mean""",,
"""std""",,
"""min""",,
"""25%""",,
"""50%""",,
"""75%""",,
"""max""",,


In [7]:
# Seeded sample of the corpus used to rebuild the seq2seq model's vocabularies.
# NOTE(review): token ids come from this sample, so fraction and seed presumably have
# to match what was used when the checkpoint was trained — confirm.
seq2seq_input_data = loaded_data.sample(fraction = seq2seq_FRAC, seed = seq2seq_SEED)

# describe() only yields count/null_count here because both columns are lists.
seq2seq_input_data.describe()

statistic,it,en
str,f64,f64
"""count""",192008.0,192008.0
"""null_count""",0.0,0.0
"""mean""",,
"""std""",,
"""min""",,
"""25%""",,
"""50%""",,
"""75%""",,
"""max""",,


In [8]:
def build_vocab(tokens):
    """Map every distinct token to an integer id, in order of first appearance.

    The four special tokens always take the first ids:
    ``<pad>`` = 0, ``<unk>`` = 1, ``<eos>`` = 2, ``<bos>`` = 3.

    Args:
        tokens: Iterable of token sequences (one sequence per sentence).

    Returns:
        A plain ``dict`` from token to id.
    """
    index_of = {}

    # Special tokens go in first so their ids are fixed regardless of the data.
    for special in ("<pad>", "<unk>", "<eos>", "<bos>"):
        index_of.setdefault(special, len(index_of))

    # Each unseen token receives the next free id; repeats keep their first id.
    for sentence in tokens:
        for word in sentence:
            if word not in index_of:
                index_of[word] = len(index_of)

    return index_of

def preprocess_data(df):
    """Build both vocabularies and convert every sentence to a tensor of token ids.

    Args:
        df: Table-like object whose ``"it"`` and ``"en"`` columns iterate over
            token sequences.

    Returns:
        A 5-tuple ``(pairs, it_vocab_size, en_vocab_size, en_vocab, it_vocab)``
        where ``pairs`` is a list of ``(en_ids, it_ids)`` long tensors. Note the
        order: sizes are Italian-first, vocabularies are English-first.
    """

    def _encode(column, vocab):
        # One 1-D long tensor per sentence, using the ids from `vocab`.
        return [
            torch.tensor([vocab[word] for word in sentence], dtype=torch.long)
            for sentence in column
        ]

    it_vocab = build_vocab(df["it"])
    en_vocab = build_vocab(df["en"])

    it_indices = _encode(df["it"], it_vocab)
    en_indices = _encode(df["en"], en_vocab)

    # English is the source side, so it comes first in each pair.
    pairs = list(zip(en_indices, it_indices))
    return pairs, len(it_vocab), len(en_vocab), en_vocab, it_vocab

In [9]:
# Rebuild the seq2seq vocabularies (and encoded pairs) from the 20% sample.
# The vocabulary sizes returned here determine the embedding/output sizes of the model.
seq2_seq_data_pairs, seq2seq_IT_VOCAB_SIZE, seq2seq_EN_VOCAB_SIZE, seq2seq_en_v, seq2seq_it_v = preprocess_data(seq2seq_input_data)

In [10]:
# Rebuild the Transformer vocabularies (and encoded pairs) from the 50% sample.
# The vocabulary sizes returned here determine the embedding/output sizes of the model.
transformer_data_pairs, transformer_IT_VOCAB_SIZE, transformer_EN_VOCAB_SIZE, transformer_en_v, transformer_it_v = preprocess_data(transformer_input_data)

In [11]:
class Encoder(nn.Module):
    """LSTM encoder that turns a batch of source token ids into a final LSTM state.

    Args:
        input_dim: Size of the source vocabulary.
        emb_dim: Embedding width.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        # Attribute names are part of the checkpoint's state_dict keys; keep them.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` (batch-first token ids) and return ``(hidden, cell)``."""
        token_vectors = self.dropout(self.embedding(src))
        # The per-step outputs are not needed; only the final state is passed on.
        _, final_state = self.rnn(token_vectors)
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that predicts scores over the target vocabulary.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Embedding width.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        # Attribute names are part of the checkpoint's state_dict keys; keep them.
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, trg, hidden, cell):
        """Run one decoding step.

        Args:
            trg: One token id per batch element.
            hidden: LSTM hidden state from the previous step (or the encoder).
            cell: LSTM cell state from the previous step (or the encoder).

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds one row of
            vocabulary scores per batch element.
        """
        # Insert a length-1 sequence axis so the batch-first LSTM sees one time step.
        step_tokens = trg.unsqueeze(1)
        step_vectors = self.dropout(self.embedding(step_tokens))
        step_output, next_state = self.rnn(step_vectors, (hidden, cell))
        hidden, cell = next_state
        # Drop the sequence axis again before projecting to vocabulary scores.
        prediction = self.fc_out(step_output.squeeze(1))
        return prediction, hidden, cell

In [12]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module returning ``(hidden, cell)`` for a source batch.
        decoder: Module mapping ``(token_ids, hidden, cell)`` to
            ``(scores, hidden, cell)`` and exposing ``fc_out.out_features``.
        device: Device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return vocabulary scores for every target position.

        Position 0 of the result is left at zero: decoding starts from
        ``trg[:, 0]`` and the first prediction is stored at position 1.

        Args:
            src: Batch-first source token ids.
            trg: Batch-first target token ids.
            teacher_forcing_ratio: Probability, drawn once per step for the whole
                batch, of feeding the ground-truth token instead of the model's
                own top prediction.
        """
        n_sequences = src.size(0)
        n_steps = trg.size(1)
        n_classes = self.decoder.fc_out.out_features

        scores = torch.zeros(n_sequences, n_steps, n_classes, device=self.device)

        hidden, cell = self.encoder(src)

        step_input = trg[:, 0]
        for step in range(1, n_steps):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            scores[:, step, :] = step_scores

            # One random draw per step decides the next input for the whole batch.
            use_ground_truth = torch.rand(1).item() < teacher_forcing_ratio
            if use_ground_truth:
                step_input = trg[:, step]
            else:
                step_input = step_scores.argmax(1)

        return scores

In [13]:
# Rebuild the LSTM encoder-decoder with the configured sizes. English is the source
# side (encoder vocabulary) and Italian the target side (decoder vocabulary).
seq2seq_encoder = Encoder(seq2seq_EN_VOCAB_SIZE, seq2seq_EMB_DIM, seq2seq_HID_DIM, seq2seq_N_LAYERS, seq2seq_DROPOUT)
seq2seq_decoder = Decoder(seq2seq_IT_VOCAB_SIZE, seq2seq_EMB_DIM, seq2seq_HID_DIM, seq2seq_N_LAYERS, seq2seq_DROPOUT)
seq2seq_model = Seq2Seq(seq2seq_encoder, seq2seq_decoder, device).to(device)

# map_location remaps the stored tensors onto the selected device, so a checkpoint
# saved from a GPU session still loads when the notebook falls back to the CPU.
seq2seq_model.load_state_dict(
    torch.load(
        "/kaggle/input/dec-enc-test/seq2seq_translation_model.pth",
        map_location=device,
        weights_only=True,
    )
)

<All keys matched successfully>

In [14]:
class Transformer(nn.Module):
    """Encoder-decoder translation model built around ``nn.Transformer``.

    Source and target token ids are embedded, summed with a fixed sinusoidal
    positional encoding, passed through ``nn.Transformer`` (batch-first) and
    projected to target-vocabulary scores.

    Args:
        src_vocab_size: Size of the source vocabulary (must match preprocessing).
        trg_vocab_size: Size of the target vocabulary (must match preprocessing).
        emb_dim: Embedding width, also used as the transformer's ``d_model``.
        n_heads: Number of attention heads.
        ff_dim: Width of the feed-forward sublayers.
        n_layers: Number of encoder layers and of decoder layers.
        dropout: Dropout probability inside the transformer.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, emb_dim, n_heads, ff_dim, n_layers, dropout):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, emb_dim)
        self.trg_embedding = nn.Embedding(trg_vocab_size, emb_dim)
        # Kept as a plain tensor attribute (not a parameter or buffer) so the
        # state_dict keys stay identical to the saved checkpoint. It therefore does
        # not follow .to(device) and is moved explicitly where it is used.
        self.positional_encoding = self._get_positional_encoding(emb_dim)

        self.transformer = nn.Transformer(
            d_model=emb_dim,
            nhead=n_heads,
            num_encoder_layers=n_layers,
            num_decoder_layers=n_layers,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(emb_dim, trg_vocab_size)

    def forward(self, src, trg):
        """Return target-vocabulary scores for every target position.

        Args:
            src: Batch-first source token ids.
            trg: Batch-first target token ids.

        Returns:
            Tensor with one score vector per (batch element, target position).
        """
        src_seq_len = src.size(1)
        src = self.src_embedding(src) + self.positional_encoding[:, :src_seq_len, :].to(src.device)
        trg_seq_len = trg.size(1)
        trg = self.trg_embedding(trg) + self.positional_encoding[:, :trg_seq_len, :].to(trg.device)

        # Fixed: these calls previously used `self._generate_square_subsequent_mask`,
        # a name that does not exist on this class, so forward() always raised
        # AttributeError. The method is defined without the leading underscore.
        # The causal mask on the source side is kept as-is because the inference
        # code applies the same mask to the encoder.
        src_mask = self.generate_square_subsequent_mask(src_seq_len).to(src.device)
        trg_mask = self.generate_square_subsequent_mask(trg_seq_len).to(trg.device)
        memory_mask = None

        output = self.transformer(
            src, trg, src_mask=src_mask, tgt_mask=trg_mask, memory_mask=memory_mask
        )
        output = self.fc_out(output)

        return output

    def generate_square_subsequent_mask(self, sz):
        """Return an ``sz`` x ``sz`` float mask with ``-inf`` strictly above the diagonal."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def _get_positional_encoding(self, emb_dim, max_len=5000):
        """Build the sinusoidal position table of shape ``(1, max_len, emb_dim)``.

        Even feature columns hold sines and odd columns hold cosines of the
        position scaled by geometrically decreasing frequencies.
        """
        pe = torch.zeros(1, max_len, emb_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_dim, 2).float() * -(torch.log(torch.tensor(10000.0)) / emb_dim))
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        return pe

In [15]:
# Rebuild the Transformer with the configured sizes. English is the source
# vocabulary and Italian the target vocabulary.
transformer_model = Transformer(transformer_EN_VOCAB_SIZE, transformer_IT_VOCAB_SIZE, transformer_EMB_DIM, transformer_N_HEADS, transformer_FF_DIM, transformer_N_LAYERS, transformer_DROPOUT).to(device)

# map_location remaps the stored tensors onto the selected device, so a checkpoint
# saved from a GPU session still loads when the notebook falls back to the CPU.
transformer_model.load_state_dict(
    torch.load(
        "/kaggle/input/transformer-translator/transformer_translation_model.pth",
        map_location=device,
        weights_only=True,
    )
)

<All keys matched successfully>

In [16]:
# Reverse lookup (token id -> Italian token) used to turn seq2seq predictions into text.
seq2seq_it_v_inv = {v: k for k, v in seq2seq_it_v.items()}

In [17]:
# Reverse lookup (token id -> Italian token) used to turn Transformer predictions into text.
transformer_it_v_inv = {v: k for k, v in transformer_it_v.items()}

In [18]:
def tokenize(sentence, vocab):
    """Convert a raw sentence into a 1-D long tensor of vocabulary ids.

    The sentence is lower-cased and split on whitespace only, so punctuation
    stays attached to words. Words missing from ``vocab`` map to the id of
    ``"<unk>"``.

    Args:
        sentence: Input text.
        vocab: Mapping from token to id; must contain ``"<unk>"`` whenever the
            sentence has at least one word.

    Returns:
        ``torch.long`` tensor with one id per whitespace-separated word.
    """
    ids = []
    for word in sentence.lower().split():
        # The fallback id is looked up per word, so an empty sentence never touches vocab.
        ids.append(vocab.get(word, vocab["<unk>"]))
    return torch.tensor(ids, dtype=torch.long)

In [19]:
def seq2seq_translate(sentence, model, en_vocab, it_vocab, it_vocab_inv, device=None, max_len=50):
    """Greedy-decode an Italian translation with the LSTM encoder-decoder.

    Args:
        sentence: English input text; tokenised with ``tokenize``.
        model: Seq2Seq-style module exposing ``encoder`` and ``decoder``.
        en_vocab: English token -> id mapping (must contain ``"<unk>"``).
        it_vocab: Italian token -> id mapping (must contain ``"<bos>"`` and ``"<eos>"``).
        it_vocab_inv: Italian id -> token mapping.
        device: Device to run on. ``None`` (default) picks CUDA when available and
            the CPU otherwise; the model is moved to this device.
        max_len: Maximum number of tokens to generate.

    Returns:
        The decoded tokens joined by single spaces; ``""`` when the input
        contains no tokens or ``<eos>`` is predicted first.
    """
    if device is None:
        # Hard-coding "cuda" would fail on CPU-only machines.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model.eval()
    model.to(device)

    # Shape (1, n_tokens): a batch holding the single input sentence.
    input_tensor = tokenize(sentence, en_vocab).unsqueeze(0).to(device)
    if input_tensor.size(1) == 0:
        # The LSTM rejects zero-length sequences; nothing to translate anyway.
        return ""

    eos_idx = it_vocab["<eos>"]
    decoded_tokens = []

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        hidden, cell = model.encoder(input_tensor)

        # Decoding starts from the <bos> token.
        decoder_input = torch.tensor([it_vocab["<bos>"]], dtype=torch.long, device=device)

        for _ in range(max_len):
            output, hidden, cell = model.decoder(decoder_input, hidden, cell)

            # Greedy choice: highest-scoring token of the single batch element.
            next_token_idx = output.argmax(1).item()
            if next_token_idx == eos_idx:
                break

            decoded_tokens.append(it_vocab_inv[next_token_idx])

            # Feed the prediction back in as the next decoder input.
            decoder_input = torch.tensor([next_token_idx], dtype=torch.long, device=device)

    return " ".join(decoded_tokens)

In [20]:
def transformer_translate(sentence, model, en_vocab, it_vocab, it_vocab_inv, device=None, max_len=50):
    """Greedy-decode an Italian translation with the Transformer model.

    Args:
        sentence: English input text; tokenised with ``tokenize``.
        model: Module exposing ``src_embedding``, ``trg_embedding``,
            ``positional_encoding``, ``transformer``, ``fc_out`` and
            ``generate_square_subsequent_mask``.
        en_vocab: English token -> id mapping (must contain ``"<unk>"``).
        it_vocab: Italian token -> id mapping (must contain ``"<bos>"`` and ``"<eos>"``).
        it_vocab_inv: Italian id -> token mapping.
        device: Device to run on. ``None`` (default) picks CUDA when available and
            the CPU otherwise; the model is moved to this device.
        max_len: Maximum number of tokens to generate.

    Returns:
        The decoded tokens joined by single spaces; ``""`` when the input
        contains no tokens or ``<eos>`` is predicted first.
    """
    if device is None:
        # Hard-coding "cuda" would fail on CPU-only machines.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model.eval()
    model.to(device)

    # Shape (1, n_tokens): a batch holding the single input sentence.
    input_tensor = tokenize(sentence, en_vocab).unsqueeze(0).to(device)
    src_seq_len = input_tensor.size(1)
    if src_seq_len == 0:
        # Nothing to encode, so there is nothing to translate.
        return ""

    eos_idx = it_vocab["<eos>"]
    # The position table is a plain tensor attribute, so it does not follow
    # model.to(device); move it once here instead of on every decoding step.
    positional_encoding = model.positional_encoding.to(device)
    decoded_tokens = []

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # The encoder gets the same causal mask that the model's forward() applies.
        src_mask = model.generate_square_subsequent_mask(src_seq_len).to(device)
        src = model.src_embedding(input_tensor) + positional_encoding[:, :src_seq_len, :]
        memory = model.transformer.encoder(src, mask=src_mask)

        # Decoding starts from a (1, 1) tensor holding the <bos> token.
        decoder_input = torch.tensor([[it_vocab["<bos>"]]], dtype=torch.long, device=device)

        for _ in range(max_len):
            tgt_seq_len = decoder_input.size(1)
            tgt_mask = model.generate_square_subsequent_mask(tgt_seq_len).to(device)

            # The whole prefix is re-decoded each step; only the last position is used.
            trg = model.trg_embedding(decoder_input) + positional_encoding[:, :tgt_seq_len, :]
            output = model.transformer.decoder(trg, memory, tgt_mask=tgt_mask)
            output = model.fc_out(output)

            # Greedy choice: highest-scoring token at the final position.
            next_token_idx = output[:, -1, :].argmax(1).item()
            if next_token_idx == eos_idx:
                break

            decoded_tokens.append(it_vocab_inv[next_token_idx])

            # Extend the prefix with the predicted token.
            decoder_input = torch.cat(
                [decoder_input, torch.tensor([[next_token_idx]], dtype=torch.long, device=device)], dim=1
            )

    return " ".join(decoded_tokens)


In [21]:
def translate(sentence, model_type="seq2seq", device=None):
    """Translate an English sentence to Italian with one of the loaded models.

    Args:
        sentence: English input text.
        model_type: ``"seq2seq"`` for the LSTM encoder-decoder or
            ``"transformer"`` for the Transformer model.
        device: Device to run on. ``None`` (default) picks CUDA when available
            and the CPU otherwise.

    Returns:
        The translated sentence as a space-joined string.

    Raises:
        ValueError: If ``model_type`` is not one of the two supported values.
    """
    # Validate first so a bad model_type fails before any model is touched.
    if model_type not in ("seq2seq", "transformer"):
        raise ValueError("Invalid model_type. Choose 'seq2seq' or 'transformer'.")

    if device is None:
        # Hard-coding "cuda" would fail on CPU-only machines.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    if model_type == "seq2seq":
        return seq2seq_translate(sentence, seq2seq_model, seq2seq_en_v, seq2seq_it_v, seq2seq_it_v_inv, device)
    return transformer_translate(sentence, transformer_model, transformer_en_v, transformer_it_v, transformer_it_v_inv, device)

In [22]:
# Example English input. tokenize() splits on whitespace only, so "doing?" is looked
# up with the question mark attached.
sentence = "How are you doing?"

In [23]:
# translate() defaults to model_type="seq2seq".
seq2seq_translation = translate(sentence)

In [24]:
# NOTE(review): the recorded run printed an empty translation here, which happens when
# the first predicted token is <eos>.
print("Sentence Translation (Using Seq2Seq Model): ", seq2seq_translation)

Sentence Translation (Using Seq2Seq Model):  


In [25]:
# Same sentence, routed through the Transformer model.
transformer_translation = translate(sentence, model_type = "transformer")

In [26]:
# Show the Transformer's output for comparison with the seq2seq result.
print("Sentence Translation (Using Transformer Model): ", transformer_translation)

Sentence Translation (Using Transformer Model):  ares americano
