In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from collections import Counter
from itertools import chain
from nltk.tokenize import word_tokenize  # Ensure NLTK is installed
from sklearn.metrics import accuracy_score
import os
import json
import torch
import random

In [None]:
### GLOBAL VARIABLES ###
SEED = 28
SPECIAL_TOKENS = {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNKNOWN>': 3}
MAX_LEN = 17
### END OF GLOBAL VARIABLES ###

# set random seed
random.seed(SEED)
torch.manual_seed(SEED)  # Seed for CPU computations
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)  # Seed for GPU computations

In [2]:
# Load in data
test_data = pd.read_csv('data_jesc/test', sep='\t')
train_data = pd.read_csv('data_jesc/train', sep='\t')
print(test_data.shape)
print(train_data.shape)

# Shuffle and take 50% of the data
train_data = train_data.sample(frac=1, random_state=42).head(int(train_data.shape[0] * 0.01))
test_data = test_data.sample(frac=1, random_state=42).head(int(test_data.shape[0] * 0.30))
print(test_data.shape)
print(train_data.shape)

# Adjust columns
test_data.columns = ['ENG', 'JPN']
train_data.columns = ['ENG', 'JPN']

# Tokenize English and Japanese sentences
test_data['ENG'] = test_data['ENG'].apply(word_tokenize)
test_data['JPN'] = test_data['JPN'].apply(word_tokenize)
train_data['ENG'] = train_data['ENG'].apply(word_tokenize)
train_data['JPN'] = train_data['JPN'].apply(word_tokenize)

# Analyze sentence lengths
all_lengths = pd.concat([
    train_data['ENG'].apply(len),
    train_data['JPN'].apply(len),
    test_data['ENG'].apply(len),
    test_data['JPN'].apply(len)
])

# Choose a percentile for MAX_LEN (e.g., 95th percentile)
MAX_LEN = int(all_lengths.quantile(0.95))
print("Using 95th percentile for MAX_LEN:", MAX_LEN)

SPECIAL_TOKENS = {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNKNOWN>': 3}

# Adjust build_vocab to limit vocabulary size
def build_vocab(tokenized_data, special_tokens, max_vocab_size=5000):
    tokenized_data = tokenized_data.explode()  # Flatten the DataFrame column
    vocab_counter = Counter(tokenized_data)
    most_common = vocab_counter.most_common(max_vocab_size)  # Keep only the top `max_vocab_size` words
    vocab = {word: idx + len(special_tokens) for idx, (word, _) in enumerate(most_common)}
    vocab.update(special_tokens)
    return vocab

eng_vocab = build_vocab(train_data['ENG'], SPECIAL_TOKENS, max_vocab_size=5000)
jpn_vocab = build_vocab(train_data['JPN'], SPECIAL_TOKENS, max_vocab_size=5000)

# Token-to-index conversion
def safe_tokens_to_indices(tokens, vocab, sos_eos=True):
    indices = [vocab.get(token, vocab['<UNKNOWN>']) for token in tokens]
    if sos_eos:
        indices = [vocab['<START>']] + indices + [vocab['<END>']]
    return indices

def pad_sequence(sequence, max_len=MAX_LEN, pad_value=0):
    return sequence[:max_len] + [pad_value] * max(0, max_len - len(sequence))

# Preprocess data
train_data['ENG'] = train_data['ENG'].apply(lambda x: pad_sequence(safe_tokens_to_indices(x, eng_vocab)))
train_data['JPN'] = train_data['JPN'].apply(lambda x: pad_sequence(safe_tokens_to_indices(x, jpn_vocab)))
test_data['ENG'] = test_data['ENG'].apply(lambda x: pad_sequence(safe_tokens_to_indices(x, eng_vocab)))
test_data['JPN'] = test_data['JPN'].apply(lambda x: pad_sequence(safe_tokens_to_indices(x, jpn_vocab)))


(1992, 2)
(2371921, 2)
(597, 2)
(23719, 2)
Using 95th percentile for MAX_LEN: 17


In [None]:
# Dataset Class
class TranslationDataset(Dataset):
    def __init__(self, src_data, tgt_data):
        self.src_data = src_data
        self.tgt_data = tgt_data
    def __len__(self):
        return len(self.src_data)
    def __getitem__(self, idx):
        return torch.tensor(self.src_data[idx], dtype=torch.long), torch.tensor(self.tgt_data[idx], dtype=torch.long)

### MODELS ###
# LSTM Seq2Seq Model
class LSTMSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, hidden_dim, n_layers, dropout):
        super(LSTMSeq2Seq, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embed_dim = embed_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.embedding_src = nn.Embedding(input_dim, embed_dim)
        self.embedding_tgt = nn.Embedding(output_dim, embed_dim)
        self.encoder = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.decoder = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
    def forward(self, src, tgt):
        src_embedded = self.embedding_src(src)
        _, (hidden, cell) = self.encoder(src_embedded)
        tgt_embedded = self.embedding_tgt(tgt)
        outputs, _ = self.decoder(tgt_embedded, (hidden, cell))
        return self.fc_out(outputs)

# CNN Seq2Seq Model
class CNNSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, kernel_size, num_channels):
        super(CNNSeq2Seq, self).__init__()
        self.kernel_size = kernel_size
        self.num_channels = num_channels
        self.embed_dim = embed_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.embedding_src = nn.Embedding(input_dim, embed_dim)
        self.embedding_tgt = nn.Embedding(output_dim, embed_dim)
        self.encoder = nn.Conv1d(embed_dim, num_channels, kernel_size, padding=kernel_size // 2)
        self.decoder = nn.Conv1d(num_channels + embed_dim, num_channels, kernel_size, padding=kernel_size // 2)
        self.fc_out = nn.Linear(num_channels, output_dim)
    def forward(self, src, tgt):
        # Shape: (batch, seq_len, embed_dim) -> (batch, embed_dim, seq_len)
        src_embedded = self.embedding_src(src).permute(0, 2, 1)  
        tgt_embedded = self.embedding_tgt(tgt).permute(0, 2, 1)  
        # Encoder outputs
        encoder_outputs = self.encoder(src_embedded)  # Shape: (batch, num_channels, seq_len)
        # Ensure tgt_embedded matches encoder_outputs in sequence length
        tgt_embedded = tgt_embedded[:, :, :encoder_outputs.size(2)]  # Truncate if necessary
        if tgt_embedded.size(2) < encoder_outputs.size(2):  # Pad if necessary
            pad_size = encoder_outputs.size(2) - tgt_embedded.size(2)
            tgt_embedded = torch.nn.functional.pad(tgt_embedded, (0, pad_size))
        # Concatenate along the channel dimension
        decoder_inputs = torch.cat((encoder_outputs, tgt_embedded), dim=1)  # Shape: (batch, num_channels + embed_dim, seq_len)
        decoder_outputs = self.decoder(decoder_inputs)  # Shape: (batch, num_channels, seq_len)
        # Final output: (batch, seq_len, output_dim)
        return self.fc_out(decoder_outputs.permute(0, 2, 1))

# Transformer Seq2Seq Model
class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, n_heads, num_layers, dropout):
        super(TransformerSeq2Seq, self).__init__()
        self.embedding_src = nn.Embedding(input_dim, embed_dim)
        self.embedding_tgt = nn.Embedding(output_dim, embed_dim)
        self.transformer = nn.Transformer(embed_dim, n_heads, num_layers, num_layers, dropout=dropout)
        self.fc_out = nn.Linear(embed_dim, output_dim)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_heads = n_heads
        self.num_layers = num_layers
        self.dropout = dropout
        self.embed_dim = embed_dim
    def forward(self, src, tgt):
        src_embedded = self.embedding_src(src).permute(1, 0, 2)  # Shape: (seq_len, batch, embed_dim)
        tgt_embedded = self.embedding_tgt(tgt).permute(1, 0, 2)  # Shape: (seq_len, batch, embed_dim)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_embedded.size(0)).to(src.device)
        output = self.transformer(src_embedded, tgt_embedded, tgt_mask=tgt_mask)
        return self.fc_out(output.permute(1, 0, 2))  # Shape: (batch, seq_len, output_dim)

### END OF MODELS ###

# train and test
def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1]) # predict exclude last token
        # align dimensions for loss calculation
        if output.size(1) != tgt[:, 1:].size(1):
            min_seq_len = min(output.size(1), tgt[:, 1:].size(1))
            output = output[:, :min_seq_len, :]
            tgt = tgt[:, :min_seq_len + 1]
        # get loss
        loss = criterion(output.reshape(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))  # Exclude first token from target
        loss.backward()
        optimizer.step()
        # Calculate acc
        preds = output.argmax(dim=-1)  # Get the predicted token indices
        correct = (preds == tgt[:, 1:]).float()  # Compare predictions to target tokens (shifted by 1)
        total_correct += correct.sum().item()
        total_samples += tgt[:, 1:].numel()  # Total number of target tokens
        total_loss += loss.item()
    # Return avg acc and loss
    avg_loss = total_loss / len(train_loader)
    avg_accuracy = total_correct / total_samples
    return avg_loss, avg_accuracy


def evaluate_model(model, test_loader, criterion, device):
    """
    Tests model performance based on test_loader
    """
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for src, tgt in test_loader:
            src, tgt = src.to(device), tgt.to(device)
            
            # Forward pass
            output = model(src, tgt[:, :-1])  # Predict target sequence excluding the last token
            
            # Align sequence lengths between output and target
            if output.size(1) != tgt[:, 1:].size(1):
                min_seq_len = min(output.size(1), tgt[:, 1:].size(1))
                output = output[:, :min_seq_len, :]
                tgt = tgt[:, :min_seq_len + 1]
            
            # Compute loss
            loss = criterion(output.reshape(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))

            # Calculate accuracy
            preds = output.argmax(dim=-1)  # Get the predicted token indices
            correct = (preds == tgt[:, 1:]).float()  # Compare predictions to target tokens (shifted by 1)
            total_correct += correct.sum().item()
            total_samples += tgt[:, 1:].numel()  # Total number of target tokens
            total_loss += loss.item()

    avg_loss = total_loss / len(test_loader)
    avg_accuracy = total_correct / total_samples
    return avg_loss, avg_accuracy


In [None]:

class GRUSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, hidden_dim, n_layers, dropout):
        """
        GRU-based Seq2Seq model for machine translation.

        Args:
            input_dim (int): Size of the input vocabulary.
            output_dim (int): Size of the output vocabulary.
            embed_dim (int): Embedding dimension.
            hidden_dim (int): Hidden state dimension.
            n_layers (int): Number of GRU layers.
            dropout (float): Dropout rate.
        """
        super(GRUSeq2Seq, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim 
        self.embed_dim = embed_dim 
        self.hidden_dim = hidden_dim 
        self.n_layers = n_layers 
        self.dropout = dropout

        self.embedding_src = nn.Embedding(input_dim, embed_dim)
        self.embedding_tgt = nn.Embedding(output_dim, embed_dim)
        self.encoder = nn.GRU(embed_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.decoder = nn.GRU(embed_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, tgt):
        """
        Forward pass for GRU-based Seq2Seq model.
        """
        src_embedded = self.dropout(self.embedding_src(src))  # (batch_size, src_seq_len, embed_dim)
        tgt_embedded = self.dropout(self.embedding_tgt(tgt))  # (batch_size, tgt_seq_len, embed_dim)
        _, hidden = self.encoder(src_embedded)  # hidden: (n_layers, batch_size, hidden_dim)
        decoder_outputs, _ = self.decoder(tgt_embedded, hidden)  # (batch_size, tgt_seq_len, hidden_dim)
        outputs = self.fc_out(decoder_outputs)  # (batch_size, tgt_seq_len, output_dim)
        return outputs


In [None]:
### TRAINING AND TESTING ###
train_dataset = TranslationDataset(train_data['ENG'].tolist(), train_data['JPN'].tolist())
test_dataset = TranslationDataset(test_data['ENG'].tolist(), test_data['JPN'].tolist())
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

models = {
    'Transformer_Seq2Seq': TransformerSeq2Seq(len(eng_vocab), len(jpn_vocab), 15, 3, 3, 0.2),
    'CNN_Seq2Seq': CNNSeq2Seq(len(eng_vocab), len(jpn_vocab), 24, kernel_size=3, num_channels=5),
    'LSTM_Seq2Seq': LSTMSeq2Seq(len(eng_vocab), len(jpn_vocab), 24, 15, 3, 0.2),
    'GRU_Seq2Seq': GRUSeq2Seq(len(eng_vocab), len(jpn_vocab), 24, 4, 3, 0.2),
}

models2 = {
    'CNN_Seq2Seq': CNNSeq2Seq(len(eng_vocab), len(jpn_vocab), 24, kernel_size=4, num_channels=5),
}

results = {}
EPOCHS = 50

for name, model in models.items():
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 1e-4)
    criterion = nn.CrossEntropyLoss(ignore_index=SPECIAL_TOKENS['<PAD>'])
    print(f"Training {name}...")
    
    for epoch in range(EPOCHS):  # Adjust number of epochs
        train_loss, train_accuracy = train_model(model, train_loader, optimizer, criterion, device)
        if epoch % 10 == 0:
            print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Train Accuracy = {train_accuracy:.4f}")
    test_loss, test_accuracy = evaluate_model(model, test_loader, criterion, device)
    print(f"{name} Test Loss = {test_loss:.4f}, Test Accuracy = {test_accuracy:.4f}")
    results[name] = (test_loss, test_accuracy)

# Print comparison results
print("\nModel Comparison:")
for model_name, (loss, accuracy) in results.items():
    print(f"{model_namgit ae}: Test Loss = {loss:.4f}, Test Accuracy = {accuracy:.4f}")




Training Transformer_Seq2Seq...
Epoch 1: Train Loss = 5.6947, Train Accuracy = 0.0591
Epoch 11: Train Loss = 5.6223, Train Accuracy = 0.0598
Epoch 21: Train Loss = 5.6219, Train Accuracy = 0.0600
Epoch 31: Train Loss = 5.6233, Train Accuracy = 0.0599


In [None]:
def save_vocab(vocab, path):
    """
    Save a vocabulary dictionary to a JSON file.

    Args:
        vocab (dict): The vocabulary dictionary to save.
        path (str): The file path to save the vocabulary.
    """
    with open(path, 'w') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)  # Use ensure_ascii=False for non-ASCII tokens

def load_vocab(path):
    """
    Load a vocabulary dictionary from a JSON file.

    Args:
        path (str): The file path to load the vocabulary from.

    Returns:
        dict: The loaded vocabulary dictionary.
    """
    with open(path, 'r') as f:
        return json.load(f)


# Save English and Japanese vocabularies
save_vocab(eng_vocab, 'saved/eng_vocab.json')
save_vocab(jpn_vocab, 'saved/jpn_vocab.json')

print("Vocabularies saved successfully!")

# Load English and Japanese vocabularies
eng_vocab = load_vocab('saved/eng_vocab.json')
jpn_vocab = load_vocab('saved/jpn_vocab.json')

print("Vocabularies loaded successfully!")

Vocabularies saved successfully!
Vocabularies loaded successfully!


In [None]:
### SAVING AND LOADING MODELS ###
def save_model_and_params(models, save_dir):
    """
    Save models and their parameters.
    Args:
        models (dict): Dictionary containing model names and model objects.
        save_dir (str): Directory to save the models and parameters.
    """
    os.makedirs(save_dir, exist_ok=True)

    for name, model in models.items():
        # Extract model parameters
        if isinstance(model, TransformerSeq2Seq):
            params = {
                "model_type": "Transformer",
                "input_dim": model.input_dim,
                "output_dim": model.output_dim,
                "embed_dim": model.fc_out.in_features, # model.embed_dim
                "n_heads": model.n_heads,
                "num_layers": model.num_layers,
                "dropout": model.dropout
            }

        elif isinstance(model, CNNSeq2Seq):
            params = {
                "model_type": "CNN",
                "input_dim": model.input_dim,
                "output_dim": model.output_dim,
                "embed_dim": model.embed_dim,
                "kernel_size": model.kernel_size,
                "num_channels": model.num_channels
            }
        elif isinstance(model, GRUSeq2Seq):
            # input_dim, output_dim, embed_dim, hidden_dim, n_layers, dropout
            params = {
                "model_type": "GRU",
                "input_dim": model.input_dim,
                "output_dim": model.output_dim,
                "embed_dim": model.embed_dim,
                "hidden_dim": model.hidden_dim,
                "n_layers": model.n_layers,
                "dropout": model.dropout
            }

        elif isinstance(model, LSTMSeq2Seq):
            params = {
                "model_type": "LSTM",
                "input_dim": model.input_dim,
                "output_dim": model.output_dim,
                "embed_dim": model.embed_dim,
                "hidden_dim": model.hidden_dim,
                "n_layers": model.n_layers,
                "dropout": model.dropout
            }
        else:
            raise ValueError(f"Unknown model type for {name}")

        # Save parameters as JSON
        params_path = os.path.join(save_dir, f"{name}_params.json")
        with open(params_path, 'w') as f:
            json.dump(params, f)

        # Save model weights
        weights_path = os.path.join(save_dir, f"{name}_weights.pt")
        torch.save(model.state_dict(), weights_path)

        print(f"Saved {name} to {params_path} and {weights_path}.")

# Function to load model parameters
def load_model_params(params_path):
    """
    Load model parameters from a JSON file.
    Args:
        params_path (str): Path to the JSON file containing model parameters.
    Returns:
        dict: Dictionary of model parameters.
    """
    with open(params_path, 'r') as f:
        return json.load(f)

# Function to load a model dynamically based on its parameters
def load_model(model_name, save_dir, device):
    """
    Dynamically load a model based on its type and saved parameters.
    Args:
        model_name (str): Name of the model to load.
        save_dir (str): Directory where the model files are stored.
        device (torch.device): Device to load the model onto.
    Returns:
        nn.Module: The loaded model.
    """
    # Paths to parameters and weights
    params_path = os.path.join(save_dir, f"{model_name}_params.json")
    weights_path = os.path.join(save_dir, f"{model_name}_weights.pt")

    # Load parameters
    params = load_model_params(params_path)

    # Reconstruct the model based on its type
    if params["model_type"] == "Transformer":
        model = TransformerSeq2Seq(
            params["input_dim"],
            params["output_dim"],
            params["embed_dim"],
            params["n_heads"],
            params["num_layers"],
            params["dropout"]
        )
    elif params["model_type"] == "GRU":
        model = GRUSeq2Seq(
            params["input_dim"],
            params["output_dim"],
            params["embed_dim"],
            params["hidden_dim"],
            params["n_layers"],
            params["dropout"]
        )

    elif params["model_type"] == "CNN":
        model = CNNSeq2Seq(
            params["input_dim"],
            params["output_dim"],
            params["embed_dim"],
            params["kernel_size"],
            params["num_channels"]
        )
    elif params["model_type"] == "LSTM":
        model = LSTMSeq2Seq(
            params["input_dim"],
            params["output_dim"],
            params["embed_dim"],
            params["hidden_dim"],
            params["n_layers"],
            params["dropout"]
        )
    else:
        raise ValueError(f"Unknown model type: {params['model_type']}")

    # Load weights into the model
    model.load_state_dict(torch.load(weights_path, map_location=device))
    model.to(device)

    print(f"Loaded {model_name} ({params['model_type']}) from {weights_path}.")
    return model

# Save models and parameters
save_dir = "saved"
save_model_and_params(models, save_dir)

# Load a specific model dynamically
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = "LSTM_Seq2Seq"  # Example: Change to "Transformer_Seq2Seq" or "CNN_Seq2Seq"
loaded_model = load_model(model_name, save_dir, device)


Saved Transformer_Seq2Seq to saved/Transformer_Seq2Seq_params.json and saved/Transformer_Seq2Seq_weights.pt.
Saved CNN_Seq2Seq to saved/CNN_Seq2Seq_params.json and saved/CNN_Seq2Seq_weights.pt.
Saved LSTM_Seq2Seq to saved/LSTM_Seq2Seq_params.json and saved/LSTM_Seq2Seq_weights.pt.
Loaded LSTM_Seq2Seq (LSTM) from saved/LSTM_Seq2Seq_weights.pt.




In [79]:
def translate_sentence(sentence, model, device):
    """
    Translates a sentence using the given model.

    Args:
        sentence (str): The input sentence to translate.
        model (nn.Module): The trained translation model with attributes:
            - src_vocab (dict): Source vocabulary.
            - tgt_vocab (dict): Target vocabulary.
            - device (torch.device): Device to run the model on.
            - max_len (int): Maximum sequence length for translation.

    Returns:
        str: Translated sentence.
    """
    # Ensure the model is in evaluation mode
    model.eval()

    # load in vocb
    src_vocab = load_vocab('saved/eng_vocab.json')
    tgt_vocab = load_vocab('saved/jpn_vocab.json')

    # Access vocabularies, device, and max_len from the model
    max_len = MAX_LEN

    # Tokenize the input sentence
    tokens = word_tokenize(sentence)

    # Convert tokens to indices
    src_indices = [src_vocab.get(token, src_vocab['<UNKNOWN>']) for token in tokens]
    src_indices = [src_vocab['<START>']] + src_indices + [src_vocab['<END>']]

    # Convert to tensor and add batch dimension
    src_tensor = torch.tensor([src_indices], dtype=torch.long, device=device)

    # Prepare initial target input (start with <START> token)
    tgt_indices = [tgt_vocab['<START>']]
    tgt_tensor = torch.tensor([tgt_indices], dtype=torch.long, device=device)

    for _ in range(max_len):
        # Generate predictions
        with torch.no_grad():
            output = model(src_tensor, tgt_tensor)

        # Get the next token (argmax of the last output step)
        next_token = output[:, -1, :].argmax(dim=-1).item()

        # Stop if <END> token is predicted
        if next_token == tgt_vocab['<END>']:
            break

        # Add the predicted token to the target tensor
        tgt_indices.append(next_token)
        tgt_tensor = torch.tensor([tgt_indices], dtype=torch.long, device=device)

    # Convert target indices to tokens
    tgt_vocab_inv = {idx: token for token, idx in tgt_vocab.items()}
    translated_tokens = [tgt_vocab_inv[idx] for idx in tgt_indices[1:]]  # Skip the <START> token
    return ' '.join(translated_tokens)


In [None]:
# Load a specific model dynamically
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = "CNN_Seq2Seq" 
loaded_model = load_model(model_name, save_dir, device)

Loaded CNN_Seq2Seq (CNN) from saved/CNN_Seq2Seq_weights.pt.


In [86]:
sentence = 'How are you?'
translate_sentence(sentence, loaded_model, device)

'出 出 出 出 こんなに 植物 植物 植物 植物 植物 植物 植物 植物 植物 植物'

---

In [None]:
import pandas as pd
import numpy as np
import sklearn
import torch
import torch.optim as optim
from collections import Counter
from itertools import chain
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import nltk
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
nltk.download('punkt')

Preprocessing Data

In [None]:
# load in the data
test_data = pd.read_csv('data_jesc/test', sep = '\t')
train_data = pd.read_csv('data_jesc/train', sep = '\t')

# adjust columns
test_data.columns = ['ENG', 'JPN']
train_data.columns = ['ENG', 'JPN']

# Tokenize both English and Japanese sentences
test_data['ENG'] = test_data['ENG'].apply(word_tokenize)
test_data['JPN'] = test_data['JPN'].apply(word_tokenize)

train_data['ENG'] = train_data['ENG'].apply(word_tokenize)
train_data['JPN'] = train_data['JPN'].apply(word_tokenize)

Data Processing

In [None]:
# Define special tokens
SPECIAL_TOKENS = {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNKNOWN>': 3}

# Build vocabulary
def build_vocab(tokenized_data, special_tokens=SPECIAL_TOKENS):
    MIN_FREQ = 5  # Minimum frequency for a word to be included in the vocabulary
    vocab_counter = Counter(chain(*tokenized_data))
    pruned_vocab = {k: v for k, v in vocab_counter.items() if v >= MIN_FREQ}
    vocab = {word: idx + len(special_tokens) for idx, (word, _) in enumerate(pruned_vocab.items())}
    vocab.update(special_tokens)
    return vocab

# Function to convert tokens to indices
def tokens_to_indices(tokens, vocab, sos_eos=True):
    indices = [vocab.get(token, vocab['<UNKNOWN>']) for token in tokens]
    if sos_eos:
        indices = [vocab['<START>']] + indices + [vocab['<END>']]
    return indices

# Convert tokens to indices and ensure indices are within range
def safe_tokens_to_indices(tokens, vocab, sos_eos=True):
    indices = [vocab.get(token, vocab['<UNKNOWN>']) for token in tokens]
    if sos_eos:
        indices = [vocab['<START>']] + indices + [vocab['<END>']]
    return indices

# Pad sequence to a fixed length
MAX_LEN = 15
def pad_sequence(sequence, max_len=MAX_LEN, pad_value=SPECIAL_TOKENS['<PAD>']):
    return sequence[:max_len] + [pad_value] * max(0, max_len - len(sequence))

# Apply the above functions to preprocess the data
def preprocess_data(data, vocab, max_len=MAX_LEN):
    return data.apply(lambda x: pad_sequence(safe_tokens_to_indices(x, vocab)))

# Build vocabularies for English and Japanese
eng_vocab = build_vocab(train_data['ENG'])
jpn_vocab = build_vocab(train_data['JPN'])

# Preprocess train and test data
train_data['ENG'] = preprocess_data(train_data['ENG'], eng_vocab)
train_data['JPN'] = preprocess_data(train_data['JPN'], jpn_vocab)

test_data['ENG'] = preprocess_data(test_data['ENG'], eng_vocab)
test_data['JPN'] = preprocess_data(test_data['JPN'], jpn_vocab)

# Check language vocabulary size
eng_words = len(eng_vocab)
jpn_words = len(jpn_vocab)
print("English vocabulary has {} unique words".format(eng_words))
print("Japanese vocabulary has {} unique words".format(jpn_words))

# Dataset and DataLoader
class TranslationDataset(Dataset):
    def __init__(self, src_data, tgt_data):
        self.src_data = src_data
        self.tgt_data = tgt_data

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        return torch.tensor(self.src_data[idx], dtype=torch.long), torch.tensor(self.tgt_data[idx], dtype=torch.long)

# Create Dataset and DataLoader
train_dataset = TranslationDataset(train_data['ENG'].tolist(), train_data['JPN'].tolist())
test_dataset = TranslationDataset(test_data['ENG'].tolist(), test_data['JPN'].tolist())

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Verify a batch from DataLoader
for src, tgt in train_loader:
    print("Source batch shape:", src.shape)
    print("Target batch shape:", tgt.shape)
    break


In [None]:
# Training and Evaluation
def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        output = output.reshape(-1, output.shape[-1])
        tgt = tgt[:, 1:].reshape(-1)
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    predictions, actuals = [], []
    with torch.no_grad():
        for src, tgt in test_loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            output = output.argmax(dim=-1)
            predictions.extend(output.cpu().tolist())
            actuals.extend(tgt[:, 1:].cpu().tolist())
    return accuracy_score(actuals, predictions)

In [None]:
### MODELS ###

# Seq2Seq Models
# LSTM Seq2Seq
class LSTMSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, hidden_dim, n_layers, dropout):
        super(LSTMSeq2Seq, self).__init__()
        self.embedding_src = nn.Embedding(input_dim, embed_dim)  # Source language embedding
        self.embedding_tgt = nn.Embedding(output_dim, embed_dim)  # Target language embedding
        self.encoder = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.decoder = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)  # Final output layer


    def forward(self, src, tgt):
        src_embedded = self.embedding(src)
        _, (hidden, cell) = self.encoder(src_embedded)
        tgt_embedded = self.embedding(tgt)
        outputs, _ = self.decoder(tgt_embedded, (hidden, cell))
        return self.fc_out(outputs)

# Transformer Seq2Seq
class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, n_heads, num_layers, dropout):
        super(TransformerSeq2Seq, self).__init__()
        self.embedding_src = nn.Embedding(input_dim, embed_dim)
        self.embedding_tgt = nn.Embedding(output_dim, embed_dim)
        self.transformer = nn.Transformer(embed_dim, n_heads, num_layers, num_layers, dropout=dropout)
        self.fc_out = nn.Linear(embed_dim, output_dim)

    def forward(self, src, tgt):
        src_embedded = self.embedding_src(src).permute(1, 0, 2)  # (seq_len, batch, embed_dim)
        tgt_embedded = self.embedding_tgt(tgt).permute(1, 0, 2)
        output = self.transformer(src_embedded, tgt_embedded)
        return self.fc_out(output.permute(1, 0, 2))

# CNN Seq2Seq
class CNNSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, kernel_size, num_channels):
        super(CNNSeq2Seq, self).__init__()
        self.embedding_src = nn.Embedding(input_dim, embed_dim)
        self.embedding_tgt = nn.Embedding(output_dim, embed_dim)
        self.encoder = nn.Conv1d(embed_dim, num_channels, kernel_size, padding=kernel_size // 2)
        self.decoder = nn.Conv1d(num_channels + embed_dim, num_channels, kernel_size, padding=kernel_size // 2)
        self.fc_out = nn.Linear(num_channels, output_dim)

    def forward(self, src, tgt):
        src_embedded = self.embedding_src(src).permute(0, 2, 1)
        encoder_outputs = self.encoder(src_embedded)
        tgt_embedded = self.embedding_tgt(tgt).permute(0, 2, 1)
        decoder_inputs = torch.cat((encoder_outputs, tgt_embedded), dim=1)
        decoder_outputs = self.decoder(decoder_inputs).permute(0, 2, 1)
        return self.fc_out(decoder_outputs)

In [None]:
print("Max index in train_data['ENG']:", max(chain(*train_data['ENG'])))
print("Max index in train_data['JPN']:", max(chain(*train_data['JPN'])))
print("Max index in test_data['ENG']:", max(chain(*test_data['ENG'])))
print("Max index in test_data['JPN']:", max(chain(*test_data['JPN'])))
print("English vocab size:", len(eng_vocab))
print("Japanese vocab size:", len(jpn_vocab))

In [None]:
# Debug: Print a batch from the DataLoader
for src, tgt in train_loader:
    print("Source batch sample:", src[0])
    print("Target batch sample:", tgt[0])
    print("Max source index in batch:", src.max().item())
    print("Max target index in batch:", tgt.max().item())
    break


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Compare models
models = {
    'LSTM_Seq2Seq': LSTMSeq2Seq(len(eng_vocab), len(jpn_vocab), 100, 100, 2, 0.5),
    'Transformer_Seq2Seq': TransformerSeq2Seq(len(eng_vocab), len(jpn_vocab), 100, 100, 2, 0.5),
    'CNN_Seq2Seq': CNNSeq2Seq(len(eng_vocab), len(jpn_vocab), 100, 2, 100)
}

results = {}

for name, model in models.items():
    model.to(device)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=SPECIAL_TOKENS['<PAD>'])
    print(f"Training {name}...")
    for epoch in range(10):
        train_loss = train_model(model, train_loader, optimizer, criterion, device)
        print(f"Epoch {epoch + 1}, Train Loss: {train_loss:.4f}")
    test_loss = evaluate_model(model, test_loader, criterion, device)
    results[name] = test_loss

print("Model Comparison:")
for model_name, loss in results.items():
    print(f"{model_name}: Test Loss = {loss:.4f}")

In [None]:
print(train_data['ENG'].head())
print(train_data['JPN'].head())
print(type(train_data['ENG'].iloc[0]))
print(type(train_data['JPN'].iloc[0]))

---

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Dataset class
class TranslationDataset(Dataset):
    def __init__(self, eng_data, jpn_data):
        self.eng_data = torch.tensor(eng_data, dtype=torch.long)
        self.jpn_data = torch.tensor(jpn_data, dtype=torch.long)

    def __len__(self):
        return len(self.eng_data)

    def __getitem__(self, idx):
        return self.eng_data[idx], self.jpn_data[idx]

class Encoder(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, num_layers, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, num_layers, dropout):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, tgt, hidden, cell):
        tgt = tgt.unsqueeze(1)  # Add time dimension
        embedded = self.embedding(tgt)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell

# Seq2Seq Model
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        tgt_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        # First input to the decoder is the <START> token
        input_token = tgt[:, 0]

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs[:, t, :] = output
            # Use teacher forcing
            top1 = output.argmax(1)
            input_token = tgt[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs



In [None]:
import torch
import torch.nn as nn
import math

class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, num_heads, num_layers, dropout):
        super(TransformerSeq2Seq, self).__init__()
        self.embedding_src = nn.Embedding(input_dim, embed_dim)
        self.embedding_tgt = nn.Embedding(output_dim, embed_dim)
        self.positional_encoding = self._generate_positional_encoding(embed_dim)
        self.transformer = nn.Transformer(
            d_model=embed_dim,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dropout=dropout
        )
        self.fc_out = nn.Linear(embed_dim, output_dim)

    def _generate_positional_encoding(self, embed_dim, max_len=5000):
        pos_encoding = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * -(math.log(10000.0) / embed_dim))
        pos_encoding[:, 0::2] = torch.sin(position * div_term)
        pos_encoding[:, 1::2] = torch.cos(position * div_term)
        return pos_encoding.unsqueeze(0)

    def forward(self, src, tgt):
        src_seq_len = src.size(1)
        tgt_seq_len = tgt.size(1)

        src = self.embedding_src(src) + self.positional_encoding[:, :src_seq_len, :]
        tgt = self.embedding_tgt(tgt) + self.positional_encoding[:, :tgt_seq_len, :]

        src = src.permute(1, 0, 2)  # (seq_len, batch, embed_dim)
        tgt = tgt.permute(1, 0, 2)  # (seq_len, batch, embed_dim)

        outputs = self.transformer(src, tgt)
        outputs = outputs.permute(1, 0, 2)  # (batch, seq_len, embed_dim)

        return self.fc_out(outputs)

# Example Usage
INPUT_DIM = 1000
OUTPUT_DIM = 1000
EMBED_DIM = 256
NUM_HEADS = 8
NUM_LAYERS = 3
DROPOUT = 0.1

model = TransformerSeq2Seq(INPUT_DIM, OUTPUT_DIM, EMBED_DIM, NUM_HEADS, NUM_LAYERS, DROPOUT)

src = torch.randint(0, INPUT_DIM, (32, 10))  # (batch, src_len)
tgt = torch.randint(0, OUTPUT_DIM, (32, 10))  # (batch, tgt_len)

outputs = model(src, tgt)
print(outputs.shape)  # (batch, tgt_len, output_dim)


In [None]:
import torch
import torch.nn as nn

class CNNEncoder(nn.Module):
    def __init__(self, input_dim, embed_dim, kernel_size, num_channels):
        super(CNNEncoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.conv = nn.Conv1d(
            in_channels=embed_dim,
            out_channels=num_channels,
            kernel_size=kernel_size,
            padding=(kernel_size - 1) // 2
        )
        self.relu = nn.ReLU()

    def forward(self, src):
        embedded = self.embedding(src).permute(0, 2, 1)  # (batch, embed_dim, seq_len)
        conv_out = self.conv(embedded)
        return self.relu(conv_out.permute(0, 2, 1))  # (batch, seq_len, num_channels)

class CNNDecoder(nn.Module):
    def __init__(self, output_dim, embed_dim, kernel_size, num_channels):
        super(CNNDecoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.conv = nn.Conv1d(
            in_channels=num_channels + embed_dim,
            out_channels=num_channels,
            kernel_size=kernel_size,
            padding=(kernel_size - 1) // 2
        )
        self.fc = nn.Linear(num_channels, output_dim)
        self.relu = nn.ReLU()

    def forward(self, tgt, encoder_outputs):
        embedded = self.embedding(tgt).permute(0, 2, 1)  # (batch, embed_dim, seq_len)
        decoder_input = torch.cat([encoder_outputs.permute(0, 2, 1), embedded], dim=1)
        conv_out = self.conv(decoder_input)
        conv_out = self.relu(conv_out.permute(0, 2, 1))  # (batch, seq_len, num_channels)
        return self.fc(conv_out)

class Seq2SeqCNN(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2SeqCNN, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        encoder_outputs = self.encoder(src)
        outputs = self.decoder(tgt, encoder_outputs)
        return outputs

# Example Usage
INPUT_DIM = 1000
OUTPUT_DIM = 1000
EMBED_DIM = 256
KERNEL_SIZE = 3
NUM_CHANNELS = 256

encoder = CNNEncoder(INPUT_DIM, EMBED_DIM, KERNEL_SIZE, NUM_CHANNELS)
decoder = CNNDecoder(OUTPUT_DIM, EMBED_DIM, KERNEL_SIZE, NUM_CHANNELS)
model = Seq2SeqCNN(encoder, decoder)

src = torch.randint(0, INPUT_DIM, (32, 10))  # (batch, src_len)
tgt = torch.randint(0, OUTPUT_DIM, (32, 10))  # (batch, tgt_len)

outputs = model(src, tgt)
print(outputs.shape)  # (batch, tgt_len, output_dim)


In [None]:
# Hyperparameters
INPUT_DIM = len(eng_vocab)
OUTPUT_DIM = len(jpn_vocab)
EMBED_DIM = 100 #256
HIDDEN_DIM = 100 #512
NUM_LAYERS = 2
DROPOUT = 0.5
LEARNING_RATE = 1e-3
EPOCHS = 10
BATCH_SIZE = 64
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize encoder, decoder, and model
encoder = Encoder(INPUT_DIM, EMBED_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT)
decoder = Decoder(OUTPUT_DIM, EMBED_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=SPECIAL_TOKENS['<PAD>'])

# Prepare dataset and dataloader
train_dataset = TranslationDataset(train_data['ENG'], train_data['JPN'])
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Training loop
for epoch in range(EPOCHS):
    print(f'Running {epoch+1}')
    model.train()
    epoch_loss = 0
    for src, tgt in train_loader:
        src, tgt = src.to(DEVICE), tgt.to(DEVICE)

        optimizer.zero_grad()
        output = model(src, tgt)

        # Reshape outputs and targets
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)

        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {epoch_loss/len(train_loader):.4f}')


In [None]:
def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(DEVICE), tgt.to(DEVICE)
            output = model(src, tgt, teacher_forcing_ratio=0)

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            tgt = tgt[:, 1:].reshape(-1)

            loss = criterion(output, tgt)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


---

Model

In [None]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers=1, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=SPECIAL_TOKENS['<PAD>'])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers=1, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, embedding_dim, padding_idx=SPECIAL_TOKENS['<PAD>'])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)  # Add batch dimension
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell
    
### MODEL ###

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        tgt_vocab_size = self.decoder.embedding.num_embeddings

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)
        input = tgt[:, 0]  # <SOS>

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t, :] = output
            top1 = output.argmax(1)
            input = tgt[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs

### END OF MODEL ###

def train(model, dataloader, optimizer, criterion, device, clip=1):
    model.train()
    epoch_loss = 0

    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        
        output = model(src, tgt)
        output_dim = output.shape[-1]

        # Remove <SOS> token for target
        output = output[:, 1:].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)

        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

def test(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt, teacher_forcing_ratio=0)  # No teacher forcing
            output_dim = output.shape[-1]

            output = output[:, 1:].reshape(-1, output_dim)
            tgt = tgt[:, 1:].reshape(-1)

            loss = criterion(output, tgt)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


class TranslationDataset(Dataset):
    def __init__(self, data):
        self.src = torch.tensor(data['ENG'].tolist(), dtype=torch.long)
        self.tgt = torch.tensor(data['JPN'].tolist(), dtype=torch.long)

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]


In [None]:
# Hyperparameters
INPUT_DIM = len(eng_vocab)
OUTPUT_DIM = len(jpn_vocab)
EMBEDDING_DIM = 20 #50
HIDDEN_DIM = 16
N_LAYERS = 2
DROPOUT = 0.5
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = DEVICE
N_EPOCHS = 5
CLIP = 1
PAD_IDX = SPECIAL_TOKENS['<PAD>']

# init model
encoder = Encoder(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT)
decoder = Decoder(OUTPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)
# Loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.AdamW(model.parameters())

# load data into a dataloader
train_dataset = TranslationDataset(train_data)
dev_dataset = TranslationDataset(dev_data)
'''train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=32)'''
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=32)


# training #
for epoch in range(N_EPOCHS):
    print(epoch)
    train_loss = train(model, train_loader, optimizer, criterion, device, CLIP)
    dev_loss = test(model, dev_loader, criterion, device)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {torch.exp(torch.tensor(train_loss)):.2f}')
    print(f'\tVal Loss: {dev_loss:.3f} | Val PPL: {torch.exp(torch.tensor(dev_loss)):.2f}')


In [None]:
# Save model
torch.save(model.state_dict(), 'seq2seq_model.pt')

# Load model
model.load_state_dict(torch.load('seq2seq_model.pt'))
model.eval()

In [None]:
def translate_sentence(model, sentence, eng_vocab, jpn_idx2word, max_len=MAX_LEN):
    model.eval()
    tokens = [eng_vocab.get(word, eng_vocab['<UNKNOWN>']) for word in word_tokenize(sentence)]
    src_tensor = torch.tensor([tokens + [PAD_IDX] * (max_len - len(tokens))], dtype=torch.long).to(DEVICE)

    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)
        input = torch.tensor([SPECIAL_TOKENS['<START>']], dtype=torch.long).to(DEVICE)
        translated_sentence = []

        for _ in range(max_len):
            output, hidden, cell = model.decoder(input, hidden, cell)
            top1 = output.argmax(1)
            if top1.item() == SPECIAL_TOKENS['<END>']:
                break
            translated_sentence.append(jpn_idx2word[top1.item()])
            input = top1

    return ' '.join(translated_sentence)

# Example
sentence = "The cat runs, the dog barks"
print(translate_sentence(model, sentence, eng_vocab, jpn_idx2word))
