In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # os.walk yields bare file names; join with the directory to get a usable path
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Version 3

In [108]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import wandb

In [109]:
class TextConversionCorpus(Dataset):
    """Character-level transliteration pairs read from a tab-separated lexicon file.

    The first two columns of ``data_file`` are read as ``original`` and
    ``transliterated``. Samples are stored as ``(transliterated, original)``
    pairs, so the transliterated column is the model input and the original
    column is the target. ``__getitem__`` returns two ``LongTensor``s of length
    ``sequence_limit`` laid out as ``<bos> chars... <eos> <pad>...``.

    Args:
        data_file: Path to the TSV file (no header row).
        source_dictionary: Character -> index mapping for the input side.
            Required unless ``build_dictionary`` is True. A missing ``<eos>``
            entry is added to the passed mapping in place.
        target_dictionary: Same as ``source_dictionary`` for the target side.
        sequence_limit: Fixed length of every returned index sequence.
        build_dictionary: When True, build both dictionaries from ``data_file``
            instead of using the ones passed in.

    Raises:
        TypeError: If ``build_dictionary`` is False and a dictionary is missing.

    Note:
        A failure while reading ``data_file`` is reported with ``print`` and the
        dataset falls back to a single empty pair rather than raising.
    """

    def __init__(self, data_file, source_dictionary=None, target_dictionary=None, sequence_limit=32, build_dictionary=False):
        # Try to load the data file with romanized and native text
        try:
            # keep_default_na=False: without it pandas parses ordinary words such as
            # "nan", "null" or "NA" as missing values, and they would be blanked below.
            df = pd.read_csv(data_file, sep='\t', header=None,
                          names=['original', 'transliterated', 'frequency'],
                          usecols=[0, 1], dtype=str,
                          keep_default_na=False)
            print(f"Successfully loaded {len(df)} entries from {data_file}")

            # Rows with fewer fields than expected still produce NaN; blank them out
            df['original'] = df['original'].fillna('')
            df['transliterated'] = df['transliterated'].fillna('')

            # Create pairs for training: (model input, model target)
            self.entry_pairs = list(zip(df['transliterated'], df['original']))
            print(f"Sample entries: {self.entry_pairs[:2]}")
        except Exception as e:
            # Best-effort: keep the object usable, but make the failure visible
            print(f"Data loading error: {e}")
            self.entry_pairs = [('', '')]  # Default empty entry

        self.sequence_limit = sequence_limit

        # Setup dictionaries for conversion
        if build_dictionary:
            self.source_dictionary = {'<pad>': 0, '<unk>': 1, '<bos>': 2, '<eos>': 3}
            self.target_dictionary = {'<pad>': 0, '<unk>': 1, '<bos>': 2, '<eos>': 3}
            self._build_dictionaries()
        else:
            if source_dictionary is None or target_dictionary is None:
                raise TypeError(
                    "source_dictionary and target_dictionary are required when build_dictionary=False"
                )
            self.source_dictionary, self.target_dictionary = source_dictionary, target_dictionary
            # Ensure special tokens exist
            if '<eos>' not in self.source_dictionary:
                self.source_dictionary['<eos>'] = len(self.source_dictionary)
            if '<eos>' not in self.target_dictionary:
                self.target_dictionary['<eos>'] = len(self.target_dictionary)

    def _build_dictionaries(self):
        """Extend both dictionaries with every character seen in ``entry_pairs``."""
        for source_text, target_text in self.entry_pairs:
            for character in source_text:
                if character not in self.source_dictionary:
                    self.source_dictionary[character] = len(self.source_dictionary)
            for character in target_text:
                if character not in self.target_dictionary:
                    self.target_dictionary[character] = len(self.target_dictionary)
        print(f"Dictionary sizes — Source: {len(self.source_dictionary)}, Target: {len(self.target_dictionary)}")

    def _encode(self, text, dictionary):
        """Return ``text`` as exactly ``sequence_limit`` indices: <bos> chars <eos> <pad>..."""
        unknown = dictionary['<unk>']
        end_token = dictionary['<eos>']
        vocabulary_size = len(dictionary)

        indices = [dictionary['<bos>']]
        for character in text:
            idx = dictionary.get(character, unknown)
            if idx >= vocabulary_size:
                idx = unknown  # Safety check against inconsistent dictionaries
            indices.append(idx)
        indices.append(end_token)

        limit = self.sequence_limit
        if len(indices) > limit:
            # Truncate, but keep <eos> as the final token so that long words are
            # still terminated instead of being cut off mid-sequence.
            indices = indices[:limit]
            if limit > 0:
                indices[-1] = end_token
        else:
            indices.extend([dictionary['<pad>']] * (limit - len(indices)))
        return indices

    def __len__(self):
        return len(self.entry_pairs)

    def __getitem__(self, index):
        """Return ``(source_indices, target_indices)`` for the pair at ``index``."""
        source_text, target_text = self.entry_pairs[index]

        source_indices = self._encode(source_text, self.source_dictionary)
        target_indices = self._encode(target_text, self.target_dictionary)

        # Validate padding index
        assert self.source_dictionary['<pad>'] < len(self.source_dictionary), "Source padding index out of range"
        assert self.target_dictionary['<pad>'] < len(self.target_dictionary), "Target padding index out of range"

        return torch.tensor(source_indices, dtype=torch.long), torch.tensor(target_indices, dtype=torch.long)


In [110]:
class SourceProcessor(nn.Module):
    """Encoder: embeds source character indices and feeds them to a recurrent stack.

    ``architecture_type`` selects the recurrent layer ('SimpleRNN', 'GRU' or
    'LSTM'); any other value raises ``ValueError``. Index 0 is the embedding
    padding index.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, layers_count, dropout_rate=0.0, architecture_type='GRU'):
        super().__init__()
        self.char_embeddings = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.hidden_size = hidden_size
        self.layers_count = layers_count
        self.architecture_type = architecture_type

        # Look the requested recurrent layer up among the supported ones
        supported = (('SimpleRNN', nn.RNN), ('GRU', nn.GRU), ('LSTM', nn.LSTM))
        for label, candidate in supported:
            if architecture_type == label:
                rnn_class = candidate
                break
        else:
            raise ValueError(f"Unsupported architecture: {architecture_type}")

        # Dropout is only passed on when there is more than one layer
        inter_layer_dropout = 0.0
        if layers_count > 1:
            inter_layer_dropout = dropout_rate

        self.processor = rnn_class(
            embed_size,
            hidden_size,
            num_layers=layers_count,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

    def forward(self, input_seq):
        """Encode a batch of source sequences.

        Args:
            input_seq: Source indices [batch_size, seq_length]

        Returns:
            The recurrent layer's ``(output, final_state)`` pair: the per-step
            outputs [batch_size, seq_length, hidden_size] and the final state
            (a ``(hidden, cell)`` tuple for LSTM).
        """
        return self.processor(self.char_embeddings(input_seq))


class TargetGenerator(nn.Module):
    """Decoder: one recurrent step per call, followed by a linear projection to the vocabulary.

    ``architecture_type`` selects the recurrent layer ('SimpleRNN', 'GRU' or
    'LSTM'); any other value raises ``ValueError``. Index 0 is the embedding
    padding index.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, layers_count, dropout_rate=0.0, architecture_type='GRU'):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.layers_count = layers_count
        self.architecture_type = architecture_type

        self.char_embeddings = nn.Embedding(vocab_size, embed_size, padding_idx=0)

        # Look the requested recurrent layer up among the supported ones
        supported = (('SimpleRNN', nn.RNN), ('GRU', nn.GRU), ('LSTM', nn.LSTM))
        for label, candidate in supported:
            if architecture_type == label:
                rnn_class = candidate
                break
        else:
            raise ValueError(f"Unsupported architecture: {architecture_type}")

        # Dropout is only passed on when there is more than one layer
        inter_layer_dropout = 0.0
        if layers_count > 1:
            inter_layer_dropout = dropout_rate

        self.processor = rnn_class(
            embed_size,
            hidden_size,
            num_layers=layers_count,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        # Maps the recurrent output to one score per vocabulary entry
        self.output_mapper = nn.Linear(hidden_size, vocab_size)

    def forward(self, token_index, prev_state):
        """Run a single decoding step.

        Args:
            token_index: Current token indices [batch_size, 1]
            prev_state: Recurrent state carried over from the previous step

        Returns:
            A pair ``(scores, new_state)``: the raw (unnormalized) output of the
            linear projection [batch_size, vocab_size] and the updated state.
        """
        step_output, new_state = self.processor(self.char_embeddings(token_index), prev_state)
        # Drop the length-1 time axis before projecting
        scores = self.output_mapper(step_output.squeeze(1))
        return scores, new_state


class TransliterationSystem(nn.Module):
    """Encoder-decoder transliteration model built from SourceProcessor and TargetGenerator.

    Args:
        params: Configuration object exposing ``embedding_size``, ``internal_size``,
            ``rnn_architecture``, ``encoder_depth``, ``decoder_depth`` and
            ``dropout_prob`` as attributes.
        input_vocab_size: Number of entries in the source vocabulary (> 0).
        output_vocab_size: Number of entries in the target vocabulary (> 0).
    """

    def __init__(self, params, input_vocab_size, output_vocab_size):
        super().__init__()

        # Validate vocabulary sizes
        assert input_vocab_size > 0, f"Invalid input vocabulary size: {input_vocab_size}"
        assert output_vocab_size > 0, f"Invalid output vocabulary size: {output_vocab_size}"

        # Store configuration parameters
        self.embedding_dim = params.embedding_size
        self.internal_dim = params.internal_size
        self.rnn_type = params.rnn_architecture
        self.encoder_depth = params.encoder_depth
        self.decoder_depth = params.decoder_depth

        # Initialize encoder and decoder components
        self.encoder = SourceProcessor(
            input_vocab_size,
            params.embedding_size,
            params.internal_size,
            params.encoder_depth,
            params.dropout_prob,
            params.rnn_architecture
        )

        self.decoder = TargetGenerator(
            output_vocab_size,
            params.embedding_size,
            params.internal_size,
            params.decoder_depth,
            params.dropout_prob,
            params.rnn_architecture
        )

        # Store embeddings for access during training/inference
        self.source_embeddings = self.encoder.char_embeddings
        self.target_embeddings = self.decoder.char_embeddings

        print(f"Model initialized: {self.rnn_type}, Encoder depth: {self.encoder_depth}, "
              f"Decoder depth: {self.decoder_depth}, Embeddings: {self.embedding_dim}, "
              f"Hidden size: {self.internal_dim}")

    def _match_depth(self, layer_states):
        """Trim or extend a [layers, batch, hidden] state to ``decoder_depth`` layers."""
        if self.encoder_depth > self.decoder_depth:
            # Encoder is deeper: keep only its top layers
            return layer_states[-self.decoder_depth:]
        # Decoder is deeper: fill the extra layers with copies of the encoder's last layer
        missing = self.decoder_depth - self.encoder_depth
        return torch.cat([layer_states, layer_states[-1:].repeat(missing, 1, 1)], 0)

    def _bridge_state(self, encoder_state):
        """Turn the encoder's final state into the decoder's initial state."""
        if self.encoder_depth == self.decoder_depth:
            return encoder_state
        if isinstance(encoder_state, tuple):  # LSTM: (hidden state, cell state)
            return tuple(self._match_depth(part) for part in encoder_state)
        return self._match_depth(encoder_state)  # GRU / RNN: hidden state only

    def forward(self, source_sequence, target_sequence):
        """
        Teacher-forced forward pass through the complete transliteration model.

        Args:
            source_sequence: Source text indices [batch_size, src_len]
            target_sequence: Target text indices [batch_size, tgt_len]

        Returns:
            Unnormalized scores [batch_size, tgt_len-1, output_vocab_size]; the
            scores at step ``t`` are produced from target token ``t``.

        Raises:
            Any error raised by the encoder or decoder is propagated. Earlier
            this method returned an all-zero tensor instead, which has no
            gradient and was indistinguishable from a real prediction.
        """
        batch_size, _ = source_sequence.size()
        tgt_len = target_sequence.size(1)
        device = source_sequence.device

        # Check for out-of-bounds indices and clamp if needed
        if source_sequence.max() >= self.source_embeddings.num_embeddings:
            print("Warning: Source indices out of vocabulary range")
            source_sequence = torch.clamp(source_sequence, 0, self.source_embeddings.num_embeddings - 1)
        if target_sequence.max() >= self.target_embeddings.num_embeddings:
            print("Warning: Target indices out of vocabulary range")
            target_sequence = torch.clamp(target_sequence, 0, self.target_embeddings.num_embeddings - 1)

        # Process source sequence and hand its final state to the decoder
        _, encoder_state = self.encoder(source_sequence)
        decoder_state = self._bridge_state(encoder_state)

        outputs = torch.zeros(batch_size, tgt_len - 1, self.decoder.vocab_size, device=device)

        # Teacher forcing: feed target tokens 0..tgt_len-2, one step at a time
        for t in range(tgt_len - 1):
            current_token = target_sequence[:, t].unsqueeze(1)
            step_scores, decoder_state = self.decoder(current_token, decoder_state)
            outputs[:, t, :] = step_scores

        return outputs

    def generate(self, source_sequence, max_length=50, beam_width=1, bos_idx=2, eos_idx=3, pad_idx=0):
        """
        Greedily generate a transliteration for each source sequence.

        Args:
            source_sequence: Source sequence [batch_size, src_len]
            max_length: Maximum generation length
            beam_width: Accepted for compatibility; beam search is not
                implemented, so every width decodes greedily.
            bos_idx: Index of the start token fed at the first step
            eos_idx: Index of the end-of-sequence token
            pad_idx: Index written after a row's end-of-sequence token

        Returns:
            Generated indices [batch_size, max_length]. Each row holds the
            generated tokens up to and including ``eos_idx`` and ``pad_idx``
            afterwards.

        Note:
            This method does not switch to eval mode or disable gradients;
            callers that want that should use ``model.eval()`` and
            ``torch.no_grad()``.
        """
        batch_size = source_sequence.size(0)
        device = source_sequence.device

        # Encode source sequence and hand its final state to the decoder
        _, encoder_state = self.encoder(source_sequence)
        decoder_state = self._bridge_state(encoder_state)

        decoder_token = torch.full((batch_size, 1), bos_idx, dtype=torch.long, device=device)
        output_sequence = torch.full((batch_size, max_length), pad_idx, dtype=torch.long, device=device)
        # Tracks which rows have already produced <eos>
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

        for t in range(max_length):
            token_scores, decoder_state = self.decoder(decoder_token, decoder_state)

            # Select most likely token
            _, token_idx = token_scores.topk(1)
            next_token = token_idx.view(batch_size)

            # Rows that already ended must not keep emitting tokens
            next_token = next_token.masked_fill(finished, pad_idx)
            output_sequence[:, t] = next_token

            finished = finished | (next_token == eos_idx)
            if finished.all():
                break

            decoder_token = next_token.unsqueeze(1)

        return output_sequence

In [111]:
def calculate_performance(logits, reference, pad_idx=0):
    """
    Calculate character accuracy excluding padding tokens
    """
    predictions = logits.argmax(dim=-1)
    valid_positions = reference != pad_idx
    correct_chars = (predictions == reference) & valid_positions
    performance = correct_chars.sum().item() / max(valid_positions.sum().item(), 1)
    return performance

In [112]:
def training_iteration(model, dataloader, loss_function, optimizer, compute_device):
    """Train ``model`` for one pass over ``dataloader``.

    Batches containing an index outside the model's embedding tables are
    skipped with a message, and any exception raised while processing a batch
    is printed and that batch is skipped.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches that completed,
        or ``(0.0, 0.0)`` when none did.
    """
    model.train()
    batch_count = len(dataloader)
    loss_sum = 0.0
    accuracy_sum = 0.0
    completed = 0

    for step, (inputs, targets) in enumerate(dataloader):
        try:
            inputs = inputs.to(compute_device)
            targets = targets.to(compute_device)

            # Every index must be addressable by the corresponding embedding table
            in_vocabulary = (
                inputs.max().item() < model.source_embeddings.num_embeddings
                and targets.max().item() < model.target_embeddings.num_embeddings
            )
            if not in_vocabulary:
                print(f"Skipping batch {step}/{batch_count} - vocabulary index issues detected")
                continue

            optimizer.zero_grad()

            predictions = model(inputs, targets)

            # The model scores tokens 1..end, so drop the first target column
            flat_scores = predictions.reshape(-1, predictions.size(-1))
            gold = targets[:, 1:]
            batch_loss = loss_function(flat_scores, gold.reshape(-1))
            batch_loss.backward()

            # Clip the gradient norm before the optimizer step
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            loss_sum += batch_loss.item()
            accuracy_sum += calculate_performance(predictions, gold)
            completed += 1

        except Exception as error:
            print(f"Error processing batch {step}/{batch_count}: {error}")
            continue

    if completed == 0:
        return 0.0, 0.0
    return loss_sum / completed, accuracy_sum / completed


In [113]:
def validation_check(model, dataloader, loss_function, compute_device):
    """Evaluate ``model`` on ``dataloader`` with gradients disabled.

    Batches containing an index outside the model's embedding tables are
    skipped silently, and any exception raised while processing a batch is
    printed and that batch is skipped.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches that completed,
        or ``(0.0, 0.0)`` when none did.
    """
    model.eval()
    loss_sum = 0.0
    accuracy_sum = 0.0
    completed = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            try:
                inputs = inputs.to(compute_device)
                targets = targets.to(compute_device)

                # Every index must be addressable by the corresponding embedding table
                in_vocabulary = (
                    inputs.max() < model.source_embeddings.num_embeddings
                    and targets.max() < model.target_embeddings.num_embeddings
                )
                if not in_vocabulary:
                    continue

                predictions = model(inputs, targets)

                # The model scores tokens 1..end, so drop the first target column
                flat_scores = predictions.reshape(-1, predictions.size(-1))
                gold = targets[:, 1:]
                batch_loss = loss_function(flat_scores, gold.reshape(-1))
                loss_sum += batch_loss.item()

                accuracy_sum += calculate_performance(predictions, gold)
                completed += 1

            except Exception as error:
                print(f"Evaluation error: {error}")
                continue

    if completed == 0:
        return 0.0, 0.0
    return loss_sum / completed, accuracy_sum / completed


In [114]:
# Candidate values for each swept hyperparameter, in the order they appear in the config.
_SWEEP_CHOICES = (
    ('embedding_size', [16, 32, 64, 256]),
    ('internal_size', [16, 32, 64, 256]),
    ('rnn_architecture', ['SimpleRNN', 'GRU', 'LSTM']),
    ('encoder_depth', [1, 2, 3]),
    ('decoder_depth', [1, 2, 3]),
    ('dropout_prob', [0.2, 0.3]),
    ('learning_rate', [1e-3, 1e-4]),
    ('batch_size', [32, 64]),
    ('beam_width', [1, 3, 5]),
)

# Sweep configuration: Bayesian search that maximizes the logged 'validation_accuracy'.
hyperparameter_search = {
    'method': 'bayes',
    'metric': {'name': 'validation_accuracy', 'goal': 'maximize'},
    'parameters': {name: {'values': options} for name, options in _SWEEP_CHOICES},
}

In [115]:
def save_dictionaries(output_dir, source_dict, target_dict):
    """Write both dictionaries as UTF-8 JSON files inside ``output_dir``.

    The directory is created when missing. The source mapping goes to
    ``source_dict.json`` and the target mapping to ``target_dict.json``;
    non-ASCII characters are written as-is rather than escaped.
    """
    os.makedirs(output_dir, exist_ok=True)

    outputs = (
        ('source_dict.json', source_dict),
        ('target_dict.json', target_dict),
    )
    for file_name, mapping in outputs:
        with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as handle:
            json.dump(mapping, handle, indent=2, ensure_ascii=False)

def load_dictionaries(directory):
    """Read ``source_dict.json`` and ``target_dict.json`` from ``directory``.

    Returns:
        ``(source_dict, target_dict)`` as parsed from the two UTF-8 JSON files.
    """
    loaded = []
    for file_name in ('source_dict.json', 'target_dict.json'):
        with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as handle:
            loaded.append(json.load(handle))

    source_dict, target_dict = loaded
    return source_dict, target_dict


In [116]:
# Lexicon splits from the Dakshina `mr` folder and the folder that receives the saved dictionaries
train_file = '/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons/mr.translit.sampled.train.tsv'
dev_file = '/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons/mr.translit.sampled.dev.tsv'
dict_dir = '/kaggle/working/dictionaries'

print("Building dictionaries...")
# build_dictionary=True derives both character dictionaries from the training split
train_corpus = TextConversionCorpus(train_file, build_dictionary=True)
source_dict = train_corpus.source_dictionary
target_dict = train_corpus.target_dictionary

save_dictionaries(dict_dir, source_dict, target_dict)
print(f"Source dictionary: {len(source_dict)}, Target dictionary: {len(target_dict)}")

# Show the first ten entries of each dictionary
for label, dictionary in (("Source", source_dict), ("Target", target_dict)):
    print(f"{label} dictionary sample:")
    for char, idx in list(dictionary.items())[:10]:
        print(f"  {char!r} => {idx}")

Building dictionaries...
Successfully loaded 56303 entries from /kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons/mr.translit.sampled.train.tsv
Sample entries: [('angry', 'अँग्री'), ('aengeography', 'अँजिओग्राफी')]
Dictionary sizes — Source: 30, Target: 69
Source dictionary: 30, Target dictionary: 69
Source dictionary sample:
  '<pad>' => 0
  '<unk>' => 1
  '<bos>' => 2
  '<eos>' => 3
  'a' => 4
  'n' => 5
  'g' => 6
  'r' => 7
  'y' => 8
  'e' => 9
Target dictionary sample:
  '<pad>' => 0
  '<unk>' => 1
  '<bos>' => 2
  '<eos>' => 3
  'अ' => 4
  'ँ' => 5
  'ग' => 6
  '्' => 7
  'र' => 8
  'ी' => 9


In [117]:
def execute_hyperparameter_search():
    """Train and validate one model for the configuration supplied by the current W&B run.

    Reads the module-level ``source_dict``, ``target_dict``, ``train_file`` and
    ``dev_file``. Trains for 20 epochs, logging loss and accuracy for both
    splits after each epoch, then logs the best validation accuracy once more
    under ``validation_accuracy``. Failures in any stage are printed; dataset or
    optimizer setup failures end the run early.
    """
    run = wandb.init()
    cfg = run.config

    # Descriptive run name assembled from the sampled hyperparameters
    run.name = f"{cfg.rnn_architecture}-e{cfg.embedding_size}-h{cfg.internal_size}-enc{cfg.encoder_depth}-dec{cfg.decoder_depth}-d{cfg.dropout_prob}-lr{cfg.learning_rate}-b{cfg.batch_size}-beam{cfg.beam_width}"

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    training_epochs = 20

    # Build the model, retrying on CPU if building or moving it fails
    try:
        model = TransliterationSystem(cfg, len(source_dict), len(target_dict))
        print("Model created, moving to target device...")
        model = model.to(device)
        print(f"Model successfully loaded on {device}.")
    except Exception as e:
        print(f"Error initializing model on {device}: {e}")
        print("Falling back to CPU...")
        device = torch.device('cpu')
        model = TransliterationSystem(cfg, len(source_dict), len(target_dict)).to(device)

    # Datasets and loaders; only the training loader shuffles
    try:
        print("Preparing datasets...")
        training_data = TextConversionCorpus(train_file, source_dict, target_dict)
        validation_data = TextConversionCorpus(dev_file, source_dict, target_dict)

        train_loader = DataLoader(training_data, batch_size=cfg.batch_size, shuffle=True)
        dev_loader = DataLoader(validation_data, batch_size=cfg.batch_size)
    except Exception as e:
        print(f"Dataset preparation error: {e}")
        return

    # Loss (padding index 0 is ignored) and optimizer
    try:
        loss_function = nn.CrossEntropyLoss(ignore_index=0)
        optimizer = optim.Adam(model.parameters(), lr=cfg.learning_rate)
    except Exception as e:
        print(f"Error setting up training components: {e}")
        return

    best_validation_accuracy = 0.0

    try:
        for epoch in range(training_epochs):
            print(f"Epoch {epoch+1}/{training_epochs}")

            train_loss, train_accuracy = training_iteration(model, train_loader, loss_function, optimizer, device)
            print(f"Training - Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}")

            val_loss, val_accuracy = validation_check(model, dev_loader, loss_function, device)
            print(f"Validation - Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

            epoch_metrics = {
                'training_loss': train_loss,
                'validation_loss': val_loss,
                'training_accuracy': train_accuracy,
                'validation_accuracy': val_accuracy,
                'epoch': epoch,
            }
            wandb.log(epoch_metrics)

            # Remember the best validation accuracy seen so far
            best_validation_accuracy = max(best_validation_accuracy, val_accuracy)

        # Final entry: the best accuracy, logged under the same key as the per-epoch value
        wandb.log({'validation_accuracy': best_validation_accuracy})

    except Exception as e:
        print(f"Training error: {e}")

In [118]:
# SECURITY: never hard-code the W&B API key in the notebook. The key that was
# committed here earlier is exposed and should be revoked and regenerated.
# The key is now read from the WANDB_API_KEY environment variable; set it before
# running this cell (for example from the platform's secret store).
# sweep_id = wandb.sweep(hyperparameter_search, project='DLA3')
# wandb.agent(sweep_id, execute_hyperparameter_search, count=20)
wandb.login(key=os.environ.get("WANDB_API_KEY"))



True

Testing

In [119]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from torch.utils.data import DataLoader
import os
from tqdm import tqdm
import csv
from IPython.display import display, HTML

In [120]:
# Test split of the same `mr` lexicon folder that the train/dev files above come from
path_test = '/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons/mr.translit.sampled.test.tsv'

In [121]:
# Reuse the dictionaries built from the training split (build_dictionary is left False),
# so test characters map to the same indices the model was trained with.
test_dataset = TextConversionCorpus(path_test, source_dict, target_dict)
# batch_size=1 with the default shuffle=False: one word pair per batch, in file order
test_loader = DataLoader(test_dataset, batch_size=1)

Successfully loaded 5682 entries from /kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons/mr.translit.sampled.test.tsv
Sample entries: [('andarasan', 'अँडरसन'), ('andarasana', 'अँडरसन')]


In [122]:
def _match_decoder_depth(encoder_state, encoder_depth, decoder_depth):
    """Reshape the encoder's final state so its layer count matches the decoder's."""
    if encoder_depth == decoder_depth:
        # Depths match: the encoder state can be used directly.
        return encoder_state

    def _resize(state):
        if encoder_depth > decoder_depth:
            # Encoder is deeper: keep only the last `decoder_depth` layers.
            return state[-decoder_depth:]
        # Decoder is deeper: replicate the encoder's last layer for the extra layers.
        extra = state[-1:].repeat(decoder_depth - encoder_depth, 1, 1)
        return torch.cat([state, extra], 0)

    if isinstance(encoder_state, tuple):
        # Tuple state (e.g. LSTM hidden + cell): resize every component the same way.
        return tuple(_resize(part) for part in encoder_state)
    return _resize(encoder_state)


def inference_without_teacher_forcing(model, src, max_len=100, device=None,
                                      bos_token=2, eos_token=3, pad_token=0):
    """
    Greedily generate a sequence without teacher forcing.

    At every step the decoder's most likely token is fed back as the next
    input. Each sequence is tracked individually: once it has produced
    `eos_token`, its remaining positions are filled with `pad_token`, and
    generation stops as soon as every sequence in the batch has finished.

    Args:
        model: The seq2seq model; must expose `encoder`, `decoder`,
            `encoder_depth` and `decoder_depth`
        src: Source sequence [batch_size, src_len]
        max_len: Maximum length to generate
        device: Device to run inference on (defaults to `src.device`)
        bos_token: Token ID fed to the decoder at the first step (default 2)
        eos_token: Token ID that marks the end of a sequence (default 3)
        pad_token: Token ID written after a sequence has finished (default 0)

    Returns:
        outputs: Generated sequence [batch_size, max_len]
    """
    batch_size = src.size(0)
    if device is None:
        device = src.device

    # Output buffer; positions that are never generated stay at pad_token.
    outputs = torch.full((batch_size, max_len), pad_token, dtype=torch.long, device=device)

    # Encode source sequence and adapt its final state to the decoder's depth.
    _, encoder_state = model.encoder(src)
    decoder_state = _match_decoder_depth(encoder_state, model.encoder_depth, model.decoder_depth)

    # Start every sequence with the BOS token, shaped [batch_size, 1].
    decoder_token = torch.full((batch_size, 1), bos_token, dtype=torch.long, device=device)

    # finished[i] becomes True once sequence i has emitted eos_token.
    finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

    for t in range(max_len):
        decoder_output, decoder_state = model.decoder(
            decoder_token,
            decoder_state
        )

        # Greedy choice: most likely token, kept at shape [batch_size, 1].
        _, topi = decoder_output.topk(1)
        decoder_token = topi.view(batch_size, 1)
        step_tokens = decoder_token.squeeze(1)

        # Sequences that already ended keep receiving padding instead of
        # whatever the decoder continues to produce for them.
        outputs[:, t] = step_tokens.masked_fill(finished, pad_token)

        # Checking only the current step would require every sequence to emit
        # EOS simultaneously; accumulate completion per sequence instead.
        finished = finished | (step_tokens == eos_token)
        if finished.all():
            break

    return outputs

In [123]:
def decode_seq(seq, char_vocab, end_token=3):
    """
    Turn a sequence of token indices back into text.

    Index 0 (padding) is skipped, decoding stops at the first `end_token`,
    and any index missing from the vocabulary is rendered as '<UNK>'.

    Args:
        seq: Sequence of indices
        char_vocab: Character vocabulary (char -> idx mapping)
        end_token: Token index representing end of sequence

    Returns:
        Decoded string
    """
    # Invert the char -> idx mapping once for lookups by index.
    index_to_symbol = dict((index, symbol) for symbol, index in char_vocab.items())
    symbols = []
    for token in seq:
        if token == 0:
            # Padding carries no text.
            continue
        if token == end_token:
            # Everything after the end marker is ignored.
            break
        symbols.append(index_to_symbol.get(token, '<UNK>'))
    return ''.join(symbols)


In [131]:
def evaluate_model(model, test_loader, src_vocab, tgt_vocab, device, end_token=3, unk_token=1, output_file='predictions_vanilla.csv'):
    """
    Evaluate model on test set and save results to CSV

    Every example is decoded greedily (via `model.inference` when the model
    provides it, otherwise `inference_without_teacher_forcing`), compared with
    the reference as a whole word, and written as one CSV row with the columns
    input, prediction, target, correct.

    Args:
        model: The trained model
        test_loader: DataLoader for test dataset
        src_vocab: Source vocabulary (char -> idx mapping)
        tgt_vocab: Target vocabulary (char -> idx mapping)
        device: Device to run evaluation on
        end_token: Token index representing end of sequence (default=3)
        unk_token: Token index representing unknown token (default=1)
        output_file: Path to save CSV results

    Returns:
        Tuple `(accuracy, results)`: the fraction of exactly matching words
        (0 when there are no results) and a list of dicts with the keys
        'input', 'prediction', 'target' and 'correct'.
    """
    # Number of leading characters dropped from the decoded source and target
    # strings. NOTE(review): presumably this removes a 5-character
    # start-of-sequence marker that decode_seq renders for the first token;
    # confirm against the vocabulary.
    prefix_len = 5

    model.eval()
    results = []
    print(f'Target vocabulary size: {len(tgt_vocab)}')
    print(f'Special tokens: PAD={0}, UNK={unk_token}, SOS={2}, EOS={end_token}')
    with torch.no_grad(), open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['input', 'prediction', 'target', 'correct'])
        for batch in tqdm(test_loader, desc='Evaluating'):
            src, tgt = batch
            src, tgt = (src.to(device), tgt.to(device))
            if hasattr(model, 'inference'):
                output = model.inference(src)
            else:
                output = inference_without_teacher_forcing(model, src, max_len=100, device=device)
            for i in range(src.size(0)):
                src_seq = src[i].cpu().numpy()
                # NOTE(review): the first target token is dropped only when it
                # equals unk_token; verify this is the intended check.
                if tgt[i, 0].item() == unk_token:
                    tgt_seq = tgt[i, 1:].cpu().numpy()
                else:
                    tgt_seq = tgt[i].cpu().numpy()
                pred_seq = output[i].cpu().numpy()
                input_str = decode_seq(src_seq, src_vocab, end_token)[prefix_len:]
                target_str = decode_seq(tgt_seq, tgt_vocab, end_token)[prefix_len:]
                pred_str = decode_seq(pred_seq, tgt_vocab, end_token)
                correct = pred_str == target_str
                # Write all four declared columns; the 'correct' value used to
                # be missing, leaving every data row one field short of the header.
                writer.writerow([input_str, pred_str, target_str, correct])
                results.append({'input': input_str, 'prediction': pred_str, 'target': target_str, 'correct': correct})
    accuracy = sum((1 for r in results if r['correct'])) / len(results) if results else 0
    print(f'Overall Word Accuracy: {accuracy:.4f}')
    return (accuracy, results)

In [132]:
def generate_char_comparison_html(input_str, pred_str, target_str):
    """
    Generate HTML with character-by-character comparison

    The prediction is rendered one character per <span>: green where it equals
    the target character at the same position, red otherwise. The shorter of
    the two strings is right-padded with spaces so both are compared over the
    same length. All text is HTML-escaped before being inserted, so characters
    such as '<', '>' and '&' (for example in a literal '<UNK>' marker) are
    shown as text instead of being interpreted as markup.

    Args:
        input_str: Source string shown in the "Input" line
        pred_str: Predicted string, coloured per character
        target_str: Reference string

    Returns:
        HTML fragment as a string
    """
    # Imported locally so the block does not depend on a notebook-level import.
    from html import escape

    max_len = max(len(pred_str), len(target_str))
    pred_str_padded = pred_str.ljust(max_len)
    target_str_padded = target_str.ljust(max_len)

    spans = []
    for p_char, t_char in zip(pred_str_padded, target_str_padded):
        colour = 'green' if p_char == t_char else 'red'
        spans.append(f'<span style="color:{colour}">{escape(p_char)}</span>')

    return (
        f'<p><b>Input:</b> {escape(input_str)}</p>'
        '<p><b>Prediction vs Target:</b> '
        + ''.join(spans)
        + '</p>'
        + f'<p><b>Target:</b> {escape(target_str)}</p>'
    )

In [133]:
def visualize_errors(results, n_samples=10):
    """
    Render a random selection of results as a character-level comparison.

    When both wrong and right predictions exist, up to half of `n_samples` is
    drawn from the wrong ones and the remainder from the right ones (limited
    by how many are available). Otherwise the selection is drawn from all
    results.

    Args:
        results: List of dicts with 'input', 'prediction', 'target', 'correct'
        n_samples: Number of samples to aim for

    Returns:
        An `HTML` display object containing one section per selected sample
    """
    wrong = [entry for entry in results if not entry['correct']]
    right = [entry for entry in results if entry['correct']]

    if wrong and right:
        wrong_quota = min(n_samples // 2, len(wrong))
        right_quota = min(n_samples - wrong_quota, len(right))
        chosen = random.sample(wrong, wrong_quota) + random.sample(right, right_quota)
    else:
        chosen = random.sample(results, min(n_samples, len(results)))

    sections = [
        '<h2>Character-level Error Visualization</h2>',
        '<p>Green: Correct characters, Red: Incorrect characters</p>',
    ]
    for number, entry in enumerate(chosen, start=1):
        sections.append(f'<h3>Sample {number}</h3>')
        sections.append(generate_char_comparison_html(entry['input'], entry['prediction'], entry['target']))
        sections.append('<hr>')
    return HTML(''.join(sections))

In [134]:
# Look up the hyperparameter sweep through the W&B public API and keep the
# configuration of the run that the sweep reports as its best.
api = wandb.Api()
sweep = api.sweep('da24m014-iit-madras/DLA3/sweeps/4a34r0cv')
best_run = sweep.best_run()
best_run_config = best_run.config

[34m[1mwandb[0m: Sorting runs by -summary_metrics.validation_accuracy


In [135]:
# Show the best run's configuration as the cell output.
best_run_config

{'batch_size': 32,
 'beam_width': 5,
 'dropout_prob': 0.3,
 'decoder_depth': 3,
 'encoder_depth': 3,
 'internal_size': 256,
 'learning_rate': 0.001,
 'embedding_size': 16,
 'rnn_architecture': 'LSTM'}

In [136]:
def test_and_evaluate(config=None, epochs=1, checkpoint_path='model.pth'):
    """
    Train a model with the given configuration, then evaluate it on the test set.

    The model is trained for `epochs` epochs; whenever validation accuracy
    improves, its weights are saved to `checkpoint_path`. Before testing, the
    best saved weights are loaded back so the test score belongs to the best
    validation epoch rather than to the last one. Metrics are logged to a W&B
    run, which is always closed on exit, and a character-level error
    visualisation is displayed.

    Relies on names defined elsewhere in the notebook: `train_file`,
    `dev_file`, `source_dict`, `target_dict`, `test_loader`,
    `TransliterationSystem`, `TextConversionCorpus`, `training_iteration`,
    `validation_check`, `evaluate_model` and `visualize_errors`.

    Args:
        config: Hyperparameter configuration passed to `wandb.init`
        epochs: Number of training epochs (default 1)
        checkpoint_path: File the best model weights are written to
            (default 'model.pth')
    """
    run = wandb.init(config=config, project='DLA3')
    try:
        cfg = run.config
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        run.name = 'Testing without Attention'
        model = TransliterationSystem(cfg, len(source_dict), len(target_dict))
        model = model.to(device)
        print('Model successfully moved to device.')
        print('Loading datasets...')
        train_dataset = TextConversionCorpus(train_file, source_dict, target_dict)
        dev_dataset = TextConversionCorpus(dev_file, source_dict, target_dict)
        train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True)
        dev_loader = DataLoader(dev_dataset, batch_size=cfg.batch_size)
        # ignore_index=0: targets equal to 0 do not contribute to the loss.
        criterion = nn.CrossEntropyLoss(ignore_index=0)
        optimizer = optim.Adam(model.parameters(), lr=cfg.learning_rate)
        best_val_acc = 0.0
        checkpoint_saved = False
        for epoch in range(epochs):
            print(f'Epoch {epoch + 1}/{epochs}')
            train_loss, train_acc = training_iteration(model, train_loader, criterion, optimizer, device)
            print(f'Train loss: {train_loss:.4f} Train Accuracy: {train_acc:.4f}')
            val_loss, val_acc = validation_check(model, dev_loader, criterion, device)
            print(f'Validation loss: {val_loss:.4f} Val Accuracy: {val_acc:.4f}')
            wandb.log({'train_loss': train_loss, 'val_loss': val_loss, 'train_acc': train_acc, 'val_acc': val_acc, 'epoch': epoch})
            if best_val_acc < val_acc:
                best_val_acc = val_acc
                torch.save(model.state_dict(), checkpoint_path)
                checkpoint_saved = True
        # Evaluate the best validation checkpoint, not whatever the final epoch
        # left in memory. Skipped when no epoch ever improved on 0.0, because
        # no checkpoint was written in that case.
        if checkpoint_saved:
            model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        test_acc, results = evaluate_model(model, test_loader, source_dict, target_dict, device)
        wandb.log({'Test_acc': test_acc})
        display(visualize_errors(results, n_samples=10))
    finally:
        # Close the run even when training or evaluation fails, so it does not
        # stay open in the notebook kernel.
        run.finish()

In [137]:
# Train with the best sweep configuration, then evaluate on the test set.
test_and_evaluate(best_run_config)

Model initialized: LSTM, Encoder depth: 3, Decoder depth: 3, Embeddings: 16, Hidden size: 256
Model successfully moved to device.
Loading datasets...
Successfully loaded 56303 entries from /kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons/mr.translit.sampled.train.tsv
Sample entries: [('angry', 'अँग्री'), ('aengeography', 'अँजिओग्राफी')]
Successfully loaded 5658 entries from /kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons/mr.translit.sampled.dev.tsv
Sample entries: [('aendarsanla', 'अँडरसनला'), ('andersonla', 'अँडरसनला')]
Epoch 1/1
Train loss: 2.4234 Train Accuracy: 0.3341
Validation loss: 1.5620 Val Accuracy: 0.5588
Target vocabulary size: 69
Special tokens: PAD=0, UNK=1, SOS=2, EOS=3


Evaluating: 100%|██████████| 5682/5682 [00:33<00:00, 168.60it/s]

Overall Word Accuracy: 0.0084



