In [1]:
!pip install wandb
import numpy as np
import pandas as pd
import wandb
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm
wandb.login()
import csv
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [2]:
def read_and_split_data(train_file, test_file, val_file):
    # Read data
    train_data = pd.read_csv(train_file, header=None)
    test_data = pd.read_csv(test_file, header=None)
    val_data = pd.read_csv(val_file, header=None)

    # Split into English and Marathi words
    english_train = train_data.iloc[:, 0]
    marathi_train = train_data.iloc[:, 1]

    english_test = test_data.iloc[:, 0]
    marathi_test = test_data.iloc[:, 1]

    english_val = val_data.iloc[:, 0]
    marathi_val = val_data.iloc[:, 1]

    return (english_train, marathi_train, english_test, marathi_test, english_val, marathi_val)



In [3]:
# File paths
lang="mar"
train_file = f"/kaggle/input/aksharantar-sampled/aksharantar_sampled/{lang}/{lang}_train.csv"
test_file = f"/kaggle/input/aksharantar-sampled/aksharantar_sampled/{lang}/{lang}_test.csv"
val_file = f"/kaggle/input/aksharantar-sampled/aksharantar_sampled/{lang}/{lang}_valid.csv"

# Call the function
english_train, marathi_train, english_test, marathi_test, english_val, marathi_val = read_and_split_data(train_file, test_file, val_file)

In [4]:
def create_char_list(words):
    char_set = set(char for word in words for char in word)
    char_list = sorted(char_set)
    max_length_word = max(len(word) for word in words)
    return char_list, max_length_word


def find_max_length(word_list):
    max_length = -1
    for word in word_list:
        max_length = max(max_length, len(word))
    return max_length

# Create character lists and find maximum word lengths
english_chars, english_max_len = create_char_list(english_train)
marathi_chars, marathi_max_len = create_char_list(marathi_train)

# Find maximum word lengths from validation and test data
english_max_len = max(find_max_length(english_val), find_max_length(english_test), english_max_len)
marathi_max_len = max(find_max_length(marathi_val), find_max_length(marathi_test), marathi_max_len)

In [5]:
def word_to_vector(word, lang):
    max_len = -1
    if lang == "english":
        max_len = english_max_len
    else:
        max_len = marathi_max_len

    vector = [0] * (max_len + 2)  # Initialize vector with max length + 2 (for special tokens)
    vector[0] = len(english_chars) + 1 if lang == "english" else len(marathi_chars) + 1
    count=1
    if(lang == "english"):
        for char in word:
            for i in range(len(english_chars)):
                if(english_chars[i] == char):
                    vector[count]=i+1
                    count+=1
    else :
        for char in word:
            for i in range(len(marathi_chars)):
                if(marathi_chars[i] == char):
                    vector[count]=i+1
                    count+=1

    return vector


In [6]:
def indices_to_words(indices, language):
    words = []
    char_list = english_chars if language == "english" else marathi_chars
    for idx in range(len(indices)):
        if idx == 0:
            continue
        if indices[idx]==0:
            break
        char = char_list[indices[idx] - 1]  # Adjust for zero-indexing
        words.append(char)
    return ''.join(words)

In [7]:
# creating matrix of representation
def word_matrix(words, language):
    matrix = []
    for word in words:
        matrix.append(word_to_vector(word, language))
    return torch.tensor(matrix)

In [8]:
def prepare_word_matrices(train_data, val_data, test_data, language):
    train_matrix = word_matrix(train_data, language)
    val_matrix = word_matrix(val_data, language)
    test_matrix = word_matrix(test_data, language)
    return train_matrix, val_matrix, test_matrix

In [9]:
english_matrix, english_matrix_val, english_matrix_test = prepare_word_matrices(english_train, english_val, english_test, "english")
marathi_matrix, marathi_matrix_val, marathi_matrix_test = prepare_word_matrices(marathi_train, marathi_val, marathi_test, "marathi")

In [10]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, num_layers, batch_size, dropout_prob, bidirectional, cell_type):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.dropout = nn.Dropout(dropout_prob)
        self.biderectional = bidirectional
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.cell_type = cell_type

        rnn_class = nn.RNN if cell_type == "RNN" else (nn.LSTM if cell_type == "LSTM" else nn.GRU)
        self.rnn = rnn_class(embedding_dim, hidden_size, num_layers, dropout=dropout_prob, bidirectional=bidirectional)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        if self.cell_type == "LSTM":
            output, (hidden, cell) = self.rnn(embedded)
        else:
            output, hidden = self.rnn(embedded)

        return (output, hidden, cell) if self.cell_type == "LSTM" else (output, hidden)

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [11]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, dec_layers, p, cell_type, attention=False, bidirectional=False):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dec_layers = dec_layers
        self.dropout = nn.Dropout(p)
        self.cell_type = cell_type
        self.attention = attention
        self.bidirectional = bidirectional
        self.max_length = len(english_matrix[0])

        self.embedding = nn.Embedding(input_size, embedding_size)
        rnn_input_size = hidden_size if attention else embedding_size
        rnn_class = nn.RNN if cell_type == "RNN" else (nn.LSTM if cell_type == "LSTM" else nn.GRU)
        self.rnn = rnn_class(rnn_input_size, hidden_size, dec_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

        if attention:
            self.attn = nn.Linear(hidden_size + embedding_size, self.max_length)
            self.attn_combine = nn.Linear(hidden_size * 2 + embedding_size, hidden_size) if bidirectional else nn.Linear(hidden_size + embedding_size, hidden_size)

    def forward(self, x, output, hidden, cell=None):
        x = x.unsqueeze(0)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        if self.attention:
            attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
            attn_applied = torch.bmm(attn_weights.unsqueeze(1), output.permute(1, 0, 2)).squeeze(1)
            op = torch.cat((embedded[0], attn_applied), 1)
            op = self.attn_combine(op).unsqueeze(0)
            op = F.relu(op)
        else:
            op = embedded

        if self.cell_type == "LSTM":
            outputs, (hidden, cell) = self.rnn(op, (hidden, cell))
        else:
            outputs, hidden = self.rnn(op, hidden)

        predictions, hidden, cell = self.generate_predictions(outputs, hidden, cell)

        return predictions, hidden, cell

    def generate_predictions(self, rnn_outputs, rnn_hidden, rnn_cell=None):
        output_predictions = self.fc(rnn_outputs)
        output_predictions = output_predictions.squeeze(0)

        return (output_predictions, rnn_hidden, rnn_cell) if self.cell_type == "LSTM" else (output_predictions, rnn_hidden ,rnn_cell)


    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


In [12]:
class Seq2SeqModel(nn.Module):
    def __init__(self, output_size, cell_type, bidirectional, enc_layers, dec_layers, encoder, decoder):
        super(Seq2SeqModel, self).__init__()
        self.output_size = output_size
        self.cell_type = cell_type
        self.bidirectional = bidirectional
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target , teacher_force_ratio=0.5):
        target_len = target.shape[0]
        batch_size = source.shape[1]

        outputs = torch.zeros(target_len, batch_size, self.output_size).to(source.device)

        encoder_output, hidden, cell = self.encode_sequence(source)
        hidden, cell = self.prepare_decoder_states(hidden, cell)
        outputs = self.decode_sequence(target, encoder_output, hidden, cell, teacher_force_ratio)

        return outputs

    def encode_sequence(self, source):
        if self.cell_type == "LSTM":
            encoder_output, hidden, cell = self.encoder(source)
            return encoder_output, hidden, cell
        else:
            encoder_output, hidden = self.encoder(source)
            return encoder_output, hidden, None

    def prepare_decoder_states(self, hidden, cell):
        if self.bidirectional or self.enc_layers != self.dec_layers:
          hidden = hidden[self.enc_layers - 1] + hidden[self.enc_layers - 1]
          hidden = hidden.repeat(self.dec_layers,1,1)
          if(self.cell_type == "LSTM"):
              cell = cell[self.enc_layers - 1] + cell[self.enc_layers - 1]
              cell = cell.repeat(self.dec_layers,1,1)
        return hidden, cell


    def decode_sequence(self, tgt, enc_out, hid, cell, teacher_force_ratio):
        batch_size = tgt.shape[1]
        target_len = tgt.shape[0]
        outputs = torch.zeros(target_len, batch_size, self.output_size).to(enc_out.device)

        timestep = 1
        current_token = tgt[0]

        while timestep < target_len:
            if self.cell_type == "LSTM":
                output, hid, cell = self.decoder(current_token, enc_out, hid, cell)
            else:
                output, hid, cell = self.decoder(current_token, enc_out, hid)
            outputs[timestep] = output

            if random.random() < teacher_force_ratio:
                current_token = tgt[timestep] if timestep < target_len - 1 else output.argmax(1)
            else:
                current_token = output.argmax(1)

            timestep += 1

        return outputs

In [13]:
def calculate_accuracy_test(model, input_data, target_data, batch_size):
    correct_count = 0
    total_samples = len(input_data)
    predicted_words=[]
    for idx in range(0, total_samples, batch_size):
        input_batch = input_data[idx:idx + batch_size].to(device)
        target_batch = target_data[idx:idx + batch_size].to(device)

        output = model(input_batch.T, target_batch.T, teacher_force_ratio=0)
        predicted_tokens = torch.argmax(F.softmax(output, dim=2), dim=2).T
        
        for tokens in predicted_tokens:
            predicted_word = indices_to_words(tokens, "marathi")
            predicted_words.append(predicted_word)
            
        correct_count += torch.all(predicted_tokens[:, 1:] == target_batch[:, 1:], dim=1).sum().item()
        
    accuracy = correct_count * 100 / total_samples
    return accuracy,predicted_words


In [14]:
def calculate_accuracy(model, input_data, target_data, batch_size):
    correct_count = 0
    total_samples = len(input_data)
    for idx in range(0, total_samples, batch_size):
        input_batch = input_data[idx:idx + batch_size].to(device)
        target_batch = target_data[idx:idx + batch_size].to(device)

        output = model(input_batch.T, target_batch.T, teacher_force_ratio=0)
        predicted_tokens = torch.argmax(F.softmax(output, dim=2), dim=2).T
            
        correct_count += torch.all(predicted_tokens[:, 1:] == target_batch[:, 1:], dim=1).sum().item()
        
    accuracy = correct_count * 100 / total_samples
    return accuracy


In [15]:
def write_to_csv(actual_english, actual_marathi, predicted_marathi, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Actual English', 'Actual Marathi', 'Predicted Marathi'])
        for eng, mar_actual, mar_pred in zip(actual_english, actual_marathi, predicted_marathi):
            writer.writerow([eng, mar_actual, mar_pred])


In [16]:
def train_model(epochs, learning_rate, cell_type, bidirectional, enc_layers, dec_layers, batch_size, embedding_dim, hidden_size, enc_dropout, dec_dropout,attention):
    pad_idx = len(marathi_chars) + 1

    input_size_encoder = len(english_chars)
    input_size_decoder = len(marathi_chars)
    output_size = len(marathi_chars)
    input_size_encoder+=2
    input_size_decoder+=2
    output_size+=2

    encoder = Encoder(input_size_encoder, embedding_dim, hidden_size, enc_layers,batch_size, enc_dropout,bidirectional, cell_type).to(device)
    decoder= Decoder(input_size_decoder,embedding_dim,hidden_size,output_size,dec_layers,dec_dropout, cell_type,attention,bidirectional).to(device)

    model = Seq2SeqModel(output_size, cell_type, bidirectional, enc_layers, dec_layers ,encoder, decoder).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    for epoch in range(epochs):
        print("Epoch: ", epoch+1)

        model.train()
        total_loss = 0
        val_loss = 0
        step = 0
        total_batches = len(english_matrix) // batch_size

        for batch_idx in tqdm(range(total_batches)):
            start_idx = batch_size * batch_idx
            end_idx = batch_size * (batch_idx + 1)

            inp_data = english_matrix[start_idx:end_idx].to(device)
            target = marathi_matrix[start_idx:end_idx].to(device)
            target = target.T

            optimizer.zero_grad()
            output = model(inp_data.T, target)

            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            loss = criterion(output, target)
            total_loss += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            optimizer.step()

            step += 1

        with torch.no_grad():
            model.eval()
            val_batches = len(english_matrix_val) // batch_size
            for val_batch_idx in range(val_batches):
                val_start_idx = batch_size * val_batch_idx
                val_end_idx = batch_size * (val_batch_idx + 1)

                val_inp_data = english_matrix_val[val_start_idx:val_end_idx].to(device)
                val_target = marathi_matrix_val[val_start_idx:val_end_idx].to(device)
                val_target = val_target.T

                val_output = model(val_inp_data.T, val_target)
                val_output = val_output[1:].reshape(-1, val_output.shape[2])
                val_target = val_target[1:].reshape(-1)

                val_loss += criterion(val_output, val_target).item()

            val_loss /= val_batches
        training_accuracy = calculate_accuracy(model, english_matrix, marathi_matrix, batch_size)
        val_accuracy = calculate_accuracy(model, english_matrix_val, marathi_matrix_val, batch_size)
        wandb.log({
            "Epoch": epoch+1,
            "Loss": total_loss / step,
            "Accuracy": training_accuracy,
            "Val_Accuracy": val_accuracy,
            "Val_Loss": val_loss
        })
        print(f"Loss: {total_loss/step}\t Accuracy: {training_accuracy}\t Val_Accuracy: {val_accuracy}\t Val_Loss: {val_loss}")
    test_accuracy,predicted_words=calculate_accuracy_test(model, english_matrix_test, marathi_matrix_test, batch_size)
    wandb.log({"Test_Accuracy_Attention":test_accuracy})
    print(f"Test_Accuracy_Attention = {test_accuracy}")
    write_to_csv(english_test, marathi_test, predicted_words, '/kaggle/working/predictions.csv')    

In [17]:
# Define the sweep configuration
sweep_config = {
    "method": "bayes",
    'metric': {
        'name': 'Val_Accuracy',
        'goal': 'maximize'
    },
    "parameters": {
        "epochs": {"values": [15]},  # Define the hyperparameter search space
        "learning_rate": {"values": [1e-3]},
        "cell_type": {"values": ["LSTM"]},
        "bidirectional": {"values": [True]},
        "enc_layers": {"values": [1]},
        "dec_layers": {"values": [1]},
        "batch_size": {"values": [256]},
        "embedding_dim": {"values": [512]},
        "hidden_size": {"values": [384]},
        "enc_dropout": {"values": [0.2]},
        "dec_dropout": {"values": [0.1]},
        "attention": {"values": [True]}
    }
}

In [18]:
def main():
    # Initialize wandb
    wandb.init()
    config = wandb.config
    wandb.run.name = "_".join([f"{param}:{value}" for param, value in config.items()])
    train_model(**config)

# Initialize the sweep
sweep_id = wandb.sweep(sweep_config, project="deep_learn_assignment_3",entity="cs23m063")

# Run the sweep
wandb.agent(sweep_id, function=main,count=1)

Create sweep with ID: e6lbyd2m
Sweep URL: https://wandb.ai/cs23m063/deep_learn_assignment_3/sweeps/e6lbyd2m


[34m[1mwandb[0m: Agent Starting Run: vvtdh99r with config:
[34m[1mwandb[0m: 	attention: True
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_dropout: 0.1
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	embedding_dim: 512
[34m[1mwandb[0m: 	enc_dropout: 0.2
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_size: 384
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: Currently logged in as: [33mcs23m063[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch:  1


100%|██████████| 200/200 [00:16<00:00, 12.11it/s]


Loss: 1.1620930571854116	 Accuracy: 13.1015625	 Val_Accuracy: 13.818359375	 Val_Loss: 0.42545054480433464
Epoch:  2


100%|██████████| 200/200 [00:15<00:00, 12.97it/s]


Loss: 0.39389967039227486	 Accuracy: 30.232421875	 Val_Accuracy: 28.076171875	 Val_Loss: 0.2865810450166464
Epoch:  3


100%|██████████| 200/200 [00:15<00:00, 12.63it/s]


Loss: 0.27624174661934375	 Accuracy: 40.90625	 Val_Accuracy: 34.8876953125	 Val_Loss: 0.24545917473733425
Epoch:  4


100%|██████████| 200/200 [00:16<00:00, 12.36it/s]


Loss: 0.22588990472257137	 Accuracy: 46.384765625	 Val_Accuracy: 36.71875	 Val_Loss: 0.2569971838966012
Epoch:  5


100%|██████████| 200/200 [00:16<00:00, 12.25it/s]


Loss: 0.19586514294147492	 Accuracy: 49.30859375	 Val_Accuracy: 37.9638671875	 Val_Loss: 0.23181634861975908
Epoch:  6


100%|██████████| 200/200 [00:16<00:00, 12.23it/s]


Loss: 0.17363558892160655	 Accuracy: 53.20703125	 Val_Accuracy: 40.91796875	 Val_Loss: 0.2229575589299202
Epoch:  7


100%|██████████| 200/200 [00:16<00:00, 12.35it/s]


Loss: 0.15566334325820208	 Accuracy: 58.310546875	 Val_Accuracy: 42.1630859375	 Val_Loss: 0.22545903082937002
Epoch:  8


100%|██████████| 200/200 [00:16<00:00, 12.33it/s]


Loss: 0.14601604644209146	 Accuracy: 60.92578125	 Val_Accuracy: 42.919921875	 Val_Loss: 0.21415663417428732
Epoch:  9


100%|██████████| 200/200 [00:16<00:00, 12.33it/s]


Loss: 0.13037221353501083	 Accuracy: 62.71875	 Val_Accuracy: 43.26171875	 Val_Loss: 0.21943324711173773
Epoch:  10


100%|██████████| 200/200 [00:16<00:00, 12.32it/s]


Loss: 0.11920485649257899	 Accuracy: 65.79296875	 Val_Accuracy: 45.849609375	 Val_Loss: 0.22151593584567308
Epoch:  11


100%|██████████| 200/200 [00:16<00:00, 12.33it/s]


Loss: 0.11881094478070736	 Accuracy: 66.455078125	 Val_Accuracy: 45.0927734375	 Val_Loss: 0.2402782216668129
Epoch:  12


100%|██████████| 200/200 [00:16<00:00, 12.34it/s]


Loss: 0.10831052921712399	 Accuracy: 65.8359375	 Val_Accuracy: 44.580078125	 Val_Loss: 0.21053084824234247
Epoch:  13


100%|██████████| 200/200 [00:16<00:00, 12.33it/s]


Loss: 0.10027203606441618	 Accuracy: 72.32421875	 Val_Accuracy: 47.0458984375	 Val_Loss: 0.2166393892839551
Epoch:  14


100%|██████████| 200/200 [00:16<00:00, 12.35it/s]


Loss: 0.08905620243400335	 Accuracy: 73.935546875	 Val_Accuracy: 46.7529296875	 Val_Loss: 0.2210965408012271
Epoch:  15


100%|██████████| 200/200 [00:16<00:00, 12.32it/s]


Loss: 0.0835266475006938	 Accuracy: 75.62109375	 Val_Accuracy: 47.4365234375	 Val_Loss: 0.2262528920546174
Test_Accuracy_Attention = 41.7724609375


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Accuracy,▁▃▄▅▅▅▆▆▇▇▇▇███
Epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
Loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁
Test_Accuracy_Attention,▁
Val_Accuracy,▁▄▅▆▆▇▇▇▇██▇███
Val_Loss,█▃▂▃▂▁▁▁▁▁▂▁▁▁▂

0,1
Accuracy,75.62109
Epoch,15.0
Loss,0.08353
Test_Accuracy_Attention,41.77246
Val_Accuracy,47.43652
Val_Loss,0.22625
