In [2]:
import pdb
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import sentencepiece as spm
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch
import random

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
nepali_tokenizer = spm.SentencePieceProcessor("spm_files/nepali_tokenizer.model")
roman_tokenizer = spm.SentencePieceProcessor("spm_files/roman_tokenizer.model")

In [5]:
#@title loading data
def file_to_df(filename):
  train_data = open(filename,"r",encoding="utf8").read()
  train_data_lines = train_data.split("\n")
  input_data = []
  target_data=[]
  for line in train_data_lines:
    input,target = line.split("\t")
    input_data.append(input)
    target_data.append(target)

  return pd.DataFrame({"input":input_data,"target":target_data})

train_data_df = file_to_df("very_small_train_data.txt")
valid_data_df=file_to_df("very_small_valid_data.txt")

PAD_TOKEN = nepali_tokenizer.pad_id()
def pad_and_remove_longer_tokens(row):
  input = row.input
  target = row.target
  if(len(input) <=100 and len(target)<=100):
        input = input + [PAD_TOKEN] * (100 - len(input))
        target = target + [PAD_TOKEN] * (100 - len(target))
        return input,target
  else:
    return None


EOS_TOKEN = nepali_tokenizer.eos_id()
def tokenize_in_roman(data):
  return roman_tokenizer.Encode(data)

def tokenize_in_nepali(data):
  return nepali_tokenizer.Encode(data) + [EOS_TOKEN]


train_input_tokens = train_data_df["input"].apply(tokenize_in_roman)
train_target_tokens = train_data_df["target"].apply(tokenize_in_nepali)
train_tokens_df = pd.DataFrame({"input":train_input_tokens,"target":train_target_tokens})

valid_input_tokens=valid_data_df["input"].apply(tokenize_in_roman)
valid_target_tokens=valid_data_df["target"].apply(tokenize_in_nepali)
valid_tokens_df = pd.DataFrame({"input":valid_input_tokens,"target":valid_target_tokens})

train_short_tokens=train_tokens_df.apply(pad_and_remove_longer_tokens,axis=1)
train_short_tokens = train_short_tokens.dropna()
train_short_tokens_array = train_short_tokens.values

valid_short_tokens=valid_tokens_df.apply(pad_and_remove_longer_tokens,axis=1)
valid_short_tokens = valid_short_tokens.dropna()
valid_short_tokens_array = valid_short_tokens.values

train_small_tokens_input_list=[]
train_small_tokens_target_list=[]
for value in train_short_tokens_array:
  train_small_tokens_input_list.append(value[0])
  train_small_tokens_target_list.append(value[1])

train_small_tokens_df = pd.DataFrame({"input":train_small_tokens_input_list,"target":train_small_tokens_target_list})

valid_small_tokens_input_list=[]
valid_small_tokens_target_list=[]
for value in valid_short_tokens_array:
  valid_small_tokens_input_list.append(value[0])
  valid_small_tokens_target_list.append(value[1])

valid_small_tokens_df = pd.DataFrame({"input":valid_small_tokens_input_list,"target":valid_small_tokens_target_list})
class trainDataset(Dataset):
  def __init__(self,df):
    super().__init__()
    self.input = df["input"].values
    self.target=df["target"].values

  def __len__(self):
    return len(self.input)

  def __getitem__(self, index):
    return (torch.tensor(self.input[index]),torch.tensor(self.target[index]))

def split_dataset(df):
  train_data ,test_data = train_test_split(train_small_tokens_df,train_size=0.8,shuffle=True)
  train_data=train_data.reset_index()
  test_data=test_data.reset_index()

  train_dataset = trainDataset(train_data)
  test_dataset = trainDataset(test_data)

  return train_dataset,test_dataset

def split_dataloader(train_dataset,test_dataset,batch_size=64):
  train_dataloader = DataLoader(train_dataset,batch_size=batch_size)
  test_dataloader = DataLoader(test_dataset,batch_size=batch_size,shuffle=True)
  return train_dataloader,test_dataloader



In [6]:
train_dataset , test_dataset = split_dataset(train_small_tokens_df)
valid_dataset =  trainDataset(valid_small_tokens_df)


train_dataloader, test_dataloader = split_dataloader(train_dataset,test_dataset)
valid_dataloader = DataLoader(valid_dataset,batch_size=64,shuffle=True)

In [7]:
MAX_LENGTH = 100
VOCAB_SIZE = 1000

In [8]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size,hidden_size,dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [9]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

In [10]:
SOS_TOKEN= nepali_tokenizer.bos_id()
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size,dropout_p=0.1):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2*hidden_size, hidden_size,batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None,teacher_forcing_ratio=0.5):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long,device=device).fill_(SOS_TOKEN)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights  = self.forward_step(decoder_input, decoder_hidden, encoder_outputs)
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None and random.random() < teacher_forcing_ratio:
                # Teacher forcing: Feed the target as the next input
                  decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)
        return decoder_outputs, decoder_hidden, attentions # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden,encoder_outputs):
        embedded = self.dropout(self.embedding(input))
        query = hidden.permute(1,0,2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)
        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)
        # breakpoint()
        return output, hidden, attn_weights

In [11]:
encoder = EncoderRNN(VOCAB_SIZE, 256).to(device)
decoder = DecoderRNN(256, VOCAB_SIZE).to(device)

In [12]:
optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [13]:
def train_step(input_tensor, target_tensor, encoder, decoder, optimizer, criterion, teacher_forcing_ratio=0):
    encoder_hidden = None  # Initial hidden state is None for GRU

    optimizer.zero_grad()

    input_tensor = input_tensor.to(device)
    target_tensor = target_tensor.to(device)

    encoder_outputs, encoder_hidden = encoder(input_tensor)

    decoder_output, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

    loss = criterion(decoder_output.view(-1, VOCAB_SIZE), target_tensor.view(-1))

    loss.backward()
    optimizer.step()

    return loss.item()


In [14]:
def train(encoder, decoder, train_dataloader, optimizer, criterion, num_epochs=10):
    for epoch in range(num_epochs):
        total_loss = 0
        for input_tensor, target_tensor in train_dataloader:
            loss = train_step(input_tensor, target_tensor, encoder, decoder, optimizer, criterion)
            total_loss += loss

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")


In [17]:
def evaluate(encoder, decoder, dataloader, criterion):
    encoder.eval()
    decoder.eval()

    total_loss = 0
    with torch.no_grad():
        for input_tensor, target_tensor in dataloader:
            input_tensor = input_tensor.to(device)
            target_tensor = target_tensor.to(device)

            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_output, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

            loss = criterion(decoder_output.view(-1, VOCAB_SIZE), target_tensor.view(-1))
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Validation Loss: {avg_loss:.4f}")

    encoder.train()
    decoder.train()


In [None]:
num_epochs = 30  # Adjust as needed

for epoch in range(num_epochs):
    train(encoder, decoder, train_dataloader, optimizer, criterion, num_epochs=1)
    evaluate(encoder, decoder, valid_dataloader, criterion)


tensor([[[-0.0515, -0.3681,  0.4504,  ...,  0.0201,  0.0135,  0.2471]],

        [[-0.2960, -0.3298,  0.1043,  ..., -0.7728,  0.5003, -0.1448]],

        [[-0.7270,  0.0708, -0.4237,  ...,  0.4888,  0.5825, -0.0470]],

        ...,

        [[ 0.6543, -0.4009, -0.2980,  ..., -0.3993, -0.0213, -0.5545]],

        [[ 0.3581, -0.7107,  0.2262,  ...,  0.7035, -0.2542,  0.5541]],

        [[ 0.5336, -0.0025,  0.2920,  ..., -0.4206, -0.0582, -0.1488]]],
       grad_fn=<BmmBackward0>)


In [20]:
# torch.save(encoder.state_dict(), "/content/drive/MyDrive/very_small_encoder_bpe.pth")
# torch.save(decoder.state_dict(), "/content/drive/MyDrive/very_small_decoder_bpe.pth")

Loading saved model

In [21]:
encoder.load_state_dict(torch.load("very_small_encoder_bpe.pth",map_location=torch.device('cpu')))
decoder.load_state_dict(torch.load("very_small_decoder_bpe.pth",map_location=torch.device("cpu")))

<All keys matched successfully>

In [22]:
sent = "timi ko ho"
tokens = roman_tokenizer.Encode(sent)
input_tensor = torch.tensor(tokens).unsqueeze(0).to(device)
encoder_outputs, encoder_hidden = encoder(input_tensor)
decoder_output, _, _ = decoder(encoder_outputs, encoder_hidden)
predicted_indices = torch.argmax(decoder_output[0], dim=1)
predicted_tokens = nepali_tokenizer.Decode(predicted_indices.tolist())
predicted_tokens

'ती ती ती ती हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो हो'

In [22]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

In [32]:
def showAttention(input_sentence, output_words, attentions):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.cpu().numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(encoder, decoder, valid_dataloader,criterion)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions[0, :len(output_words), :])

In [26]:
sent = "timro ghar kaha thiyo malai nasodha"
tokens = roman_tokenizer.Encode(sent) 
tokens= tokens + [PAD_TOKEN]*(100-len(tokens))
input_tensor = torch.tensor(tokens).unsqueeze(0).to(device)
encoder_outputs, encoder_hidden = encoder(input_tensor)
decoder_output, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)
_, topi = decoder_output[0].topk(1)
decoded_ids = topi.squeeze()
decoded_tokens = nepali_tokenizer.Decode(decoded_ids.tolist())
decoded_tokens

'तिम्रो घर कहाँ थियो मलाई नसोध'