In [3]:
!pip install pandas openpyxl



## Removing NAN columns

In [4]:
import pandas as pd
df = pd.read_excel("parallel-corpus.xlsx")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.to_excel("cleaned_file.xlsx", index=False)
print("Unnamed columns removed and file saved as 'cleaned_file.xlsx'.")

Unnamed columns removed and file saved as 'cleaned_file.xlsx'.


In [5]:
df.head(15)

Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...
5,To help young people get solid advice they can...,نوجوانوں کو ٹھوس مشورے حاصل کرنے میں مدد کرنے ...
6,"in January1982. Decades later, the series stil...",8 جنوری 1982۔ دہائیوں کے بعد، سیریز اب بھی ایک...
7,Each article is the product of extensive resea...,درحقیقت، اس بات کا تعین کرنے کے لیے کہ نوجوان ...
8,The book you now hold was originally published...,جو کتاب آپ کے پاس ہے وہ اصل میں 1989 میں شائع ...
9,"However, the chapters have been completely rev...",...


In [6]:
!pip install LughaatNLP



In [7]:
import pandas as pd
from tqdm import tqdm
import multiprocessing
from joblib import Parallel, delayed
import LughaatNLP

## removing punctuation

In [8]:
import re

# Urdu punctuations
URDU_PUNCTUATIONS = ['\u200F', '\u200f', '۔', '٫', '٪', '؟', '،', ')', '(', '{', '}', '…', '...', '۔۔۔', '\u002F', '\u003F', '.']

def removing_punctuations(text):
    for punct in URDU_PUNCTUATIONS:
        text = text.replace(punct, " ")
    return text

for index, row in df.iterrows():
    urdu_text = row['MEANING']
    if isinstance(urdu_text, str):
        cleaned_text = removing_punctuations(urdu_text)
        df.at[index, 'MEANING'] = cleaned_text
    else:
        df.at[index, 'MEANING'] = ""
df.to_excel("cleaned_file.xlsx", index=False)
df.head(15)


Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں
1,How can I make friends?’,میں دوست کیسے بنائوں
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...
5,To help young people get solid advice they can...,نوجوانوں کو ٹھوس مشورے حاصل کرنے میں مدد کرنے ...
6,"in January1982. Decades later, the series stil...",8 جنوری 1982 دہائیوں کے بعد سیریز اب بھی ایک...
7,Each article is the product of extensive resea...,درحقیقت اس بات کا تعین کرنے کے لیے کہ نوجوان ...
8,The book you now hold was originally published...,جو کتاب آپ کے پاس ہے وہ اصل میں 1989 میں شائع ...
9,"However, the chapters have been completely rev...",...


In [9]:
import pandas as pd
df = pd.read_excel('cleaned_file.xlsx')
print(df.columns)

Index(['SENTENCES ', 'MEANING'], dtype='object')


In [10]:
import pandas as pd
import string
df = pd.read_excel('cleaned_file.xlsx')
df.columns = df.columns.str.strip()
print(df.columns)

def remove_punctuation(text):
    if isinstance(text, str):
        return text.translate(str.maketrans('', '', string.punctuation))
    return text
df['SENTENCES'] = df['SENTENCES'].astype(str).apply(remove_punctuation)
df.to_excel('cleaned_file.xlsx', index=False)
print("Punctuation removed and file saved as 'cleaned_file.xlsx'.")


Index(['SENTENCES', 'MEANING'], dtype='object')
Punctuation removed and file saved as 'cleaned_file.xlsx'.


In [11]:
df.head()

Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents,میں اپنے والدین سے کیسے بات کروں
1,How can I make friends’,میں دوست کیسے بنائوں
2,Why do I get so sad’,میں اتنا اداس کیوں ہوں
3,If you’ve asked yourself such questions you’re...,اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں تو آ...
4,Depending on where you’ve turned for guidance ...,اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


## Removing Short Conversation

In [12]:
def has_few_words(text: str):
    words = text.split()
    return len(words) <= 3
df = df[~df['MEANING'].apply(lambda text: has_few_words(text) if isinstance(text, str) else True)]
df.to_excel('cleaned_file.xlsx', index=False)

In [13]:
df.head()

Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents,میں اپنے والدین سے کیسے بات کروں
1,How can I make friends’,میں دوست کیسے بنائوں
2,Why do I get so sad’,میں اتنا اداس کیوں ہوں
3,If you’ve asked yourself such questions you’re...,اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں تو آ...
4,Depending on where you’ve turned for guidance ...,اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


In [14]:
import pandas as pd
df = pd.read_excel('cleaned_file.xlsx')
df.columns = df.columns.str.strip()
def remove_short_sentences(text, min_length=5):
    if isinstance(text, str):
        return text if len(text.split()) >= min_length else None
    return text
df['SENTENCES'] = df['SENTENCES'].apply(remove_short_sentences)
df = df.dropna(subset=['SENTENCES'])
df.to_excel('cleaned_file.xlsx', index=False)
print("Short sentences removed and file saved as 'cleaned_file.xlsx'.")

Short sentences removed and file saved as 'cleaned_file.xlsx'.


## Tokenization

In [15]:
import pandas as pd
import LughaatNLP

df = pd.read_excel('cleaned_file.xlsx')

urdu_text_processing = LughaatNLP.LughaatNLP()
df['tokenized_text'] = None
def tokenize_and_update(df):
    for index, row in df.iterrows():
        tokens = urdu_text_processing.urdu_tokenize(row['MEANING'])
        df.at[index, 'tokenized_text'] = tokens
tokenize_and_update(df)
df.to_excel('cleaned_file.xlsx', index=False)
print(df[['SENTENCES', 'MEANING', 'tokenized_text']].head())


                                           SENTENCES  \
0              How can I communicate with my parents   
1                            How can I make friends’   
2                               Why do I get so sad’   
3  If you’ve asked yourself such questions you’re...   
4  Depending on where you’ve turned for guidance ...   

                                             MEANING  \
0                 میں اپنے والدین سے کیسے بات کروں     
1                             میں دوست کیسے بنائوں     
2                           میں اتنا اداس کیوں ہوں     
3  اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں  تو آ...   
4   اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...   

                                      tokenized_text  
0           [میں, اپنے, والدین, سے, کیسے, بات, کروں]  
1                          [میں, دوست, کیسے, بنائوں]  
2                       [میں, اتنا, اداس, کیوں, ہوں]  
3  [اگر, آپ, نے, اپنے, آپ, سے, ایسے, سوالات, کیے,...  
4  [اس, بات, پر, منحصر, ہے, کہ, آپ, رہنمائی, کے, ..

In [16]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
df = pd.read_excel('cleaned_file.xlsx')
df['tokenized_sentences'] = None
def tokenize_sentences(df):
    for index, row in df.iterrows():
        tokens = word_tokenize(row['SENTENCES'])
        df.at[index, 'tokenized_sentences'] = tokens

tokenize_sentences(df)
df.to_excel('cleaned_file.xlsx', index=False)
print(df[['SENTENCES', 'tokenized_sentences']].head())


                                           SENTENCES  \
0              How can I communicate with my parents   
1                            How can I make friends’   
2                               Why do I get so sad’   
3  If you’ve asked yourself such questions you’re...   
4  Depending on where you’ve turned for guidance ...   

                                 tokenized_sentences  
0      [How, can, I, communicate, with, my, parents]  
1                    [How, can, I, make, friends, ’]  
2                      [Why, do, I, get, so, sad, ’]  
3  [If, you, ’, ve, asked, yourself, such, questi...  
4  [Depending, on, where, you, ’, ve, turned, for...  


In [18]:
!pip install torch



In [19]:
# Load the data from the Excel file
df = pd.read_excel('cleaned_file.xlsx')

# Print the column names to check for mismatches
print("Columns in DataFrame:", df.columns.tolist())


Columns in DataFrame: ['SENTENCES', 'MEANING', 'tokenized_text', 'tokenized_sentences']


In [3]:
# Define necessary imports
import pandas as pd
import unicodedata
import re
import random

SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

# Turn a Unicode string to plain ASCII
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    return s

# Read language data from the DataFrame
def readLangs(df, reverse=False):
    print("Reading lines...")
    pairs = []

    for _, row in df.iterrows():
        input_sentence = normalizeString(row['SENTENCES'])  # Use the correct column name
        output_sentence = normalizeString(row['MEANING'])   # Use the correct column name
        pairs.append([input_sentence, output_sentence])

    print("Lines read successfully")
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]

    return pairs

# Filter pairs based on length
MAX_LENGTH = 30

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
           len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]  # Apply filterPair to filter pairs

# Prepare the data
def prepareData(df, reverse=False):
    pairs = readLangs(df, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    input_lang = Lang('English')
    output_lang = Lang('Other Language')

    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

# Load the data from the Excel file
df = pd.read_excel('cleaned_file.xlsx')

# Prepare the data
input_lang, output_lang, pairs = prepareData(df, reverse=False)

# Ensure pairs is a list of lists

# Print a random sample of the pairs
print(random.choice(pairs))


Reading lines...
Lines read successfully
Read 22037 sentence pairs
Trimmed to 17817 sentence pairs
Counting words...
Counted words:
English 11536
Other Language 9747
['don’t paint the town red', 'شہر کو سرخ نہ رنگیں']


In [21]:
for pair in pairs:
  print(pair)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['my second time delicious food do try the fillet', 'میری دوسری بار  سوادج کھانا  فلیٹ ازمايیں']
['best pocket friendly restaurant with finger licking food\nburgers and wings were the best', 'انگلی چاٹنے والے کھانے کے ساتھ بہترین پاکٹ فرینڈلی ریستوراں برگر اور پنکھ بہترین تھے']
['best gourmet burgers in town ❤', 'شہر میں بہترین نفیس برگر  ❤']
['awesome burgerstruly deserves a 5 star', 'بہت اچھے برگر  حقیقت میں 5 اسٹار کے مستحق ہیں ! !']
['excellent taste perfect ambiance love the interior\nfood quality is superb and unbelievably cheap', 'بہترین ذايقہ  کامل ماحول  داخلہ سے محبت کرتا ہوں  کھانے کا معیار شاندار ہے  اور ناقابل یقین حد تک سستا']
['best food quality in reasonable price', 'مناسب قیمت میں کھانے کا بہترین معیار']
['taste is very good but service is unsatisfied', 'ذايقہ بہت اچھا ہے لیکن سروس غیر مطمين ہے']
['mutton karahi is very special also lighting is even batter', 'مٹن کرہی بہت خاص ہے اور لايٹنگ بھی بیٹر ہے']
[

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Encoder RNN
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)  # (1, 1, hidden_size)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Decoder with Attention
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = torch.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = torch.relu(output)
        output, hidden = self.gru(output, hidden)

        output = torch.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Hyperparameters
hidden_size = 256
input_lang_size = input_lang.n_words  # Number of words in input language vocabulary
output_lang_size = output_lang.n_words  # Number of words in output language vocabulary


In [23]:
# Convert sentence to tensor of indexes
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

# Training iteration
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # Detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


In [24]:
import time
teacher_forcing_ratio=0.3
def asMinutes(s):
    m = s // 60
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):
    start = time.time()
    print_loss_total = 0  # Reset every print_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg))

# Initialize encoder and decoder
encoder = EncoderRNN(input_lang_size, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang_size, dropout_p=0.1).to(device)

# Train
trainIters(encoder, decoder, 75000, print_every=5000)


KeyboardInterrupt: 

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] = encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS token as first input
        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data

            topv, topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]


In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('Input:', pair[0])
        print('Target:', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('Predicted:', output_sentence)
        print('')


In [None]:
# Evaluate the model on random pairs from the dataset
evaluateRandomly(encoder, decoder)



In [8]:
# Ensure that the necessary imports are included
import unicodedata
import re

# Normalize function to match the training data processing
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"[^\w\s]", "", s)  # Remove punctuation
    return s
def indexesFromSentence(lang, sentence):
    return [lang.word2index.get(word, None) for word in sentence.split(' ') if word in lang.word2index]


def evaluateCustomSentence(sentence):
    # Normalize the input sentence
    normalized_sentence = normalizeString(sentence)

    # Convert the normalized sentence to tensor of indexes
    input_tensor = tensorFromSentence(input_lang, normalized_sentence)

    # Check for None values
    if any(index is None for index in input_tensor):
        print("Input contains unknown words. Please try a different sentence.")
        return

    output_words, attentions = evaluate(encoder, decoder, normalized_sentence)
    output_sentence = ' '.join(output_words)
    print('Input:', normalized_sentence)
    print('Predicted:', output_sentence)

# Test with your own sentence
evaluateCustomSentence("this is a book")
evaluateCustomSentence("i love her")
evaluateCustomSentence("Ass")

Input: this is a book
Predicted: یہ ایک کتاب ہے <EOS>
Input: i love her
Predicted: میں نے میں سے <EOS>
Input: ass
Predicted: میں سے <EOS>


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Encoder LSTM
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden, cell):
        embedded = self.embedding(input).view(1, 1, -1)  # (1, 1, hidden_size)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        return output, hidden, cell

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

    def initCell(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

MAX_LENGTH=30
# Decoder with Attention
class AttnDecoderLSTM(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, cell, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = torch.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output, (hidden, cell) = self.lstm(output, (hidden, cell))

        output = torch.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, cell, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

    def initCell(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


# Hyperparameters
hidden_size = 256
input_lang_size = input_lang.n_words  # Number of words in input language vocabulary
output_lang_size = output_lang.n_words  # Number of words in output language vocabulary

# Convert sentence to tensor of indexes
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

# Training iteration
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    encoder_cell = encoder.initCell()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden, encoder_cell = encoder(input_tensor[ei], encoder_hidden, encoder_cell)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden
    decoder_cell = encoder_cell

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_cell, _ = decoder(decoder_input, decoder_hidden, decoder_cell, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_cell, _ = decoder(decoder_input, decoder_hidden, decoder_cell, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # Detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

import time
teacher_forcing_ratio = 0.18

def asMinutes(s):
    m = s // 60
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):
    start = time.time()
    print_loss_total = 0  # Reset every print_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg))

# Initialize encoder and decoder
encoder = EncoderLSTM(input_lang_size, hidden_size).to(device)
decoder = AttnDecoderLSTM(hidden_size, output_lang_size, dropout_p=0.1).to(device)

# Train
trainIters(encoder, decoder, 75000, print_every=5000)


2m 38s (- 36m 54s) (5000 6%) 4.1519
5m 19s (- 34m 39s) (10000 13%) 4.1373
8m 15s (- 33m 1s) (15000 20%) 4.3799
11m 17s (- 31m 3s) (20000 26%) 4.3126
14m 19s (- 28m 39s) (25000 33%) 4.1827
17m 22s (- 26m 4s) (30000 40%) 4.0525
20m 29s (- 23m 25s) (35000 46%) 4.0403
23m 35s (- 20m 38s) (40000 53%) 3.9024
26m 38s (- 17m 45s) (45000 60%) 3.8002
29m 42s (- 14m 51s) (50000 66%) 3.7711
32m 49s (- 11m 56s) (55000 73%) 3.7211
35m 54s (- 8m 58s) (60000 80%) 3.6277
39m 0s (- 6m 0s) (65000 86%) 3.5809
42m 6s (- 3m 0s) (70000 93%) 3.5518
45m 15s (- 0m 0s) (75000 100%) 3.4956


In [5]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]

        # Initialize hidden and cell states for encoder
        encoder_hidden = encoder.initHidden()
        encoder_cell = encoder.initCell()

        # Initialize encoder outputs to store encoder's hidden states
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        # Pass through the encoder
        for ei in range(input_length):
            encoder_output, encoder_hidden, encoder_cell = encoder(
                input_tensor[ei], encoder_hidden, encoder_cell
            )
            encoder_outputs[ei] = encoder_output[0, 0]

        # Start decoder with SOS token
        decoder_input = torch.tensor([[SOS_token]], device=device)

        # Decoder hidden and cell states are initialized from encoder's final states
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        # Generate output sequence using the decoder
        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_cell, decoder_attention = decoder(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs
            )
            decoder_attentions[di] = decoder_attention.data

            # Get top predicted word
            topv, topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            # Use the predicted word as the next input
            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]


In [6]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('Input:', pair[0])
        print('Target:', pair[1])

        # Ensure that the correct states are passed during evaluation
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)

        print('Predicted:', output_sentence)
        print('')


In [12]:
evaluateCustomSentence("i love you")


Input: i love you
Predicted: میں تم سے <EOS>
