# Basic Imports and Requirements

In [None]:
import os
import torch
import torch.nn as nn                # This is for importing Neural networks
from torch import optim              # Importing Optimisers
import torch.nn.functional as F      # Activation functions ,such as Relu, Softmax etc
import csv                           # Importing CSV reader and writer
import random                        # This is to create random numbers
import re                            # Support for regular expressions (RE)
import unicodedata                   # This module provides access to the Unicode Character Database which defines character properties for all Unicode characters.
import itertools                     # Functional tools for creating and using iterators. Infinite iterators:
import codecs                        # codecs -- Python Codec Registry
import numpy                         # for numpy and in-place operation
import torchvision
import matplotlib.pyplot as plt      # For visualisation
from google.colab import files       # Importing files to google colab
import pandas as pd                  # Import txt files
import tqdm                          # progress bar for checking progress


In [None]:
# Select the compute device: use the GPU when CUDA is available, otherwise fall back to the CPU

isCUDA = torch.cuda.is_available()
if isCUDA:
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(isCUDA , "," , device)

True , cuda


# Preprocessing Section

## Data Understanding

Snippet of the corpus:

**raw_script_urls.txt** :
m450 +++$+++ my girl 2 +++$+++ http://www.scifiscripts.com/msol/my_girl_2.html
<br>

**movie_characters_metadata.txt** :
u6745 +++$+++ BOY +++$+++ m450 +++$+++ my girl 2 +++$+++ ? +++$+++ ? <br>

**movie_titles_metadata.txt** :
<br>
m450 +++$+++ my girl 2 +++$+++ 1994 +++$+++ 4.80 +++$+++ 5689 +++$+++ ['comedy', 'drama', 'family', 'romance']

**movie_conversations.txt**
<br>
u6745 +++$+++ u6760 +++$+++ m450 +++$+++ ['L403280', 'L403281', 'L403282', 'L403283', 'L403284', 'L403285', 'L403286', 'L403287', 'L403288']

u6745 +++$+++ u6760 +++$+++ m450 +++$+++ ['L403289', 'L403290', 'L403291', 'L403292']

**movie_lines.txt**
<br>
L403280 +++$+++ u6760 +++$+++ m450 +++$+++ VADA +++$+++ How did you know my name?<br>
L403281 +++$+++ u6745 +++$+++ m450 +++$+++ BOY +++$+++ Your Uncle Phil told me. <br>
L403282 +++$+++ u6760 +++$+++ m450 +++$+++ VADA +++$+++ Where is he?? He was supposed to meet me.<br>
L403283 +++$+++ u6745 +++$+++ m450 +++$+++ BOY +++$+++ Hey relax, you think I kidnapped him or something?<br>
L403284 +++$+++ u6760 +++$+++ m450 +++$+++ VADA +++$+++ This is California, anything is possible.<br>
L403285 +++$+++ u6745 +++$+++ m450 +++$+++ BOY +++$+++ Well if I was looking for a victim, I definitely wouldn't pick your Uncle Phil who outweighs me by about 150 pounds, besides, who would I ask for ransom? You??<br>
L403286 +++$+++ u6760 +++$+++ m450 +++$+++ VADA +++$+++ Are you suffering from a chemical imbalance or is it just an attitude problem.<br>
L403287 +++$+++ u6745 +++$+++ m450 +++$+++ BOY +++$+++ My only problem is that your Uncle Phil is giving me five bucks to pick you up but I don't get paid 'till delivery.<br>
L403288 +++$+++ u6760 +++$+++ m450 +++$+++ VADA +++$+++ Gee, that is a problem.<br>
L403289 +++$+++ u6760 +++$+++ m450 +++$+++ VADA +++$+++ Put that down, I'll...I'll call the police!<br>
L403290 +++$+++ u6745 +++$+++ m450 +++$+++ BOY +++$+++ What are you gonna do? Tell them that...a polite person helped carry your bag?<br>
L403291 +++$+++ u6760 +++$+++ m450 +++$+++ VADA +++$+++ I don't think you're very polite.<br>
L403292 +++$+++ u6745 +++$+++ m450 +++$+++ BOY +++$+++ Yeah, well I don't think you're very grateful. A lot of people in your position would say &quot;thank you&quot;.<br>


## Importing Data and Analysis

In [None]:
# Code for mounting operations to Google Colab

# from google.colab import drive
# drive.mount('/content/drive/MyDrive')

In [None]:
Conversation_data = '/content/drive/My Drive/Datasets_Data_Science/cornell movie-dialogs corpus/movie_conversations.txt'

# Preview the first 10 conversation records.
# The encoding is given explicitly so this cell decodes the corpus the same way as the
# parsing cell further down, instead of depending on the platform's default encoding.
with open(Conversation_data , encoding="ISO-8859-1" , mode='r') as con_file:
  Con_lines = con_file.readlines()

for Con_line in Con_lines[:10]:
  print(Con_line.strip())                # drop the trailing newline and surrounding spaces before printing

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']


In [None]:
Lines_data = "/content/drive/My Drive/Datasets_Data_Science/cornell movie-dialogs corpus/movie_lines.txt"

# Challenges_1 : If you get "'utf-8' codec can't decode byte" error use encoding as "ISO-8859-1" encoding.
with open(Lines_data , mode='r' , encoding="ISO-8859-1") as Li_file:
  Lines_lines = Li_file.readlines()

# Preview the first 10 raw records with surrounding whitespace removed
for Li_lines in Lines_lines[:10]:
  print(Li_lines.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No
L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I'm kidding.  You know how sometimes you just become this "persona"?  And you don't know how to quit?
L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?


In [None]:
# with open(Lines_data , encoding="ISO-8859-1" , mode='r') as f:
#   for Lines_line in f:
#     values = Lines_line.split(" +++$+++ ")
#     print(values[:1])

In [None]:
# line_fields = ["lineID" , "characterID" , "movieID", "character" , "text"]
# values = ["L1045", "u0", "m0", "BIANCA" ,  "They do to!"]

# obj= {}
# for i, field in enumerate(line_fields):
#   obj[field] = values[i]

# print(obj)

In [None]:
# Parse "movie_lines.txt" into a dictionary keyed by lineID. Every value is itself a dictionary of the
# record's fields (lineID , characterID , movieID , character , text), so the " +++$+++ " separator is dropped.

line_fields = ["lineID" , "characterID" , "movieID", "character" , "text"]

Li_lines = {}

with open(Lines_data , encoding="ISO-8859-1" , mode='r') as f:
  for Lines_line in f:
    values = Lines_line.split(" +++$+++ ")

    # Pair every field name with the value found at the same position in the record
    lineobj = {field: values[i] for i , field in enumerate(line_fields)}

    # Key is the line ID and value is the whole record (the text still carries its trailing newline)
    Li_lines[lineobj["lineID"]] = lineobj

In [None]:
# dict(list(Li_lines.items())[0:2])  # printing just 2 values to check , as we get "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)" error in Google colab

In [None]:
# Li_lines["L194"]

In [None]:
# Parse "movie_conversations.txt" into a list of dictionaries with the fields
# (Character_1ID , Character_2ID , movieID , ConversationID), dropping the " +++$+++ " separator,
# and attach the full line records of every conversation under the extra key "lines".

import ast                               # literal_eval: safe parsing of the line-ID list stored as text

Conv_lables = ["Character_1ID","Character_2ID" , "movieID" , "ConversationID" ]

Converse_list = []

with open(Conversation_data , encoding="ISO-8859-1" , mode="r") as conversations:
  for conv in conversations:
    Converse = conv.split(" +++$+++ ")

    # Extracting values for labels
    Con_obj = {}
    for i, item in enumerate(Conv_lables):
      Con_obj[item] = Converse[i]

    # Con_obj["ConversationID"] is the text "['L666520', 'L666521', 'L666522']\n".
    # ast.literal_eval accepts Python literals only, so unlike eval() it cannot execute
    # code that might be embedded in the data file.
    lineIds = ast.literal_eval(Con_obj["ConversationID"].strip())
    # output: ['L666520', 'L666521', 'L666522']

    # Reassemble lines: look up every line ID in Li_lines and store the records under the new key "lines"
    Con_obj["lines"] = [Li_lines[lineId] for lineId in lineIds]

    Converse_list.append(Con_obj)       # one finished conversation record

# Converse_list[0:1]

In [None]:
print("length of the lines" , len(Converse_list[0]["lines"])) # length of every element in lines block

length of the lines 4


In [None]:
print(Converse_list[0]["lines"][0]["text"].strip())  # Our goal is to extract all the text from the lists / elements in the lines block
print(Converse_list[0]["lines"][1]["text"].strip())

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well, I thought we'd start with pronunciation, if that's okay with you.


In [None]:
# Extract (question , answer) sentence pairs from every conversation

qa_pair = []

for conversation in Converse_list:
  spoken = conversation["lines"]

  # Walk over consecutive lines: each line is a question and the line after it is its answer
  for current , following in zip(spoken , spoken[1:]):
    question = current["text"].strip()
    answer = following["text"].strip()

    # Filter condition: skip the pair when the question or the answer is empty
    if not question or not answer:
      continue
    qa_pair.append([question , answer])


In [None]:
qa_pair[0:10]

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'],
 ['Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"],
 ["You're asking me out.  That's so cute. What's your name again?",
  'Forget it.'],
 ["No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.'],
 ['Cameron.',
  "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does."],
 ["The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
  'Seems like she could get a date easy enough...'],
 [

In [None]:
# Define path to write qa_pair to new file

New_path = '/content/drive/My Drive/Datasets_Data_Science/cornell movie-dialogs corpus/New_formatted_Movie_conversations.txt'  # Saving the file as a new formatted file

delimiter = '\t'

# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Writing the txt file: one tab-separated "question <TAB> answer" record per row.
# newline="" hands line-ending control to the csv module, as the csv documentation requires;
# without it, platforms that translate "\n" on write would add an extra "\r" to every row.
with open(New_path , encoding="utf-8" , mode="w" , newline="") as Outfile:

  writer = csv.writer(Outfile, delimiter = delimiter)

  for pair in tqdm.tqdm(qa_pair):
    writer.writerow(pair)
    

100%|██████████| 221282/221282 [00:00<00:00, 268284.94it/s]


## Importing new Formatted File

In [None]:
# Check that the formatted file was written properly

New_formatted = '/content/drive/My Drive/Datasets_Data_Science/cornell movie-dialogs corpus/New_formatted_Movie_conversations.txt'

with open(New_formatted ,mode="rb") as formatted:
  lines = formatted.readlines()

# Print the first 8 records as raw bytes so the tab separator and the line ending stay visible
for line in lines[:8]:
  print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\n"
b'Why?\tUnsolved myster

In [None]:
# We will be creating the word vocabulary for the newly formatted file

PAD_TOKEN = 0  # Used for padding short sentences, as at the end we are going to use a fixed size length
SOS_TOKEN = 1  # Start of sentence token <START>
EOS_TOKEN = 2  # End of sentence token <END>

class Vocabulary:
  """Word <-> index mapping with word frequencies for one dataset.

  Indexes 0, 1 and 2 are reserved for the PAD, SOS and EOS tokens; real words
  are numbered from 3 upwards in the order in which they are first added.
  """

  def __init__(self , name):
    """Create an empty vocabulary labelled with the dataset name `name`."""
    self.name = name             # Name of the dataset we are using
    self.trimmed = False         # Becomes True after the first trim() call
    self._reset_tables()

  def _reset_tables(self):
    """(Re)create the lookup tables so that they hold only the three reserved tokens."""
    self.word2index = {}         # word  -> index
    self.word2count = {}         # word  -> number of occurrences, used to filter rare words
    self.index2word = {PAD_TOKEN:"PAD" , SOS_TOKEN:"SOS" , EOS_TOKEN:"EOS"}   # index -> word
    self.num_words = 3           # Count of default tokens PAD , SOS , EOS

  def addSentence(self , sentence):
    """Split `sentence` on single spaces and register every resulting word with addWord()."""
    for word in sentence.split(' '):
      self.addWord(word)

  def addWord(self, word):
    """Register one occurrence of `word`, giving it the next free index if it is new."""
    if word not in self.word2index:
      self.word2index[word] = self.num_words   # the first real word gets index 3, after PAD / SOS / EOS
      self.word2count[word] = 1
      self.index2word[self.num_words] = word   # reverse mapping of word2index
      self.num_words += 1
    else:
      self.word2count[word] += 1

  def trim(self, min_count):
    """Drop every word seen fewer than `min_count` times and re-number the rest.

    Only the first call has an effect; later calls return immediately.
    Kept words are re-indexed from 3 in their original insertion order and
    keep their real occurrence counts. Prints how many words were kept.
    """
    if self.trimmed:
      return
    self.trimmed = True

    # Words at or above the threshold, together with the counts collected so far
    kept_counts = {word: count for word , count in self.word2count.items() if count >= min_count}

    total = len(self.word2index)
    kept_ratio = len(kept_counts) / total if total else 0.0   # an empty vocabulary must not divide by zero
    print("keep_words {} / {} = {:0.4f}".format(len(kept_counts) , total, kept_ratio))

    # Reinitialise the dictionaries, as they still hold the words that are being removed
    self._reset_tables()

    for word , count in kept_counts.items():
      self.addWord(word)
      self.word2count[word] = count            # addWord() restarts at 1; restore the real frequency


## Text Processing

In [None]:
# Remove the accents from a Unicode string (normalization), see the example in the next cell.
# Without this step, comparing strings that differ only by accents may give inaccurate results.

def unicodeToAscii(s):
  """Return `s` decomposed with NFD and with every nonspacing mark (category "Mn") removed."""
  decomposed = unicodedata.normalize("NFD" , s)
  kept = []
  for ch in decomposed:
    if unicodedata.category(ch) == "Mn":   # combining accent split off by the NFD decomposition
      continue
    kept.append(ch)
  return ''.join(kept)


In [None]:
# Example for accent normalisation , test function
unicodeToAscii('Français , Passionné de données')
      # output: Francais , Passionne de donnees

'Francais , Passionne de donnees'

In [None]:
# Lowercase , trim white space and remove every character that is not a letter or one of ". ! ?"

def normalizeString(s):
  """Return `s` lowercased, accent-free and reduced to letters plus ". ! ?" separated by single spaces."""
  text = unicodeToAscii(s.lower().strip())

  # Applied in order:
  #   1. put a space in front of every ".", "!" or "?" (\1 is the matched mark)
  #   2. replace each run of characters other than letters and ". ! ?" by one space
  #   3. collapse each run of white space into one space
  substitutions = (
    (r"([.!?])", r" \1"),
    (r"[^a-zA-Z.!?]+", r" "),
    (r"\s+", r" "),
  )
  for pattern , replacement in substitutions:
    text = re.sub(pattern , replacement , text)

  return text.strip()

In [None]:
# Example for normalizeString , test function
normalizeString("aab1234asdf!s's   ss?")
# output:        aab asdf !s s ss ?

'aab asdf !s s ss ?'

In [None]:
# Read the formatted file back, normalize every question/answer pair and create an empty vocabulary

# New_formatted = '/content/drive/My Drive/Datasets_Data_Science/cornell movie-dialogs corpus/New_formatted_Movie_conversations.txt'

print("Execution Started ....")
# Read the file and split it into lines; the with-statement closes the file handle as soon as the read is done
with open(New_formatted , encoding="utf-8") as formatted_file:
  read_lines = formatted_file.read().strip().split('\n')
# print(read_lines[:10])

# Split every line into its tab-separated sentences and normalize each of them

normalize_pairs = [[normalizeString(s) for s in pair.split("\t")] for pair in read_lines]
# print(normalize_pairs[0:10])

voc = Vocabulary("cornell movie-dialogs corpus")

print("Execution Done ....")

Execution Started ....
Execution Done ....


In [None]:
print("length of the normalized pairs :" ,len(normalize_pairs) ,"\n")
normalize_pairs[:5]

length of the normalized pairs : 221282 



[['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well i thought we d start with pronunciation if that s okay with you .'],
 ['well i thought we d start with pronunciation if that s okay with you .',
  'not the hacking and gagging and spitting part . please .'],
 ['not the hacking and gagging and spitting part . please .',
  'okay . . . then how bout we try out some french cuisine . saturday ? night ?'],
 ['you re asking me out . that s so cute . what s your name again ?',
  'forget it .'],
 ['no no it s my fault we didn t have a proper introduction', 'cameron .']]

In [None]:
# This code is for testing

print(len(normalize_pairs[0][0].split())<10)
len(normalize_pairs[0][1].split())<10

False


False

In [None]:
# Return True if both sentences in the pair "p" are under the max length threshold

MAX_LENGTH = 10    # maximum sentence length to be considered
def filterPair(p):
  """Tell whether the question p[0] and the answer p[1] both have fewer than MAX_LENGTH words."""
  # Author's note: the limit is strict so that one position stays free for the EOS token
  if len(p[0].split()) >= MAX_LENGTH:
    return False
  return len(p[1].split()) < MAX_LENGTH

# Keep only the pairs accepted by filterPair (a plain list filter, no recursion involved)
def filterPairs(pairs1):
  """Return a new list with the pairs of `pairs1` for which filterPair() is true, in the same order."""
  return list(filter(filterPair , pairs1))

In [None]:
print( "There are {} pairs/conversations in the dataset ".format(len(normalize_pairs)))

filtered_pairs = filterPairs(normalize_pairs)

print("After filtering there are {} pairs in the dataset".format(len(filtered_pairs)))

# filtered_pairs[0]

There are 221282 pairs/conversations in the dataset 
After filtering there are 64271 pairs in the dataset


In [None]:
filtered_pairs[0:5]

[['there .', 'where ?'],
 ['you have my word . as a gentleman', 'you re sweet .'],
 ['hi .', 'looks like things worked out tonight huh ?'],
 ['you know chastity ?', 'i believe we share an art instructor'],
 ['have fun tonight ?', 'tons']]

In [None]:
# Loop through each pair and add its question and its reply sentence to the vocabulary

for pair in filtered_pairs:
  for side in (0 , 1):                  # 0 = question , 1 = reply
    voc.addSentence(pair[side])
print("counted words : " , voc.num_words , "\n")

# Show a few of the pairs that went into the vocabulary
print("pairs :" , "\n" )
for sample in filtered_pairs[0:5]:
  print(sample)

counted words :  18008 

pairs : 

['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']


In [None]:
MIN_COUNT= 3 # minimum word-count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
  """Trim rare words from `voc` and drop every pair that uses a trimmed word.

  Parameters:
    voc       -- vocabulary object; must offer trim(min_count) and a word2index mapping
    pairs     -- sequence of [input_sentence, output_sentence] pairs
    MIN_COUNT -- threshold handed to voc.trim()

  Returns a new list holding only the pairs whose input and output sentences
  consist entirely of words still present in voc.word2index. Prints a summary line.
  """
  # Trim words used under MIN_COUNT from voc
  voc.trim(MIN_COUNT)
  known_words = voc.word2index            # read after trim(), which rebuilds the mapping

  def _all_known(sentence):
    # True when every space-separated word of the sentence survived the trim
    return all(word in known_words for word in sentence.split(" "))

  # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
  keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]

  total = len(pairs)
  kept_ratio = len(keep_pairs) / total if total else 0.0   # an empty input must not divide by zero
  print("Trimmed from {} pairs to {} , {:4f} of total".format(total, len(keep_pairs) , kept_ratio))

  return keep_pairs

In [None]:
# Trim voc and pairs  # This will use it for further processing
pairs_trim = trimRareWords(voc, filtered_pairs, MIN_COUNT)

keep_words 7823 / 18005 = 0.4345
Trimmed from 64271 pairs to 53165 , 0.827200 of total


In [None]:
pairs_trim[1]

['you have my word . as a gentleman', 'you re sweet .']

## Prepare Data for Models

In [None]:
# Experiment Code
def indexesFromSentences(voc , sentence):
  """Return the voc.word2index index of every space-separated word in `sentence`, followed by EOS_TOKEN."""
  token_ids = []
  for word in sentence.split(' '):
    token_ids.append(voc.word2index[word])   # raises KeyError for a word missing from the vocabulary
  token_ids.append(EOS_TOKEN)
  return token_ids

In [None]:
  # Testing the function
print(indexesFromSentences(voc, pairs_trim[1][0]))

[7, 8, 9, 10, 4, 11, 12, 13, 2]


In [None]:
# Build a small sample (the first 10 trimmed pairs) to try out the sentence -> index conversion

input_sent = []
output_sent = []

for pair in tqdm.tqdm(pairs_trim[0:10]):
  input_sent.append(pair[0])                # question side of the pair
  output_sent.append(pair[1])               # answer side of the pair

print(input_sent)
# One list of word indexes per question; every list ends with EOS_TOKEN and the lists differ in length
indexes = [indexesFromSentences(voc, sentence) for sentence in input_sent]
indexes

100%|██████████| 10/10 [00:00<00:00, 21575.64it/s]

['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'wow']





[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 31, 22, 6, 2],
 [33, 34, 4, 4, 4, 2],
 [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
 [42, 2],
 [47, 7, 48, 40, 45, 49, 6, 2],
 [50, 51, 52, 6, 2],
 [58, 2]]

In [None]:
list(itertools.zip_longest(*indexes)) # we require this transpose to sent it to LSTM for attention score processing

[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 2),
 (2, 9, 2, 22, 4, 37, None, 48, 52, None),
 (None, 10, None, 6, 4, 38, None, 40, 6, None),
 (None, 4, None, 2, 4, 7, None, 45, 2, None),
 (None, 11, None, None, 2, 39, None, 49, None, None),
 (None, 12, None, None, None, 40, None, 6, None, None),
 (None, 13, None, None, None, 41, None, 2, None, None),
 (None, 2, None, None, None, 4, None, None, None, None),
 (None, None, None, None, None, 2, None, None, None, None)]

In [None]:
# Pad the index lists so that they all have the same length, and transpose the batch at the same time.

def zero_padding(l, fill_value = 0):
  """Transpose the list of sequences `l`, filling the gaps left by shorter sequences with `fill_value`.

  Example: [[1, 2, 3], [4]] becomes [(1, 4), (2, 0), (3, 0)], one tuple per position.
  """
  columns = itertools.zip_longest(*l , fillvalue=fill_value)   # *l passes every sequence as its own argument
  return [column for column in columns]

In [None]:
Inde_length = [len(inde) for inde in tqdm.tqdm(indexes)]  # Storing the length of each list in Inde_length
max(Inde_length)                                          # getting maximum length of each list

100%|██████████| 10/10 [00:00<00:00, 55043.36it/s]


10

In [None]:
Inde_length

[3, 9, 3, 5, 6, 10, 2, 8, 5, 2]

In [None]:
# Test the function

test_results = zero_padding(indexes)
print(len(test_results))
test_results

10


[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 2),
 (2, 9, 2, 22, 4, 37, 0, 48, 52, 0),
 (0, 10, 0, 6, 4, 38, 0, 40, 6, 0),
 (0, 4, 0, 2, 4, 7, 0, 45, 2, 0),
 (0, 11, 0, 0, 2, 39, 0, 49, 0, 0),
 (0, 12, 0, 0, 0, 40, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 41, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [None]:
# Experement
matrix = []
for i , seq in enumerate(tqdm.tqdm(test_results)):               # enumerating the value in the sequence
  matrix.append([])

matrix

100%|██████████| 10/10 [00:00<00:00, 42711.85it/s]


[[], [], [], [], [], [], [], [], [], []]

In [None]:
def binary_matrix(lis , values = 0):
  """Build the 0/1 mask of a padded batch.

  Parameters:
    lis    -- iterable of token sequences (for example the output of zero_padding)
    values -- token value that marks padding; the default 0 equals PAD_TOKEN as defined in this notebook

  Returns a list of lists with the same layout as `lis`, holding 0 where the
  token equals `values` and 1 everywhere else.

  No progress bar is drawn here: this helper runs once per batch, so a bar
  would be printed for every batch.
  """
  return [[0 if token == values else 1 for token in seq] for seq in lis]

In [None]:
binary_result = binary_matrix(test_results)               # This is mask for the tensors/matrix for our output matrix
binary_result

100%|██████████| 10/10 [00:00<00:00, 41692.88it/s]


[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

In [None]:
# Return the padded input sequence tensor as well as a tensor with the length of each sequence in the batch

def inputVar(in_var , voc):                                     # Input: only the questions of the pairs
  """Convert the sentences in `in_var` to a padded LongTensor of word indexes plus their lengths.

  Returns (padVar , lengths): padVar is built from the zero_padding() output,
  lengths holds the number of indexes (EOS included) of every sentence.
  """
  indexes_batch = []
  sizes = []
  for sentence in in_var:
    token_ids = indexesFromSentences(voc , sentence)
    indexes_batch.append(token_ids)
    sizes.append(len(token_ids))

  lengths = torch.tensor(sizes)
  padVar = torch.LongTensor(zero_padding(indexes_batch))        # LongTensor, as the values are used as indices
  return padVar , lengths


In [None]:
# Return the padded target sequence tensor , the padding mask and the max target length

def outputVar(Out_var , voc):
  """Convert the answer sentences in `Out_var` to training targets.

  Returns (padVar , mask , max_target_lengths):
    padVar             -- LongTensor built from the zero_padding() output
    mask               -- ByteTensor with the same layout, 0 at padded positions and 1 elsewhere
    max_target_lengths -- length of the longest index list of THIS batch (EOS included)
  """
  indexes_batch = [indexesFromSentences(voc,sentence) for sentence in Out_var]
  # Measure the batch that was passed in, not an unrelated global list left over from an earlier cell
  max_target_lengths = max([len(inde) for inde in indexes_batch])
  padList = zero_padding(indexes_batch)
  mask = binary_matrix(padList)
  # NOTE(review): recent PyTorch releases prefer boolean masks; ByteTensor is kept so the returned dtype is unchanged
  mask = torch.ByteTensor(mask)
  padVar = torch.LongTensor(padList)
  return padVar , mask, max_target_lengths

In [None]:
# Return all items for a given batch of pairs

def batch2TrainData(voc ,  pair_batch):
  """Turn a batch of [question , answer] pairs into the tensors used for training.

  Returns (input_ , length , output , mask , max_target_lengths), i.e. the
  results of inputVar() on the questions followed by those of outputVar() on the answers.
  Note that `pair_batch` itself is re-ordered, because it is sorted in place.
  """
  def _question_word_count(pair):
    return len(pair[0].split(' '))

  # Longest question first (in-place sort)
  pair_batch.sort(key=_question_word_count , reverse=True)

  input_batch = [pair[0] for pair in pair_batch]                 # questions
  output_batch = [pair[1] for pair in pair_batch]                # answers

  input_ , length = inputVar(input_batch , voc)
  output , mask , max_target_lengths = outputVar(output_batch , voc)

  return input_ , length , output , mask , max_target_lengths

In [None]:
# For testing we will take a very small batch of (5):

small_batch_size = 5

batches = batch2TrainData(voc , [random.choice(pairs_trim) for _ in range(small_batch_size)])
input_, lengths, output , mask, max_target_len = batches

print("\n","input_variable:", input_)
print("lengths:", lengths)
print("target_variable:", output)
print("mask:", mask)
print("max_target_len:", max_target_len)

100%|██████████| 10/10 [00:00<00:00, 50533.78it/s]


 input_variable: tensor([[  25,  147,   25,   25, 2074],
        [ 200,   92, 1441,  200, 2074],
        [ 459,    7,  253, 1469,    4],
        [  61,  278,  376,    4,    2],
        [  37,    4,    4,    2,    0],
        [ 123,    2,    2,    0,    0],
        [  40,    0,    0,    0,    0],
        [1528,    0,    0,    0,    0],
        [   4,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths: tensor([10,  6,  6,  5,  4])
target_variable: tensor([[ 122,   88,   42,   33,  318],
        [  25, 3431,   67,    7, 1597],
        [ 200,    4,  253,  290,    4],
        [2378,   92, 1442,    8,    2],
        [  40,    7,    4, 1468,    0],
        [ 280,  123,    2,   70,    0],
        [   4,   96,    0, 2553,    0],
        [   2,    3,    0,    4,    0],
        [   0,    6,    0,    2,    0],
        [   0,    2,    0,    0,    0]])
mask: tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1




# Defining Seq2Seq Model

## EncoderRNN

In [None]:
class EncoderRNN(nn.Module):
  """Bidirectional GRU encoder working on embedded word indexes.

  Parameters:
    hidden_size -- size of the GRU hidden state; also used as the GRU input size,
                   so the embedding must produce vectors with `hidden_size` features
    embedding   -- embedding module applied to the input indexes
    n_layers    -- number of stacked GRU layers
    dropout     -- dropout between GRU layers (forced to 0 when n_layers == 1)
  """

  def __init__(self, hidden_size , embedding , n_layers=1, dropout=0):
    super(EncoderRNN, self).__init__()
    self.embedding = embedding
    self.n_layers = n_layers
    self.hidden_size = hidden_size

    # Initialize GRU , the input_size and hidden_size params are both set to 'hidden_size'
    # because our input is a word embedding with number of features == 'hidden_size'
    self.gru = nn.GRU(hidden_size , hidden_size , n_layers , dropout=(0 if n_layers ==1 else dropout), bidirectional=True)

  def forward(self, input_seq , input_length , hidden=None):
    """Encode a padded batch of sentences.

    Parameters:
      input_seq    -- batch of input sentences ; shape=(max_length , batch_size)
      input_length -- sequence length of each sentence in the batch; the packing call is used with
                      its defaults, so the batch is expected sorted by decreasing length
                      (PyTorch also expects a tensor of lengths to live on the CPU)
      hidden       -- optional initial hidden state of shape (n_layers x num_directions , batch_size , hidden_size)

    Returns (outputs , hidden):
      outputs -- sum of the forward and backward GRU outputs of the last layer for each time step ;
                 shape=(max_length , batch_size , hidden_size)
      hidden  -- hidden state for the last time step ;
                 shape=(n_layers x num_directions , batch_size , hidden_size)
    """
    # Convert word indexes to embeddings
    embedded = self.embedding(input_seq)

    # Pack the padded batch of sequences for the RNN module
    packed = torch.nn.utils.rnn.pack_padded_sequence(embedded , input_length)

    # Forward pass through GRU
    outputs , hidden = self.gru(packed, hidden)

    # Unpack: pad_packed_sequence is the inverse of pack_padded_sequence
    outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)

    # Sum bidirectional GRU outputs (first half = forward direction , second half = backward direction)
    outputs = outputs[:,:,:self.hidden_size] + outputs[:,:,self.hidden_size:]

    # Return the summed outputs and the final hidden state
    return outputs, hidden

In [None]:
# Just an example for GRU: run one random batch through a 2-layer GRU and print the resulting shapes
seed = 100                                    # NOTE(review): assigned but never handed to a seeding call in this cell, so the random tensors are not reproducible
import torch
import torch.nn as nn

rnn= nn.GRU(6,20,2)                           # GRU(input_size , hidden_size , num_layer)
input = torch.randn(5,3,6)                    # (seq, batch , input) ; NOTE: this name shadows the builtin input() from here on
h0 = torch.randn(2,3,20)                      # (num_layers , batch, hidden)
output, hn = rnn(input,h0)                    
print(output.shape)                           # (seq, batch , hidden_size)
print(hn.shape)                               # (layer,  batch, hidden_size)


torch.Size([5, 3, 20])
torch.Size([2, 3, 20])


In [None]:
# Display the random example tensor built for the GRU demo: shape (seq_len=5, batch=3, input_size=6)
input

tensor([[[-0.4235, -1.6828,  0.0068,  2.0669, -0.0146, -0.4219],
         [ 0.9035,  0.5738,  0.3317,  1.0563, -0.1278,  0.4790],
         [ 1.6395,  1.5672, -0.2032,  0.3842, -0.7113, -0.2225]],

        [[-0.5376,  0.0094, -0.1541,  0.7763,  0.3985, -0.2435],
         [ 1.0503,  0.5013, -1.3130,  0.7133,  0.4644,  1.3323],
         [ 0.6154, -0.4712, -0.5738,  0.5994, -0.6708,  0.1534]],

        [[ 0.2652, -0.3506,  0.3225, -0.5089,  0.2242,  0.6174],
         [ 0.4700, -1.1933, -1.7227, -0.5616,  1.6748,  0.1667],
         [-0.2521, -0.9129,  1.4716,  0.0738,  0.7192, -0.1969]],

        [[ 1.1567, -0.7507,  0.7029, -1.0551, -0.5565, -0.3290],
         [ 0.7203,  1.4243, -1.0327,  0.6703,  0.6702,  1.2824],
         [ 1.2131, -0.3373, -0.5130,  1.2732, -0.5937,  1.0812]],

        [[-0.5178,  1.0765, -0.8165, -1.4601,  1.1709, -1.0173],
         [ 0.5260,  1.4662,  0.1346, -0.4679, -0.4754, -0.8853],
         [ 0.0866, -0.8976,  1.0008, -0.4557, -1.4694, -2.4440]]])

In [None]:
# Keep only the first 2 features of every (timestep, batch) entry -> shape (5, 3, 2).
# EncoderRNN.forward uses the same slicing pattern to split the two GRU directions.
input[:,:,:2]

tensor([[[-0.4235, -1.6828],
         [ 0.9035,  0.5738],
         [ 1.6395,  1.5672]],

        [[-0.5376,  0.0094],
         [ 1.0503,  0.5013],
         [ 0.6154, -0.4712]],

        [[ 0.2652, -0.3506],
         [ 0.4700, -1.1933],
         [-0.2521, -0.9129]],

        [[ 1.1567, -0.7507],
         [ 0.7203,  1.4243],
         [ 1.2131, -0.3373]],

        [[-0.5178,  1.0765],
         [ 0.5260,  1.4662],
         [ 0.0866, -0.8976]]])

## DecoderRNN

### Attention Layer

In [None]:
# Luong attention layer

class Attn(torch.nn.Module):
  """Luong attention layer using the "dot" score.

  Args:
    Method: name of the scoring method. It is only stored; the dot-product
      score is the only one implemented here.
    hidden_size: number of features of the decoder hidden state.
  """

  def __init__(self, Method ,  hidden_size):
    super().__init__()
    self.Method = Method
    self.hidden_size = hidden_size

  def dot_score(self, hidden , encoder_output):
    """Return the dot product of `hidden` with every encoder output.

    `hidden` (1, batch_size, hidden_size) broadcasts against
    `encoder_output` (max_length, batch_size, hidden_size); summing the
    element-wise product over the feature dimension gives a tensor of shape
    (max_length, batch_size).
    """
    return torch.sum(hidden * encoder_output, dim=2)

  def forward(self, hidden, encoder_outputs):
    """Compute attention weights over the encoder timesteps.

    Args:
      hidden: current decoder GRU output, shape (1, batch_size, hidden_size).
      encoder_outputs: encoder outputs, shape
        (max_length, batch_size, hidden_size).

    Returns:
      Softmax-normalised weights of shape (batch_size, 1, max_length); the
      weights of each batch entry sum to 1 over max_length.
    """
    # Attention energies, shape (max_length, batch_size)
    attn_energies = self.dot_score(hidden , encoder_outputs)

    # Swap to (batch_size, max_length) so the softmax runs over the timesteps
    attn_energies = attn_energies.t()

    # Normalise and insert a singleton dimension at index 1 so the result can
    # be batch-multiplied with the encoder outputs
    return F.softmax(attn_energies, dim=1).unsqueeze(1)


In [None]:
# Random (5, 3, 7) tensor used to illustrate torch.sum(..., dim=2), the reduction behind Attn.dot_score
import torch

seed = 100  # NOTE(review): never passed to torch.manual_seed, so the values differ on every run
a = torch.randn((5, 3, 7))
a

tensor([[[ 0.0669,  1.4491,  1.6847,  0.7856,  1.2006, -0.3105, -0.3280],
         [ 0.5235,  0.4513, -0.0601, -0.7814, -0.0250,  1.3050,  0.6381],
         [ 1.8029, -0.0042,  0.1457, -1.2177, -0.9411,  1.9206,  0.4402]],

        [[ 1.4200,  1.2067, -1.0484,  0.3757,  1.0247, -1.0756,  0.9970],
         [ 0.8325, -0.7680, -0.3911, -0.3193, -1.2576,  1.2911,  0.6845],
         [ 0.6268,  0.6050,  0.9841, -0.4599, -0.9559,  0.8190, -1.7799]],

        [[ 0.4236, -1.2397,  0.5207, -0.8777, -0.4465,  0.6655,  0.5522],
         [-0.4974, -1.3612, -1.0280, -1.1992,  0.5143, -0.1974, -0.0092],
         [-0.2363,  0.1232, -0.0661, -0.6769,  0.0068, -0.0310, -0.5303]],

        [[ 0.2819,  0.5358, -0.7164,  1.0563,  0.2161,  0.8604, -0.2853],
         [ 0.7365, -1.6303,  0.1452,  0.1118, -0.0449,  0.4897, -1.3087],
         [-0.7419, -0.3256, -0.4453, -0.8108,  1.2759, -0.0172, -1.0334]],

        [[-0.8411,  0.6721,  0.5990,  0.1652, -2.4144, -1.6311,  1.1722],
         [-0.9337,  1.3877, -0

In [None]:
torch.sum(a,dim=2) # Sum over the last dimension (the 7 features): one total per (row, batch) entry, shape (5, 3)

tensor([[ 4.5484,  2.0513,  2.1464],
        [ 2.9003,  0.0720, -0.1608],
        [-0.4020, -3.7781, -1.4105],
        [ 1.9488, -1.5008, -2.0984],
        [-2.2780, -0.5542,  3.8549]])

In [None]:
# The three 7-element rows of a[0] from the recorded output, copied by hand
list1 = [0.0669, 1.4491, 1.6847, 0.7856, 1.2006, -0.3105, -0.3280]
list2 = [0.5235, 0.4513, -0.0601, -0.7814, -0.0250, 1.3050, 0.6381]
list3 = [1.8029, -0.0042, 0.1457, -1.2177, -0.9411, 1.9206, 0.4402]

# Plain-Python sums of those rows: they match the first row of torch.sum(a, dim=2) up to rounding
totals = [sum(list1), round(sum(list2), 6), sum(list3)]
print(*totals)

4.5484 2.0514 2.1464


In [None]:
class LuongAttnDecoderRNN(nn.Module):
  """Unidirectional GRU decoder with Luong attention over the encoder outputs.

  Args:
    attn_model: attention method name, forwarded to Attn.
    embedding: module that maps word indices to vectors with `hidden_size`
      features.
    hidden_size: number of features of the embedding and the GRU hidden state.
    output_size: number of scores produced per step (one per output word).
    n_layers: number of stacked GRU layers.
    dropout: dropout probability applied to the embedded input and, when
      n_layers > 1, between GRU layers.
  """

  def __init__(self, attn_model , embedding, hidden_size , output_size , n_layers = 1, dropout=0.1):
    super().__init__()
    self.attn_model = attn_model
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.dropout = dropout

    # Define layers
    self.embedding = embedding
    self.embedding_dropout = nn.Dropout(dropout)
    self.gru = nn.GRU(hidden_size , hidden_size , n_layers ,
                      dropout=(0 if n_layers == 1 else dropout))
    # Takes [GRU output ; context vector], hence the 2 * hidden_size input
    self.concat = nn.Linear(hidden_size * 2 , hidden_size)
    self.out = nn.Linear(hidden_size, output_size)

    self.attn = Attn(attn_model , hidden_size)

  def forward(self, input_step , last_hidden , encoder_outputs ):
    """Decode a single step (one word per batch entry).

    Args:
      input_step: word indices for the current step; the leading dimension
        must be 1 because the GRU output is squeezed on dim 0 below
        (presumably shape (1, batch_size) - confirm against the caller).
      last_hidden: previous GRU hidden state, shape
        (n_layers, batch_size, hidden_size).
      encoder_outputs: encoder outputs, shape
        (max_length, batch_size, hidden_size).

    Returns:
      output: softmax-normalised word probabilities, shape
        (batch_size, output_size).
      hidden: updated GRU hidden state.
    """
    # Embed the current input word and apply dropout
    embedded = self.embedding(input_step)
    embedded = self.embedding_dropout(embedded)
    # Forward through the unidirectional GRU
    rnn_output, hidden = self.gru(embedded, last_hidden)
    # Attention weights from the current GRU output
    attn_weights = self.attn(rnn_output, encoder_outputs)
    # Weighted sum of the encoder outputs gives the context vector
    context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
    # Concatenate the context vector and the GRU output (Luong eq. 5)
    rnn_output = rnn_output.squeeze(0)
    context = context.squeeze(1)
    concat_input = torch.cat((rnn_output, context), 1)
    concat_output = torch.tanh(self.concat(concat_input))
    # Predict the next word (Luong eq. 6)
    output = self.out(concat_output)
    output = F.softmax(output, dim=1)
    return output, hidden



# **Appendix**

In [None]:


# eval() :                                  Evaluate the given source in the context of globals and locals.
#                                           The source may be a string representing a Python expression or a code object as returned by compile().
#                                           The globals must be a dictionary and locals can be any mapping, defaulting to the current globals and locals.
#                                           If only globals is given, locals defaults to it.

# *****************************************************************************************************

# .strip() :                                Returns a copy of the string with leading and trailing whitespace removed; if a chars argument is given, those characters are stripped instead

# *****************************************************************************************************

# csv.writer() :                            csv_writer = csv.writer(fileobj [, dialect='excel'] [optional keyword args])
#                                           for row in sequence:
#                                             csv_writer.writerow(row)
#                                           [or]
#                                           csv_writer = csv.writer(fileobj [, dialect='excel'] [optional keyword args])
#                                            csv_writer.writerows(rows)

# *****************************************************************************************************
   
# codecs                                    str(*args, **kwargs)
#                                           str(object='') -> str
#                                           str(bytes_or_buffer[, encoding[, errors]]) -> str

#                                           Create a new string object from the given object. If encoding or
#                                           errors is specified, then the object must expose a data buffer
#                                           that will be decoded using the given encoding and error handler.
#                                           Otherwise, returns the result of object.str() (if defined)
#                                           or repr(object).
#                                           encoding defaults to sys.getdefaultencoding().
#                                           errors defaults to 'strict'.

# *****************************************************************************************************

# codecs.decode                             def codecs.decode(obj, encoding='utf-8', errors='strict')
#                                           Decodes obj using the codec registered for encoding.

#                                           Default encoding is 'utf-8'.  errors may be given to set a
#                                           different error handling scheme.  Default is 'strict' meaning that encoding
#                                           errors raise a ValueError.  Other possible values are 'ignore', 'replace'
#                                           and 'backslashreplace' as well as any other name registered with
#                                           codecs.register_error that can handle ValueErrors.

# *****************************************************************************************************

# tqdm.tqdm()                                def tqdm.tqdm(iterable=None, desc=None, total=None, leave=True, file=None, ncols=None, mininterval=0.1, \
#                                            maxinterval=10.0, miniters=None, ascii=None, disable=False, unit='it', unit_scale=False, dynamic_ncols=False, \
#                                            smoothing=0.3, bar_format=None, initial=0, position=None, postfix=None, unit_divisor=1000, write_bytes=None, \
#                                            lock_args=None, gui=False, **kwargs)

#                                            Decorate an iterable object, returning an iterator which acts exactly
#                                            like the original iterable, but prints a dynamically updating
#                                            progressbar every time a value is requested.

# *****************************************************************************************************

# NFD:                                        Normalization Form Canonical Decomposition: characters are decomposed by canonical equivalence,
#                                             and multiple combining characters are arranged in a specific order.

# Mn :                                        Nonspacing Mark (Mn)

# *****************************************************************************************************

# re.sub                                      re.sub(pattern, repl, string, count=0, flags=0)
#                                             Return the string obtained by replacing the leftmost
#                                             non-overlapping occurrences of the pattern in string by the
#                                             replacement repl.  repl can be either a string or a callable;
#                                             if a string, backslash escapes in it are processed.  If it is
#                                             a callable, it's passed the match object and must return
#                                             a replacement string to be used.

# *****************************************************************************************************

# nn.Module                                   Base class for all neural network modules.
#                                             Your models should also subclass this class.
#                                             Modules can also contain other Modules, allowing to nest them in
#                                             a tree structure. You can assign the submodules as regular attributes:
#
#                                             import torch.nn as nn
#                                             import torch.nn.functional as F
#
#                                             class Model(nn.Module):
#                                               def __init__(self):
#                                                 super(Model, self).__init__()
#                                                 self.conv1 = nn.Conv2d(1, 20, 5)
#                                                 self.conv2 = nn.Conv2d(20, 20, 5)
#
#                                               def forward(self, x):
#                                                 x = F.relu(self.conv1(x))
#                                                 return F.relu(self.conv2(x))
#
#                                             Submodules assigned in this way will be registered, and will have their
#                                             parameters converted too when you call to(), etc.


# *****************************************************************************************************



## References

In [None]:
# The code is adapted from https://pytorch.org/tutorials/beginner/chatbot_tutorial.html?highlight=chatbot
# and Tutorials from Udemy 