# Importing necessary libraries

In [8]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
import pickle
import codecs

# Setting up tokenizer

In [9]:
# creating tokenizer that will produce token(specifically 'words' and thats what we desire to have) 
# which would comprise of words. 
# r"[\w'’-]+" is a regex that will extract words (it would even extract words having apostrophe and hyphen, which
# otherwise would not have happened, had the regex been r'\w+')
tokenizer = RegexpTokenizer(r"[\w'’-]+")

# Reading dataset.csv and storing it into the dataframe named 'df'

In [10]:
# setting up the column names for dataframe that will be created after
# reading 'dataset.csv'
colnames=['Snippet', 'Target'] 

# creating dataframe named 'df'
df = pd.read_csv('dataset.csv', names = colnames, header = None)

# Path to embedding file 
### The embedding file will be used for the creation of word embeddings
### (transformation of words to vector)

In [11]:
embedding_file = './crawl-300d-2M.vec'

# A look at the content of 'crawl-300d-2M.vec'

<img src="contentOfEmbeddingFile.png" style="width:1200px;height:320px;">

# Following function creates embedding index using the embedding file.
## The embedding index will be later used for transforming words to vector

In [12]:
def loadEmbedding():
    
    print('Loading Word Embeddings...')
    
    # 'embeddings_index' will maintain word embedding dictionary
    # the format of the dictionary will be word : word-embedding
    embeddings_index = {}
    
    # opening 'embedding_file' in reading mode
    # When you need to open a file that has a certain encoding, you would use 
    # the codecs module, codecs.open
    f = codecs.open(embedding_file, encoding='utf-8')
    
    # for every line in the file that the file pointer 'f' is pointing to,
    # iterate through the body of the for loop
    for line in f:
        
        # for the current 'line' in  'f', split it into tokens by using ' '(space) as
        # the delimiter.
        # also, trim the whitespaces from the produced tokens
        values = line.rstrip().rsplit(' ')
        
        # index 0 of the list 'values' contains the word
        word = values[0]
        
        # index 1 uptil the last index of the list 'values', we have the corresponding embedding
        coefs = np.asarray(values[1:], dtype='float64')
        
        # put the <word : word-embedding> pair in the dictionary named 'embeddings_index'
        embeddings_index[word] = coefs
    
    # close the file
    f.close()
    
    # the number of word vectors present in the embedding file
    print('Found %s word vectors' % len(embeddings_index))
    
    # returning the created 'embeddings_index'
    return embeddings_index

# Invoking loadEmbedding function to create the emedding-index

In [13]:
# Loading pre-trained embeddings by invoking loadEmbedding function
model = loadEmbedding()

Loading Word Embeddings...
Found 1999996 word vectors


# Lets have a look how some random word(if present in the embedding index) will be represented using word-embedding

In [14]:
# lets say, our random word is 'love'
# following expression will give the word vector assoiated with word 'love'
model['love']

array([-0.2757, -0.0343,  0.1668,  0.0358, -0.0805, -0.0105,  0.0752,
        0.324 ,  0.1245, -0.002 , -0.0176,  0.3054, -0.0302,  0.2219,
       -0.1233, -0.1776, -0.3783, -0.0099, -0.0945,  0.1197, -0.0771,
       -0.2172, -0.0581, -0.2592, -0.0912,  0.207 ,  0.0356,  0.1817,
       -0.1424,  0.0026, -0.2029, -0.0429,  0.0164, -0.3866, -0.0228,
       -0.1913,  0.025 ,  0.0919,  0.1341,  0.28  , -0.1914, -0.0225,
        0.0942, -0.0417, -0.0278, -0.0698, -0.1828, -0.0449,  0.0688,
        0.14  , -0.0579, -0.1856,  0.1317,  0.0861, -0.2081, -0.0542,
        0.0502, -0.0092, -0.1887, -0.0216,  0.0347, -0.269 , -0.1502,
        0.3031,  0.0336,  0.159 ,  0.1326, -0.1242, -0.1914, -0.2266,
        0.1209, -0.0246, -0.1972,  0.3093, -0.0582,  0.1337, -0.0827,
       -0.0721,  0.3924, -0.2054,  0.2582,  0.18  , -0.2151,  0.0502,
       -0.3227,  0.0237, -0.0227,  0.2881,  0.0154, -0.1839, -0.1311,
       -0.0507, -0.157 ,  0.1108, -0.168 , -0.3899, -0.0335, -0.0088,
       -0.1911,  0.1

# Number Of Snippets in our 'dataset.csv'

In [15]:
len(df['Snippet'])

224

# Extract all the words from our 'dataset.csv'

In [16]:
# the list 'all_words' will maintain all the words present in 'dataset.csv' 
all_words=[]

# iterate over all the snippets, tokenize them and store the tokens(i.e. words)
for i in range(len(df['Snippet'])):
    
    # tokenizer.tokenize(df['Snippet'][i]) generates the list of tokens(words) contained 
    # in the i'th snippet.
    
    # list of tokens that are generated for i'th snippet, will be appended to the 'all_word' list
    all_words.extend(tokenizer.tokenize(df['Snippet'][i]))

# A look at 'all_word' list

In [17]:
all_words

['I',
 'frowned',
 'at',
 'him',
 "Isn't",
 'sarcasm',
 'the',
 'opiate',
 'of',
 'the',
 'masses',
 "You're",
 'thinking',
 'of',
 'religion',
 'he',
 'replied',
 'Sarcasm',
 'is',
 'the',
 'Xanax',
 'of',
 'the',
 'morally',
 'bereft',
 'Fine',
 'it’s',
 'decided',
 'Vain',
 'resumed',
 'his',
 'stride',
 'I’ll',
 'do',
 'all',
 'the',
 'killing',
 'and',
 'you',
 'do',
 'all',
 'the',
 'stuff',
 'that',
 'an',
 'Avun-Riah',
 'does',
 'Whatever',
 'the',
 'hell',
 'that',
 'is',
 'What',
 'time',
 'is',
 'it',
 'One',
 "o'clock",
 'I',
 'nearly',
 'spit',
 'out',
 'some',
 'soda',
 'In',
 'the',
 'afternoon',
 'No',
 'In',
 'the',
 'morning',
 "Don't",
 'let',
 'that',
 'damn',
 'sunlight',
 'fool',
 'you',
 'It',
 'lies',
 'I',
 'understand',
 'that',
 'you',
 'don’t',
 'want',
 'to',
 'marry',
 'me',
 'I',
 'said',
 'I',
 'mean',
 'I',
 'don’t',
 'know',
 'why',
 'since',
 'I’m',
 'simply',
 'delightful',
 'to',
 'be',
 'around',
 'But',
 'to',
 'each',
 'his',
 'own',
 'taste',
 '

# What is the length of 'all_word' list?
## Note: 'all_word' will even be maintaining the presence of duplicate words

In [18]:
len(all_words)

5961

# Lets remove the presence of duplicate words, if any

In [19]:
all_words=list(dict.fromkeys(all_words))

# A look at 'all_word' list
## Note: 'all_word' list no longer peserves the duplicacy of words

In [20]:
all_words

['I',
 'frowned',
 'at',
 'him',
 "Isn't",
 'sarcasm',
 'the',
 'opiate',
 'of',
 'masses',
 "You're",
 'thinking',
 'religion',
 'he',
 'replied',
 'Sarcasm',
 'is',
 'Xanax',
 'morally',
 'bereft',
 'Fine',
 'it’s',
 'decided',
 'Vain',
 'resumed',
 'his',
 'stride',
 'I’ll',
 'do',
 'all',
 'killing',
 'and',
 'you',
 'stuff',
 'that',
 'an',
 'Avun-Riah',
 'does',
 'Whatever',
 'hell',
 'What',
 'time',
 'it',
 'One',
 "o'clock",
 'nearly',
 'spit',
 'out',
 'some',
 'soda',
 'In',
 'afternoon',
 'No',
 'morning',
 "Don't",
 'let',
 'damn',
 'sunlight',
 'fool',
 'It',
 'lies',
 'understand',
 'don’t',
 'want',
 'to',
 'marry',
 'me',
 'said',
 'mean',
 'know',
 'why',
 'since',
 'I’m',
 'simply',
 'delightful',
 'be',
 'around',
 'But',
 'each',
 'own',
 'taste',
 'How',
 'knowing',
 'em',
 "'things",
 'could',
 "worse'",
 'than',
 'what',
 'already',
 'deem',
 'awful',
 'make',
 'feel',
 'any',
 'better',
 'You',
 'sink',
 'even',
 'lower',
 'Oh',
 'joy',
 'when',
 'you’re',
 'in

# What is the length of 'all_word' list after removing duplicates?

In [21]:
len(all_words)

2066

# Converting every word in the 'all_word' list to lowercase

In [22]:
all_words=[x.lower() for x in all_words]

# A look at 'all_word' list after every word has been converted to lowercase

In [23]:
all_words

['i',
 'frowned',
 'at',
 'him',
 "isn't",
 'sarcasm',
 'the',
 'opiate',
 'of',
 'masses',
 "you're",
 'thinking',
 'religion',
 'he',
 'replied',
 'sarcasm',
 'is',
 'xanax',
 'morally',
 'bereft',
 'fine',
 'it’s',
 'decided',
 'vain',
 'resumed',
 'his',
 'stride',
 'i’ll',
 'do',
 'all',
 'killing',
 'and',
 'you',
 'stuff',
 'that',
 'an',
 'avun-riah',
 'does',
 'whatever',
 'hell',
 'what',
 'time',
 'it',
 'one',
 "o'clock",
 'nearly',
 'spit',
 'out',
 'some',
 'soda',
 'in',
 'afternoon',
 'no',
 'morning',
 "don't",
 'let',
 'damn',
 'sunlight',
 'fool',
 'it',
 'lies',
 'understand',
 'don’t',
 'want',
 'to',
 'marry',
 'me',
 'said',
 'mean',
 'know',
 'why',
 'since',
 'i’m',
 'simply',
 'delightful',
 'be',
 'around',
 'but',
 'each',
 'own',
 'taste',
 'how',
 'knowing',
 'em',
 "'things",
 'could',
 "worse'",
 'than',
 'what',
 'already',
 'deem',
 'awful',
 'make',
 'feel',
 'any',
 'better',
 'you',
 'sink',
 'even',
 'lower',
 'oh',
 'joy',
 'when',
 'you’re',
 'in

# Now, we'll be creating the word vector corresponding to every word present in the 'all_word' list

In [24]:
# 'embeddings' dictionary will maintain the word from the 'all_word' list and the 
# corresponding word vector.
# if 'embedding_index' / 'model' represents the universe of the word and the 
# corresponding word embedding, then 'embeddings' represents the subset of that 
# universe of word embeddings
embeddings = {}

# iterate through all the words present in the 'all_word' list
for word in all_words:
    
    # if the current 'word' is not in the 'model.keys()', it means such 
    # word has no word vector representation in the universe of word embedding i.e
    # 'model' / 'embedding_index'.
    
    # and, if the such word has no vector in the in the universe of word embedding, its
    # always better to reprsent them using the word vector of 'unk'('unk' is anlogous to unknown).
    
    # you can think of 'unk' as the category of words whose vector represntation is not
    # present in the 'model' / 'embedding_index'
    if word not in model.keys():
        
        # assigning the vector representation of 'unk' to the current 'word'
        # i.e creating <word, vector representation of 'unk'> pair for the current 'word' 
        embeddings[word] = model['unk']
        
    else:
        
        # if the current 'word' is in the 'model.keys()', it means such 
        # word has a word vector representation in the universe of word embedding i.e
        # 'model' / 'embedding_index'.
        
        # creating <'word', vector representation of 'word'> pair for the current 'word'
        embeddings[word] = model[word]

# Reading pre_processed_dataset.csv and storing it into the dataframe named 'ppd'

In [25]:
ppd=pd.read_csv('pre_processed_dataset.csv',encoding='utf-8')

# Finding out the length(in terms of words) of left context associated with the candidate word present in 'pre_processed_dataset.csv'

In [26]:
# list 'length_l' will maintain the length(in terms of words) of the left context 
# associated with the probable candidate word  
length_l=[]

# iterate through the all the left contexts associated with the i'th snippet in the
# 'pre_processed_dataset.csv'
for i in range(len(ppd['left_context'])):
    # tokenizer.tokenize(ppd['left_context'][i]) tokenizes the left context of the i'th
    # snippet.
    
    # len(tokenizer.tokenize(ppd['left_context'][i])) calculates the length of the left 
    # context of the i'th snippet in 'pre_processed_dataset.csv'.
    
    # and the calculated length is further appended to the list 'length_l'
    length_l.append(len(tokenizer.tokenize(ppd['left_context'][i])))

# Finding out the length(in terms of words) of right context associated with the candidate word present in 'pre_processed_dataset.csv'

In [27]:
# list 'length_r' will maintain the length(in terms of words) of the right context 
# associated with the probable candidate word  
length_r=[]

# iterate through the all the right contexts associated with the i'th snippet in the
# 'pre_processed_dataset.csv'
for i in range(len(ppd['right_context'])):
    # tokenizer.tokenize(ppd['right_context'][i]) tokenizes the right context of the i'th
    # snippet.
    
    # len(tokenizer.tokenize(ppd['right_context'][i])) calculates the length of the right 
    # context of the i'th snippet in 'pre_processed_dataset.csv'.
    
    # and the calculated length is further appended to the list 'length_r'
    length_r.append(len(tokenizer.tokenize(ppd['right_context'][i])))

# Embedding associated with padding

In [28]:
embeddings['<pad>']= [0]*300

# The peculiar behaviour of 'model' / 'embedding_index'

In [29]:
# if the following expression is executed then it displays the
# vector representation associated with 'how.'
model['how.']

array([-0.0645,  0.276 , -0.5165,  0.058 , -0.0833, -0.209 ,  0.4805,
        0.107 , -0.3251,  0.158 ,  0.4479,  0.413 , -0.284 ,  0.0043,
        0.3778, -0.0591,  0.0157,  0.0095,  0.2918, -0.0615,  0.1472,
        0.1333,  0.0458,  0.1946, -0.2652, -0.0534, -0.3718, -0.2466,
       -0.1345,  0.1599,  0.0038,  0.074 , -0.4212, -0.1572, -0.6868,
       -0.0051, -0.09  , -0.5533, -0.2124,  0.5763, -0.5189,  0.7555,
        0.1353, -0.0086,  0.1698,  0.1511,  0.0341, -0.1328, -0.3859,
        0.242 ,  0.2226, -0.2057,  0.2238, -0.0616,  0.3288, -0.3321,
       -0.3493, -0.1135, -0.1579, -0.1348, -0.144 ,  0.4702,  0.0759,
       -0.0545,  0.1857,  0.3709, -0.036 , -0.1764, -0.0319, -0.2919,
       -0.4188,  0.3224, -0.0896,  0.0538,  0.2322, -0.0938, -0.6189,
        0.4253, -0.0473, -0.0551,  0.399 , -0.043 , -0.4988, -0.2166,
       -0.0637, -0.1231,  0.2262, -0.1917,  0.1957,  0.0893, -0.2451,
        0.4929, -0.0277, -0.1668, -0.1255,  0.0314,  0.2761, -0.0425,
        0.1001, -0.2

In [30]:
# but, if I run the following expression, the model generates key error.
# why ?
# probably beacuse the word 'how?' is not present in the 'model' and 
# hence, no vector representation
model['how?']

KeyError: 'how?'

# But, what makes me point at that peculiarity?

In [None]:
# to be continued

# Why use tokenize() and not split()?

## 1. split()

In [34]:
# for sentence "hey! isn't it good!!!?? ....", lets see what split() does
tokens_from_split = ("hey! isn't it good!!!?? ....").split()

# Number of tokens generated using split()
number_of_tokens = len(tokens_from_split)

print("Tokens Obtained After Using split() : ", tokens_from_split)
print("Number of tokens generated using split() : ", number_of_tokens)

Tokens Obtained After Using split() :  ['hey!', "isn't", 'it', 'good!!!??', '....']
Number of tokens generated using split() :  5


# 2. tokenize()

In [35]:
# for sentence "hey! isn't it good!!!?? ....", lets see what tokenize() does
tokens_from_tokenize = tokenizer.tokenize("hey! isn't ,,, it good!!!?? ....")

# Number of tokens generated using split()
number_of_tokens = len(tokens_from_tokenize)

print("Tokens Obtained After Using tokenize() : ", tokens_from_tokenize)
print("Number of tokens generated using tokenize() : ", number_of_tokens)

Tokens Obtained After Using tokenize() :  ['hey', "isn't", 'it', 'good']
Number of tokens generated using tokenize() :  4


### Note that the tokens generated from tokenize using previously defined regex expression makes more sense than simply using split() whose default delimiter is ' '(a whitespace).
### Why they make more sense?
### Words without punctuation are highly likely to be present in the 'model' than the words with several punctuations.

# Creation of the embedding for left context

In [36]:
# following list will maintain the embedding of left context associated with candidate 
# word present in the i'th snippet in 'pre_processed_dataset.csv' at the i'th index
keras_left_context = []

# iterate over all the left contexts of the snippets present in the
# dataframe 'ppd'('ppd' was created from 'pre_processed_dataset.csv')
for i in range(len(ppd['left_context'])):
    
    # one_vector will maintain the embedding for the left context associated with 
    # i'th snippet
    one_vector = []
    
    # temp stores the list of tokens generated after tokenizing the left context 
    # of the i'th snippet
    temp = tokenizer.tokenize(ppd['left_context'][i])
    
    # as given in the paper, the text will be appended with <start> tag in the beginning
    # to mark the starting of the text
    
    # therfore, lets append the vector representation of <start> to the list 'one_vector',
    # which maintains the embedding of the left context
    one_vector.append(model['start'])
    
    # for every token(except the one present at 0'th index) present in the 'temp' list,
    # iterate
    for m in temp[1:]:
        try:
            
            # extract the embedding associated with the token represented by 'm' in 'temp' list
            # and, store it in 'a'
            a = embeddings[m.lower()]
        
        except KeyError:
            
            # if the token is not present in the 'embeddings' dictionary,
            # then give that token a vector representation of unknown i.e 'unk'
            a = model['unk']
        
        # append the embeddings associated with token 'm' in 'one_vector' list
        one_vector.append(a)
     
    # for different snippets the size of left context(i.e the number of tokens present in the left context) 
    # will be different.
    
    # and, if you recollect the description of the model from the paper, you must understand that for the snippet,
    # [left context, candidate word, right context] is what is going to be passed to the RNN's.
    # therefore, we need to agree on some bare minimum size(78 is considered to be the are minimum size here) of the 
    # vector representation of left context
    
    # if the number of words in the left context is less than 78, then the representation of left context will 
    # padded with the 78 - length_l[i] vector representation of '<pad>'
    one_vector.extend([embeddings['<pad>'] for x in range(78-length_l[i])])
    
    # append the embedding of the left context associated with the i'th snippet to 
    # the list 'keras_left_context'
    keras_left_context.append(one_vector)

# How many left context embeddings are present in the list 'keras_left_context' ?

In [37]:
len(keras_left_context)

5913

### And, you should not be surprised to see 5913, coz the number of snippets in 'pre_processed_dataset.csv' is 5913.
### And, associated with every snippet, we've the left context. Hence, 5913.

# Lets investigate the embedding of the left context of the snippet present at 0'th index in the data frame 'ppd

In [38]:
keras_left_context[0]

[array([-4.900e-03,  1.182e-01,  1.624e-01,  2.690e-02,  4.950e-02,
         2.310e-02, -7.790e-02, -1.406e-01,  2.031e-01, -2.450e-02,
        -1.220e-01,  1.272e-01,  2.530e-02, -7.300e-02, -1.001e-01,
        -8.230e-02,  1.466e-01,  6.970e-02,  3.773e-01,  9.440e-02,
        -1.252e-01, -1.066e-01,  1.470e-02,  8.350e-02, -1.175e-01,
        -3.750e-02,  4.070e-02, -1.713e-01,  1.235e-01,  2.936e-01,
        -2.821e-01, -2.040e-01, -2.032e-01, -2.082e-01, -2.530e-02,
        -1.780e-02, -4.300e-02, -9.650e-02,  2.116e-01,  3.493e-01,
         1.191e-01,  4.514e-01,  2.895e-01, -2.665e-01,  6.600e-02,
        -1.805e-01,  3.117e-01, -9.760e-02, -2.016e-01,  1.777e-01,
        -3.787e-01, -1.879e-01, -8.440e-02,  1.550e-02, -1.629e-01,
        -1.199e-01,  3.950e-02, -3.697e-01,  8.460e-02, -1.583e-01,
        -1.043e-01, -3.539e-01,  9.350e-02, -6.020e-02, -6.040e-02,
         1.600e-02, -2.639e-01,  6.630e-02,  1.417e-01, -3.418e-01,
        -5.940e-02,  6.310e-02, -1.508e-01,  2.4

### If you go through the above produced output carefully, you'll understand that there are 77 embeddings of 'pad' and one embedding of 'start'.

### Looking at the embedding of the left context associated with 0'th snippet in the data frame 'ppd', one can say that each left context embedding is the collection of 78 vectors and each vector in turn is of size 300.

# Creation of the embedding for left context

In [39]:
# following list will maintain the embedding of right context associated with candidate 
# word present in the i'th snippet in 'pre_processed_dataset.csv' at the i'th index
keras_right_context=[]

# iterate over all the right contexts of the snippets present in the
# dataframe 'ppd'('ppd' was created from 'pre_processed_dataset.csv')
for i in range(len(ppd['right_context'])):
    
    # one_vector will maintain the embedding for the right context associated with 
    # i'th snippet
    one_vector=[]
    
    # temp stores the list of tokens generated after tokenizing the right context 
    # of the i'th snippet
    temp=tokenizer.tokenize(ppd['right_context'][i])
    
    # for every token(except the last present at -1'th index i.e. last index) present in the 'temp' list,
    # iterate
    for m in temp[:-1]:
        
        try:
            
            # extract the embedding associated with the token represented by 'm' in 'temp' list
            # and, store it in 'a'
            a = embeddings[m.lower()]
        
        except keyError:
            
            # if the token is not present in the 'embeddings' dictionary,
            # then give that token a vector representation of unknown i.e 'unk'
            a = model['unk']
        
        # append the embeddings associated with token 'm' in 'one_vector' list
        one_vector.append(a)
    
    # as given in the paper, the text will be appended with <end> tag in the end
    # to mark the ending of the text
    
    # therfore, lets append the vector representation of <end> to the list 'one_vector',
    # which maintains the embedding of the right context
    one_vector.append(model['end'])
    
    # if the number of words in the right context is less than 78, then the representation of right context will 
    # padded with the (78 - length_r[i]) vector representation of '<pad>'
    one_vector.extend([embeddings['<pad>'] for x in range(78-length_r[i])])
    
    # append the embedding of the right context associated with the i'th snippet to 
    # the list 'keras_right_context'
    keras_right_context.append(one_vector)

# How many right context embeddings are present in the list 'keras_right_context' ?

In [40]:
len(keras_right_context)

5913

### And, you should not be surprised to see 5913, coz the number of snippets in 'pre_processed_dataset.csv' is 5913.
### And, associated with every snippet, we've the right context. Hence, 5913.

# Lets investigate the embedding of the right context of the snippet present at 0'th index in the data frame 'ppd'

In [41]:
keras_right_context[0]

[array([ 5.079e-01, -3.190e-02, -6.280e-02, -5.522e-01, -5.604e-01,
         1.172e-01,  1.749e-01,  5.100e-02, -7.208e-01, -2.383e-01,
         6.340e-02,  6.465e-01,  5.585e-01, -2.965e-01,  9.850e-02,
        -1.035e-01,  8.050e-02, -7.610e-02,  1.044e-01,  6.540e-02,
         2.407e-01,  4.986e-01,  1.315e-01,  1.698e-01, -1.762e-01,
        -4.470e-02,  3.908e-01,  3.057e-01,  1.018e-01,  4.668e-01,
         3.002e-01, -3.408e-01,  2.960e-01,  3.034e-01,  1.530e-01,
         5.560e-02, -3.150e-02, -2.057e-01, -2.712e-01,  2.260e-02,
        -1.096e-01, -5.245e-01, -3.657e-01,  2.843e-01, -1.600e-02,
         3.214e-01,  5.020e-02, -2.276e-01, -1.826e-01, -4.747e-01,
        -5.418e-01, -1.856e-01,  1.982e-01, -2.190e-01,  1.888e-01,
         5.659e-01, -1.295e-01,  3.655e-01,  6.540e-02,  4.257e-01,
         4.786e-01, -5.446e-01,  1.566e-01,  3.837e-01, -4.960e-02,
         7.660e-02, -2.869e-01, -3.879e-01,  4.660e-02, -1.294e-01,
        -3.938e-01, -7.880e-02,  3.354e-01,  5.6

### Looking at the embedding of the right context associated with 0'th snippet in the data frame 'ppd', one can say that each right context embedding is the collection of 78 vectors and each vector in turn is of size 300.

# Creation of the Embedding of the candidate word

In [42]:
# keras_middle will hold the embedding associated with the candidate word of every snippet 
# present in the data frame 'ppd' 
keras_middle=[]

# for every candidate word associated with the i'th snippet present in the data frame 'ppd',
# iterate
for i in range(len(ppd['Candidate_words'])):
    
    # ppd['Candidate_words'][i] - represents candidate word of the i'th snippet contained in the 
    # data frame named 'ppd'
    
    # ppd['Candidate_words'][i].lower() - candidate word is changed into its lowercase representation
    
    # embeddings[ppd['Candidate_words'][i].lower()] - generates the vector representation of the 
    # candidate word associated with i'th snippet of the dataframe 'ppd'
    
    # once the embedding of the candidate word has been created , append it to the list 'keras_middle'
    keras_middle.append(embeddings[ppd['Candidate_words'][i].lower()])

KeyError: 'him.'

### The question is why the 'KeyError' was thrown. That means in the original data set 'him' along with '.' was made the candidate word for whatsoever snippet it belongs to. And as mentioned before, the model is highly likely to produce the vector representation for word without any punctuations than the word with punctuation.

### The above reasoning pushes us to use tokenize() function, so that we just get word from the candidate word and we need not worry about punctuations. And thus, it becomes highly likely that the word and the corresponding embedding will be found in the 'embeddings' list.

## Lets tokenize the candidate word and see what happens

In [43]:
# keras_middle will hold the embedding associated with the candidate word of every snippet 
# present in the data frame 'ppd' 
keras_middle=[]

# for every candidate word associated with the i'th snippet present in the data frame 'ppd',
# iterate
for i in range(len(ppd['Candidate_words'])):
    
    # ppd['Candidate_words'][i] - represents candidate word of the i'th snippet contained in the 
    # data frame named 'ppd'
    
    # ppd['Candidate_words'][i].lower() - candidate word is changed into its lowercase representation
    
    # tokenizer.tokenize(ppd['Candidate_words'][i].lower())- generates the candidate word that wont be
    # having any punctuations attached to it
    
    # candidate word without any punctuations
    candidate = tokenizer.tokenize(ppd['Candidate_words'][i].lower())
    
    # printing candidates word
    print(candidate)

['i']
['frowned']
['at']
['him']
["isn't"]
['sarcasm']
['the']
['opiate']
['of']
['the']
['masses']
["you're"]
['thinking']
['of']
['religion']
['he']
['replied']
['sarcasm']
['is']
['the']
['xanax']
['of']
['the']
['morally']
['bereft']
['fine']
['it’s']
['decided']
['vain']
['resumed']
['his']
['stride']
['i’ll']
['do']
['all']
['the']
['killing']
['and']
['you']
['do']
['all']
['the']
['stuff']
['that']
['an']
['avun-riah']
['does']
['whatever']
['the']
['hell']
['that']
['is']
['what']
['time']
['is']
['it']
['one']
["o'clock"]
['i']
['nearly']
['spit']
['out']
['some']
['soda']
['in']
['the']
['afternoon']
['no']
['in']
['the']
['morning']
["don't"]
['let']
['that']
['damn']
['sunlight']
['fool']
['you']
['it']
['lies']
['i']
['understand']
['that']
['you']
['don’t']
['want']
['to']
['marry']
['me']
['i']
['said']
['i']
['mean']
['i']
['don’t']
['know']
['why']
['since']
['i’m']
['simply']
['delightful']
['to']
['be']
['around']
['but']
['to']
['each']
['his']
['own']
['taste']
['

['the']
['moment']
['it']
["wasn't"]
['him']
['are']
['you']
['a']
['house-wife']
['mrs']
['silvers', "'"]
['he']
['asked']
["'what"]
['would']
['you']
['recommend']
['for']
['getting']
['burger']
['relish']
['out']
['of']
['a']
['white']
['shirt', "'"]
['the']
['seething']
['woman']
['cranked']
['the']
['venom-level']
['of']
['her']
['gaze']
['up']
['to']
['eleven']
['and']
['raven']
['smiled']
['pleasantly']
['back']
['i']
['don’t']
['get']
['you']
['people']
['you']
['watch']
['the']
['godfather']
['on']
['television']
['and']
['tons']
['of']
['people']
['are']
['getting']
['shot']
['and']
['stabbed']
['to']
['death']
['blood']
['splattering']
['everywhere']
['and']
['it']
['is']
['entertaining']
['but']
['when']
['they']
['killed']
['a']
['horse']
['people']
['were']
['outraged']
['social']
['media']
['has']
['turned']
['all']
['of']
['us']
['into']
['dogs']
['the']
['moment']
['a']
['single']
['dog']
['is']
['unhappy']
['with']
['something']
['it']
['starts']
['barking']
['and']
[

["eisenhower's"]
['internal']
['question']
['as']
['to']
['whether']
['patton']
['wears']
['his']
['ever-present']
['helmet']
['to']
['bed']
['he']
['made']
['a']
['sound']
['of']
['disgust']
['in']
['the']
['back']
['of']
['his']
['throat']
['oh']
['thank']
['you']
['so']
['much']
["that's"]
['what']
['every']
['man']
['wants']
['to']
['hear']
['about']
['his']
['name']
['you']
['might']
['as']
['well']
['call']
['me']
["'little"]
["pecker'"]
['while']
["you're"]
['at']
['it']
['and']
['tell']
['me']
['you']
['would']
['love']
['to']
['have']
['me']
['go']
['shopping']
['with']
['you']
['for']
['feminine']
['hygiene']
['products']
['oh']
['and']
['by']
['all']
['means']
['carry']
['a']
['big']
['sparkling']
['pink']
['bag']
['with']
['flowers']
['on']
['it']
['and']
['make']
['me']
['hold']
['it']
['owr']
['brave']
['little']
['shank']
['go']
['make']
['love']
['to']
['a']
['tube']
['sock']
['a']
["woman's"]
['weapon']
['is']
['her']
['tongue']
['you']
['are']
['in']
['good']
['shape'

### Now that we have candidates word without any punctuations,we are good to go to get the associated embedding

In [44]:
# keras_middle will hold the embedding associated with the candidate word of every snippet 
# present in the data frame 'ppd' 
keras_middle=[]


# for every candidate word associated with the i'th snippet present in the data frame 'ppd',
# iterate
for i in range(len(ppd['Candidate_words'])):
    
    # ppd['Candidate_words'][i] - represents candidate word of the i'th snippet contained in the 
    # data frame named 'ppd'
    
    # ppd['Candidate_words'][i].lower() - candidate word is changed into its lowercase representation
    
    # tokenizer.tokenize(ppd['Candidate_words'][i].lower())- generates the candidate word that wont be
    # having any punctuations attached to it
    
    # candidate word without any punctuations
    candidate = tokenizer.tokenize(ppd['Candidate_words'][i].lower())
    
    # lets have a look at the embedding of the candidate word
    print(embeddings[candidate])

TypeError: unhashable type: 'list'

## TypeError: unhashable type: 'list'. Why?
## Note:  TypeError: unhashable type: 'list' usually means that you are trying to use a list as an hash argument.
## Did you notice square brackets(representing that its a list) while we printed the candidate words after tokeninzing them?\
## The dictionary 'embedding' is expecting a string as a key and not some list

## Let us see how many candidate words are there whose vector representation can be fetched from the 'embedding' dictionary

In [45]:
# keras_middle will hold the embedding associated with the candidate word of every snippet 
# present in the data frame 'ppd'
keras_middle=[]

# 'c' represents the number of candidates word whose vector representation was found in the 
# 'embeddings' dictionary
c = 0

# for every candidate word associated with the i'th snippet present in the data frame 'ppd',
# iterate
for i in range(len(ppd['Candidate_words'])):
    
    # ppd['Candidate_words'][i] - represents candidate word of the i'th snippet contained in the 
    # data frame named 'ppd'
    
    # ppd['Candidate_words'][i].lower() - candidate word is changed into its lowercase representation
    
    # tokenizer.tokenize(ppd['Candidate_words'][i].lower())- generates the candidate word that wont be
    # having any punctuations attached to it
    
    # but the candidate word that was produced without any punctuations is actually a list
    # and we've seen that before
    
    # converting candidate word from list to string using join() function
    candidate = " ".join(tokenizer.tokenize(ppd['Candidate_words'][i].lower()))
    
    # check if the candidate word is in the dictionary or not
    if candidate in embeddings:
        
        # if candidate word is in the 'embeddings' dictionary, increment the count of 'c'
        c = c + 1

print("Number of words for which embedding was found : ", c)

Number of words for which embedding was found :  5862


### But there are 5913 snippets in the dataframe 'ppd'. So, logically, we should be having 5913 candidate words and 5913 corresponding vector representations.

### But our 'embeddings' dictionary has knowledge about just 5862 words. Remaining 51 words, its unaware of.
### So, those 51 words will be given the vector represenation of unknown which has been previously represented as 'unk'.

# A final attempt to produce the embeddings for the candidate words after we have faced different errors

In [46]:
# keras_middle will hold the embedding associated with the candidate word of every snippet 
# present in the data frame 'ppd'
keras_middle=[]

# for every candidate word associated with the i'th snippet present in the data frame 'ppd',
# iterate
for i in range(len(ppd['Candidate_words'])):
    
    # ppd['Candidate_words'][i] - represents candidate word of the i'th snippet contained in the 
    # data frame named 'ppd'
    
    # ppd['Candidate_words'][i].lower() - candidate word is changed into its lowercase representation
    
    # tokenizer.tokenize(ppd['Candidate_words'][i].lower())- generates the candidate word that wont be
    # having any punctuations attached to it
    
    # but the candidate word that was produced without any punctuations is actually a list
    # and we've seen that before
    
    # converting candidate word from list to string using join() function
    candidate = " ".join(tokenizer.tokenize(ppd['Candidate_words'][i].lower()))
    

    if candidate in embeddings:
        
        # if candidate word is in the 'embeddings' dictionary, then append
        # the corresponding vector representation to the 'keras_middle' list
        keras_middle.append(embeddings[candidate])
    
    else:
        
        # if candidate word is nt in the 'embeddings' dictionary, then append
        # the vector representation of unknown i.e 'unk' to the 'keras_middle' list
        keras_middle.append(model['unk'])

# Dump the processed dataset in a pickle file

In [47]:
# opening the file for writing in binary mode
f = open(b"Data_fast.pkl","wb")

# Saving the processed dataset in a pickle file
pickle.dump(zip(keras_left_context,keras_right_context,keras_middle,ppd['target_status']),f)

# Shape of 'keras_left_context'
# Can you decode as to what each dimensions of the shape represent ? 

In [48]:
np.shape(keras_left_context)

(5913, 78, 300)

# Shape of 'keras_right_context'
# Can you decode as to what each dimensions of the shape represent ? 

In [49]:
np.shape(keras_right_context)

(5913, 78, 300)

# Shape of 'keras_middle'
# Can you decode as to what each dimensions of the shape represent ? 

In [50]:
np.shape(keras_middle)

(5913, 300)