**Ronaldlee Ejalu**

**CSC 583**

**HW#5** 

# Part 11

## Task 1

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import os
import nltk
nltk.download("punkt")
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
import itertools
torch.manual_seed(1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<torch._C.Generator at 0x7f5b1c22c890>

In [None]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
"""
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (
        [raw_text[i - j - 1] for j in range(CONTEXT_SIZE)]
        + [raw_text[i + j + 1] for j in range(CONTEXT_SIZE)]
    )
"""
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (
        [raw_text[i - CONTEXT_SIZE], raw_text[i - (CONTEXT_SIZE - 1)], \
              raw_text[i + (CONTEXT_SIZE - 1)], raw_text[i + CONTEXT_SIZE]]
    )
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
      super(CBOW, self).__init__()
      self.embeddings = nn.Embedding(vocab_size, embedding_dim)
      self.linear1 = nn.Linear(embedding_dim, 128) # hidden layer
      self.activation_function1 = nn.ReLU() # activation layer
      self.linear2 = nn.Linear(128, vocab_size) # another hidden layer
      # Applies the log(Softmax(x)) function to an n-dimensional input Tensor.
      self.activation_function2 = nn.LogSoftmax(dim = -1) # activation layer


    def forward(self, inputs):
      embeds = sum(self.embeddings(inputs)).view(1, -1)
      out = self.linear1(embeds)
      out = self.activation_function1(out)
      out = self.linear2(out)
      out = self.activation_function2(out)
      return out

# Create your model and train. Here are some functions to help you make
# the data ready for use by your module.

def make_context_vector(context, word_to_ix):
  idxs = [word_to_ix[w] for w in context]
  return torch.tensor(idxs, dtype=torch.long)

make_context_vector(data[0][0], word_to_ix)  # example

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


tensor([15, 24,  5, 44])

In [None]:
losses = []
loss_function = nn.NLLLoss()
EMBEDDING_DIM = 10
print('The vocabulary size is %d.' %vocab_size)
model = CBOW(vocab_size, EMBEDDING_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in data:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

# To get the embedding of a particular word, e.g. "beauty"
print("The embedding vector for 'procecess' is:")
print(model.embeddings.weight[word_to_ix["processes"]])

The vocabulary size is 49.
[230.22597670555115, 225.9807469844818, 221.90891313552856, 218.00054550170898, 214.24484992027283, 210.63149189949036, 207.1489531993866, 203.79080986976624, 200.54813313484192, 197.41282880306244]
The embedding vector for 'procecess' is:
tensor([ 2.4161,  1.0208, -0.4396, -1.7347, -1.2398,  1.5813, -1.1160,  0.7683,
        -0.5879,  2.1180], grad_fn=<SelectBackward0>)


In [None]:
process_idx = word_to_ix['processes']
processes_tensor = torch.tensor(process_idx, dtype=torch.long)
process_similar = {}
cos = nn.CosineSimilarity(dim=0)
for i in range(len(data)):
  item = data[i][0]
  for num in range(len(item)):
    word_str = 'processes' + '-' + item[num] 
    if item[num] == 'processes':
      pass
    else:
      output = cos(model.embeddings.weight[word_to_ix["processes"]], model.embeddings.weight[word_to_ix[item[num]]])
      res = output.detach().numpy()
      ts = res.tobytes() 
      arr = np.frombuffer(ts, dtype=res.dtype) # we change to an array
      process_similar[word_str] =  arr[0]

In [None]:
# convert the dictionary into pandas data frame
df_cosineSimProcesses = pd.DataFrame(process_similar.items(), columns=['similar-words', 'cosine_value'])

In [None]:
# sort the data frame
df_cosineSimProcesses_Sorted = df_cosineSimProcesses.sort_values(by='cosine_value', ascending=False)
print("The top three words that are closest to 'processes' by cosine similarity:")
df_cosineSimProcesses_Sorted.head(3)

The top three words that are closest to 'processes' by cosine similarity:


Unnamed: 0,similar-words,cosine_value
42,processes-conjure,0.564105
18,processes-they,0.469889
45,processes-with,0.415042


## Task 11

In [None]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
%cd /gdrive/MyDrive/CSC583

/gdrive/MyDrive/CSC583


In [None]:
directoryPath_pos = '/gdrive/MyDrive/CSC583/data/homework5/txt_sentoken/pos/pos'

In [None]:
print('There are %d files in the pos directory.' %len(os.listdir(directoryPath_pos)))

There are 1000 files in the pos directory.


In [None]:
file_L = ['cv199_9629.txt', 'cv261_10954.txt', 'cv315_11629.txt', \
          'cv368_10466.txt', 'cv401_12605.txt', 'cv453_10379.txt', \
          'cv519_14661.txt', 'cv729_10154.txt', 'cv782_19526.txt', 'cv900_10331.txt']

Read all the reviews files and store them in a list

In [None]:
reviews_L = []
for num in range(len(file_L)):
  fileNamePath = "data/homework5/txt_sentoken/pos/pos/" + file_L[num]
  with open(fileNamePath, 'r',  encoding = 'ISO-8859-1') as sampleFile:
    fileContents = sampleFile.read()
    # normalize all words to lower case
    reviews_L.append(fileContents.lower())

In [None]:
len(reviews_L)

10

In [None]:
def populateWordFrequency(filtered_reviews_200):
  """Function that a list afte tokening the text, processing the 
  individual words and then derives the frequency of the words 
  """
  # reset the dictionary
  wordFrequency = {}

  # populate the dictionary with the frequecny of words
  for i in range(len(filtered_reviews_200)):
    item = filtered_reviews_200[i]
    if item in wordFrequency:
      wordFrequency[item] += 1
    else:
      wordFrequency[item] = 1

  # sort the dictionary and create a list of tuples
  wordSorted = sorted((value, key) for (key, value) in wordFrequency.items())
  sortedWordFrequency = [(key, value) for (value, key) in wordSorted]

  # create a data frame
  df = pd.DataFrame(sortedWordFrequency, columns=['Word', 'Freq'])
  # sort the data frame in descending order
  df.sort_values(by=['Freq'], ascending=False, inplace=True)
  # reset the index
  df.reset_index(drop=True, inplace=True)
  df.index = np.arange(1, len(df) + 1)
  df.head()
  
  #return df.head(n).values.tolist()
  return df.values.tolist()

In [None]:
# filter stop words with the nltk
stopWords = set(stopwords.words('english'))
# create a tokenizer based on a regular expression.
# '[a-zA-Z0-9]+' captures all alphanumeric characters
tokenizer = RegexpTokenizer(r"[a-zA-Z0-9]+")
filtered_L = [] # list that stores all the reviews of the different tokens
for i in range(len(reviews_L)):
  filtered_reviews = []
  tokenized_review = tokenizer.tokenize(reviews_L[i])
  for tokenized_w in tokenized_review:
    if tokenized_w not in stopWords:
      # add the filtered tokens to a list
      filtered_reviews.append(tokenized_w)
  # take the first 200 words in the order of sequence and them to the list of list
  filtered_L.append(filtered_reviews[0:200])
#len(filtered_L[1])
# merge all the reviews together
merged_reviews = list(itertools.chain.from_iterable(filtered_L))
print('There are %d tokens in the merged reviews.' %len(merged_reviews))
#most_frequent_words_L = [] # list of most frequent words for review
most_frequent_tokens = populateWordFrequency(merged_reviews)
most_frequent_149_Words_L = []
for i in range(len(most_frequent_tokens[:149])):
  # populate the list of the reviews with the tokens of the frequent words as per requirements
  most_frequent_149_Words_L.append(most_frequent_tokens[i][0])

vocab_reviews = set(most_frequent_149_Words_L)
# add the 'unk' to make the vocabulary size 150
vocab_reviews.add('<unk>')

There are 2000 tokens in the merged reviews.


**create the lookup tables** (word -> index and index -> word)

In [None]:
word_to_ix = {word: i for i, word in enumerate(vocab_reviews)}
index_to_word = {i: word for (i, word) in enumerate(vocab_reviews)}

In [None]:
data_L = [] # initialize a list of data 
for num in range(len(filtered_L)):
  data=[]
  for i in range(CONTEXT_SIZE, len(filtered_L[num]) - CONTEXT_SIZE):
    raw_text = filtered_L[num]
    context = (
        [raw_text[i - CONTEXT_SIZE], raw_text[i - (CONTEXT_SIZE - 1)], \
              raw_text[i + (CONTEXT_SIZE - 1)], raw_text[i + CONTEXT_SIZE]]
    )
    #print(context)
    target = raw_text[i]
    # ignore the case where the target word is <unk>
    # ignore the case where either left or right side in the Context turns out to be made of just <unk>
    if target == '<unk>' or context[0] == '<unk>' or context[2] == '<unk>':
      pass
    else:
      data.append((context, target))
    #print(context, target)
  # add the data to the list of data 
  data_L.append(data)

In [None]:
# create the data by combining the list of the data set
data = list(itertools.chain.from_iterable(data_L))

In [None]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
      super(CBOW, self).__init__()
      self.embeddings = nn.Embedding(vocab_size, embedding_dim)
      #self.linear = nn.Linear(embedding_dim, 128) # hidden layer
      #self.activation_function1 = nn.ReLU() # activation layer
      #self.linear2 = nn.Linear(128, vocab_size) # another hidden layer
      # Applies the log(Softmax(x)) function to an n-dimensional input Tensor.
      self.activation_function2 = nn.LogSoftmax(dim = -1) # activation layer


    def forward(self, inputs):
      embeds = sum(self.embeddings(inputs)).view(1, -1)
      #out = self.linear(embeds)
      #out = self.activation_function1(out)
      #out = self.linear2(out)
      out = self.activation_function2(embeds)
      return out

In [None]:
losses = []
loss_function = nn.NLLLoss()
EMBEDDING_DIM = 20
VOCAB_SIZE = 150

model = CBOW(VOCAB_SIZE, EMBEDDING_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in data:


        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        #context_idxs = torch.tensor([word_to_ix.get(w, word_to_ix['<unk>']) for w in context if w == '<unk>' pass], dtype=torch.long)
        context_idxs = torch.tensor([word_to_ix.get(w, word_to_ix['<unk>']) for w in context if w != '<unk>'], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        if word_to_ix.get(target) is None:
          pass
        else:
          #print(log_probs.shape, torch.tensor([word_to_ix[target]], dtype=torch.long).shape)
          loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

          # Step 5. Do the backward pass and update the gradient
          loss.backward()
          optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
          total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

# To get the embedding of a particular word, e.g. "beauty"
#print("The embedding vector for 'procecess' is:")
#print(model.embeddings.weight[word_to_ix["processes"]])

IndexError: ignored

Finally, show the embedding made for these words below.  Then find the top 3 most similar words (measured by cosine similarity) for each word (but not including itself), for both context sizes, and write your comments in the report.

1. titanic

In [None]:
def deriveCosineSim(word):
  """Function that derives the cosine similarity of any word"""
  process_idx = word_to_ix[word]
  processes_tensor = torch.tensor(process_idx, dtype=torch.long)
  process_similar = {}
  cos = nn.CosineSimilarity(dim=0)
  for i in range(len(data)):
    item = data[i][0]
    for num in range(len(item)):
      word_str = word + '-' + item[num]
      if item[num] == word:
        pass
      else:
        output = cos(model.embeddings.weight[word_to_ix[word]], model.embeddings.weight[word_to_ix.get(item[num], word_to_ix['<unk>'])])
        res = output.detach().numpy()
        ts = res.tobytes()
        arr = np.frombuffer(ts, dtype=res.dtype) # we change to an array
        process_similar[word_str] =  arr[0]
  return process_similar

In [None]:
titanic_processSim = deriveCosineSim('titanic')

In [None]:
# convert the dictionary into pandas data frame
df_cosineSimTitanic = pd.DataFrame(titanic_processSim.items(), columns=['similar-words', 'cosine_value'])

In [None]:
# sort the data frame
df_cosineSimTitanic_Sorted = df_cosineSimTitanic.sort_values(by='cosine_value', ascending=False)
print("The top three words that are closest to 'titanic' by cosine similarity:")
df_cosineSimTitanic_Sorted.head(3)

The top three words that are closest to 'titanic' by cosine similarity:


Unnamed: 0,similar-words,cosine_value
699,titanic-dreams,0.468602
98,titanic-leads,0.461909
120,titanic-man,0.418857


2.acting

In [None]:
acting_processSim = deriveCosineSim('acting')

In [None]:
# convert the dictionary into pandas data frame
df_cosineSimActing = pd.DataFrame(acting_processSim.items(), columns=['similar-words', 'cosine_value'])

In [None]:
# sort the data frame
df_cosineSimActing_Sorted = df_cosineSimActing.sort_values(by='cosine_value', ascending=False)
print("The top three words that are closest to 'acting' by cosine similarity:")
df_cosineSimActing_Sorted.head(3)

The top three words that are closest to 'acting' by cosine similarity:


Unnamed: 0,similar-words,cosine_value
48,acting-seen,0.573217
124,acting-dicaprio,0.572614
133,acting-two,0.506357


3.great

In [None]:
great_processSim = deriveCosineSim('great')

In [None]:
# convert the dictionary into pandas data frame
df_cosineSimGreat = pd.DataFrame(great_processSim.items(), columns=['similar-words', 'cosine_value'])

In [None]:
# sort the data frame
df_cosineSimGreat_Sorted = df_cosineSimGreat.sort_values(by='cosine_value', ascending=False)
print("The top three words that are closest to 'great' by cosine similarity:")
df_cosineSimGreat_Sorted.head(3)

The top three words that are closest to 'great' by cosine similarity:


Unnamed: 0,similar-words,cosine_value
197,great-explorers,0.48418
647,great-lovett,0.451797
709,great-complex,0.445936


4.poor

In [None]:
poor_processSim = deriveCosineSim('poor')

In [None]:
# convert the dictionary into pandas data frame
df_cosineSimPoor = pd.DataFrame(poor_processSim.items(), columns=['similar-words', 'cosine_value'])

In [None]:
# sort the data frame
df_cosineSimPoor_Sorted = df_cosineSimPoor.sort_values(by='cosine_value', ascending=False)
print("The top three words that are closest to 'poor' by cosine similarity:")
df_cosineSimPoor_Sorted.head(3)

The top three words that are closest to 'poor' by cosine similarity:


Unnamed: 0,similar-words,cosine_value
183,poor-far,0.467065
353,poor-much,0.460778
225,poor-cal,0.448636
