In [45]:
import numpy as np
import nltk
nltk.download('punkt')
nltk.download("gutenberg") 
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import time

[nltk_data] Downloading package punkt to /home1/tejomay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home1/tejomay/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home1/tejomay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home1/tejomay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home1/tejomay/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


<h2>Obtaining data from NLTK</h2>

In [2]:
def preprocess_corpus(data):

  stop_words = set(stopwords.words('english'))   
  punctuation = {'.','!', "'", "''", '(', ')', ',', '.', ':', ';', '?', '[', ']', '``', ' ', '_', '"','*','%','$','&','+','-','/','\\','`','<','>','=','@'}

  cleaned_data = []
  unique_words = []
  length = 0

  lemmatizer = WordNetLemmatizer()

  for sent in data:
    new_sent = []

    for word in sent:
      # removing any punctuations appended with the word
      for w in word: 
          if(w in punctuation or w.isnumeric()):
            word = word.replace(w,'')

      # lowercasing the word
      word = word.lower()
      # removing stopwords and digits
      if(word not in stop_words and word.isnumeric() == False and word !=''): 
          # lemmatize
#           word = lemmatizer.lemmatize(word)  
          # adding cleaned words to the sentence
          new_sent.append(word)
          # checking for unique words
          if(word not in unique_words):
            unique_words.append(word)

    # adding valid sentences to the data
    if(len(new_sent) > 1):
#         final_sent = []
#         for word in new_sent:
#             if word not in final_sent:
#                 final_sent.append(word)
        cleaned_data.append(new_sent)
        length += len(new_sent)

  print("Average length of sentences : ", length/(len(cleaned_data)))

  return cleaned_data, unique_words

  
def create_vocab(words):
    sorted_words = sorted(words) 
    vocab = {word: sorted_words.index(word) for word in sorted_words}
    vocab['.'] = len(sorted_words)
    return sorted_words, vocab


def word_to_onehot(word, vocab):
  onehot = np.zeros((len(vocab),))
  # print(word)
  onehot[vocab[word]] = 1.0
  return onehot

In [3]:
# create data instances in the form of [context_1, context_2,....,context_n, label word]
def create_data(sentences, window_size=1):
    created_data = []
    count = 0
    for sentence in sentences:
        sentence = ['.']*window_size + sentence + ['.']*window_size 
        for i in range(window_size, len(sentence) - window_size):
            context_and_word = []
            for j in range(i-window_size, i+window_size+1):
                if j != i:
                    context_and_word.append(sentence[j])
            context_and_word.append(sentence[i])
            created_data.append(context_and_word)
        count += 1
        if count%10000 == 0:
            print(f"{count} sentences processed.")

    return created_data

In [4]:
sentences = nltk.corpus.gutenberg.sents()
# print("No. of sentences: ",len(sentences))
# print("A sample sentence: ",sentences[0])
data, unique_words = preprocess_corpus(sentences)
print("No. of samples: ",len(data))
print("A sample sentence: ", data[0]) 
print("No. of unique words: ", len(unique_words))
sorted_words, vocab = create_vocab(unique_words)

Average length of sentences :  11.38591095652952
No. of samples:  89417
A sample sentence:  ['emma', 'jane', 'austen']
No. of unique words:  41361


In [5]:
final_data = create_data(data, window_size=1)

5000 sentences processed.
10000 sentences processed.
15000 sentences processed.
20000 sentences processed.
25000 sentences processed.
30000 sentences processed.
35000 sentences processed.
40000 sentences processed.
45000 sentences processed.
50000 sentences processed.
55000 sentences processed.
60000 sentences processed.
65000 sentences processed.
70000 sentences processed.
75000 sentences processed.
80000 sentences processed.
85000 sentences processed.


<h2>Training</h2>

In [77]:
#implements forward and backward pass given batch and weight, returns updated weights
def single_pass(context, label_word, context_weight, softmax_weight, lr):  # data in the form of list of context words
    # context word: batch_size x |V|   context_weight: |V| x d    softmax_weight: d x |V|
    net = label_word @ context_weight
    losses = []
    d_net = np.zeros(net.shape)
    d_softmax_weight = np.zeros(softmax_weight.shape)
    #forward pass
    for word in context:
        h = (net @ softmax_weight)
        exp_out = np.exp(h - h.max(1, keepdims=True))
        out = exp_out / np.sum(exp_out, axis=1, keepdims=True)
        argmax_label = np.argmax(word, axis=1)
        context_loss = (-1.0 / out.shape[0]) * np.sum(np.log(1e-16 + out[np.arange(out.shape[0]), argmax_label]))
        losses.append(context_loss)
    
    #backward pass
        interm = -word * (1.0 - out)
        d_softmax_weight += net.T @ interm
        d_net = interm @ softmax_weight.T
    
    d_softmax_weight /= len(context)
    d_net /= len(context)
    d_context_weight = label_word.T @ d_net

    softmax_weight -= lr * d_softmax_weight
    context_weight -= lr * d_context_weight
    loss = sum(losses) / len(context)
    
    return loss, context_weight, softmax_weight

# returns context and label in the form of onehot vectors
def convert_to_onehot(batch):
    context = []
    label_word = np.zeros((len(batch), len(vocab)))
    for i in range(len(batch[0])-1):
        vectors = np.zeros((len(batch), len(vocab)))
        for j in range(len(batch)):
            onehot = word_to_onehot(batch[j][i], vocab)
            vectors[j,:] = onehot
        context.append(vectors)
    for j in range(len(batch)):
        onehot = word_to_onehot(batch[j][-1], vocab)
        label_word[j,:] = onehot
    
    return context, label_word


# data shape: N x C x |V|
# data has first C-1 elements as context and last element as label word
def train(data, batch_size=4, num_epochs=100, lr=0.01, dim=100, method="cbow"):
    context_weights = np.random.randn(len(vocab), dim)
    softmax_weights = np.random.randn(dim, len(vocab))
    
    for epoch in range(num_epochs):
        step = 0
        losses = []
        tolerance = 0
        final_weights = [context_weights, softmax_weights]
        time_i = time.time()
        epoch_loss = 0.0
        for i in range(0, len(data), batch_size):

            batch = data[i:i+batch_size]
            context, label_word = convert_to_onehot(batch)
            loss, context_weights, softmax_weights = single_pass(context, label_word, context_weights, softmax_weights, lr)
            epoch_loss += loss
            step += 1
            if step % 50 == 0:
                time_j = time.time()
                print(f"Epoch: {epoch+1} | Step: {step} | Loss: {loss:.5f}")
        time_j = time.time()
        epoch_loss /= step
        print(f"Epoch {1} finished | Epoch loss: {epoch_loss} | Time: {(time_j - time_i) / 60 : .5f} mins")
        losses.append(epoch_loss)
        if len(losses) > 10:
            if losses[epoch] >= losses[epoch-1]:
                tolerance += 1
            else:
                final_weights = [context_weights, softmax_weights]
                tolerance = 0
            if tolerance > 3:
                break
            
    return final_weights

In [78]:
final_weights = train(final_data[:256000], batch_size=1024, num_epochs=30, lr=0.01, dim=100)

Epoch: 1 | Step: 50 | Loss: 33.19232
Epoch: 1 | Step: 100 | Loss: 31.65288
Epoch: 1 | Step: 150 | Loss: 34.51743
Epoch: 1 | Step: 200 | Loss: 34.91714
Epoch: 1 | Step: 250 | Loss: 35.08074
Epoch 1 finished | Epoch loss: 33.511906310587555 | Time:  26.70667 mins


KeyboardInterrupt: 

(4,)