In [0]:
#Importing Libraries
import torch
from torchtext import data
import numpy as np
import re
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import randint

In [2]:
# Pick the compute device once: the first CUDA GPU when one is visible,
# otherwise fall back to the CPU. The name `cuda0` is kept even on CPU-only
# machines because the rest of the notebook refers to it.
cuda0 = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on the GPU" if cuda0.type == "cuda" else "Running on the CPU")

Running on the GPU


In [0]:
#Loading custom NLP files using TabularDataset
# Reads the training text from a Google Drive path (the /content/drive prefix
# suggests a Colab mount -- TODO confirm / adjust when running elsewhere).
# Only one column is declared: a `text` field built from a default data.Field(),
# so every example exposes its tokens as `example.text`.
train_file = data.TabularDataset(
    path ='/content/drive/My Drive/Colab Notebooks/data/wiki.train.tokens.txt', 
    format='tsv',
    fields=[('text', data.Field())]
)


In [5]:
# Number of examples loaded from the training file (this count includes the
# examples whose token list is empty).
print("Total lines in train file : ",len(train_file))

Total lines in train file :  36718


The training data contains two kinds of lines: 1) section titles and 2) contents (body text).

One example is shown below

In [6]:
# Example 1 is a heading line; its tokens are re-joined with spaces for display.
ex = train_file[1]
print("Title : "," ".join(ex.text))
# Example 3 is a body paragraph.
ex = train_file[3]
print("Content : "," ".join(ex.text))
# Example 2 is printed raw (not joined) to show that it is an empty token list.
ex = train_file[2]
print("train_file[2] :", ex.text)

Title :  = Valkyria Chronicles III =
Content :  Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " .
train_file[2] : []


Looking at the data, it is clear that it contains:

*   Punctuation (e.g. ':', '.', '@', etc.)
*   Unusual proper names (e.g. Valkyria)
*   Quoted dialogue
*   Empty lines, loaded as empty token lists [ ] (e.g. train_file[2])



### Dataset creation

In [0]:
def clean_text_and_tokenization(rawfile):
  """Flatten a tokenized dataset into one cleaned, lower-cased list of words.

  Args:
    rawfile: sized, indexable collection of examples. Each example exposes a
      ``text`` attribute holding a list of string tokens (possibly empty).

  Returns:
    list[str]: the remaining words in corpus order. The characters
    ``' | : . ( ) , @ " [ ] * & $ ? = ;`` and all digits are stripped,
    hyphens act as word separators, and tokens that end up empty are dropped.
  """
  # Gather the tokens of every non-empty example into a single stream.
  words = []
  for i in range(len(rawfile)):
    tokens = rawfile[i].text
    if tokens:
      words.extend(tokens)

  # Join the tokens directly instead of going through str(list): the list repr
  # escapes backslashes and non-printable characters, and those escape
  # sequences used to leak into the vocabulary as garbage tokens.
  clean_doc = " ".join(words).lower()

  # Inside a character class "|" is a literal, so it is stripped too (this
  # matches what the previous pattern actually did).
  clean_doc = re.sub(r'[\'|:.(),@"\[\]*&$?=;]', '', clean_doc)
  clean_doc = re.sub(r'-', ' ', clean_doc)    # hyphens separate words
  clean_doc = re.sub(r'\d+', '', clean_doc)   # remove digits

  # Tokenization: split on spaces and discard the empty strings left behind
  # wherever punctuation or digits were removed.
  return [word for word in clean_doc.split(" ") if word]

In [8]:
# Clean and tokenize the whole training file, then report the corpus size
# (total tokens) and the number of distinct tokens.
clean_doc_tokenized = clean_text_and_tokenization(train_file)
print("Total number of words : ", len(clean_doc_tokenized))
print("unique number of words : ", len(set(clean_doc_tokenized)))

Total number of words :  1701090
unique number of words :  27401


In [0]:
# Window size: 10 context (input) words followed by 1 target (output) word.
sequence_length = 10 + 1

def create_dataset(clean_doc_tokenized,N):
  """Build (context, target) training pairs with a sliding window of N tokens.

  Args:
    clean_doc_tokenized: list of word tokens, in corpus order.
    N: window length; the first N-1 tokens of each window form the context
      and the N-th token is the target.

  Returns:
    list of ``(context, target)`` tuples, one per window position, where
    ``context`` is a list of N-1 tokens and ``target`` is a single token.
    Empty when the corpus is shorter than N.
  """
  context_len = N - 1
  window_count = len(clean_doc_tokenized) - N + 1
  return [
      (clean_doc_tokenized[start:start + context_len],
       clean_doc_tokenized[start + context_len])
      for start in range(window_count)
  ]

In [0]:
# For ease of training, only the first 10,001 tokens (~10K words) of the corpus are used.
clean_doc_tokenized_truncated = clean_doc_tokenized[:10000+1] 
# Slide an 11-token window over the truncated corpus to build (context, target) pairs.
dataset=create_dataset(clean_doc_tokenized_truncated,sequence_length)

### Embedding + Training

In [13]:
unique_words = set(clean_doc_tokenized_truncated) # unique words present in the corpus
# Assign ids in sorted order. Iterating the set directly depends on string
# hashing, which can differ between kernel sessions, so the same word could get
# a different id on every run; sorting makes the mapping reproducible.
word_to_ids = {word: idx for idx, word in enumerate(sorted(unique_words))}

print("vocabulary size :", len(unique_words))

vocabulary size : 2318


In [0]:
class NGramLanguageModeler(nn.Module):
  """Feed-forward n-gram language model.

  The context word ids are embedded, the embeddings are concatenated, passed
  through one ReLU hidden layer and projected to log-probabilities over the
  vocabulary.

  Args:
    vocab_size: number of distinct words (size of the embedding table and of
      the output layer).
    embedding_dim: dimensionality of each word vector.
    context_size: number of context words fed to the model per prediction.
    hidden_dim: width of the hidden layer (defaults to 4096, the value that
      was previously hard-coded).
  """

  def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=4096):
    super(NGramLanguageModeler, self).__init__()
    # Learned word vectors (the "encoding lexical semantics" technique).
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)           # embedding layer
    self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)  # hidden layer
    self.linear2 = nn.Linear(hidden_dim, vocab_size)                    # output layer, out dim = vocab_size

  def forward(self, inputs):
    """Return log-probabilities of the next word.

    Args:
      inputs: LongTensor of word ids, either 1-D with ``context_size`` entries
        (a single context) or 2-D ``(batch, context_size)``.

    Returns:
      Tensor of shape ``(batch, vocab_size)`` holding log-softmax scores;
      ``batch`` is 1 for a 1-D input.
    """
    # Treat a single context as a batch of one so both input forms share a path.
    if inputs.dim() == 1:
      inputs = inputs.unsqueeze(0)
    # Concatenate the context embeddings of each row into one feature vector.
    embeds = self.embeddings(inputs).view((inputs.size(0), -1))
    out = F.relu(self.linear1(embeds))
    out = self.linear2(out)
    log_probs = F.log_softmax(out, dim=1)
    return log_probs

In [0]:
def train_model(model):
  """Train `model` on the notebook-level `dataset`, one example per step.

  Reads these notebook globals: `cuda0` (device), `dataset` (list of
  (context, target) pairs), `word_to_ids`, `no_of_epoches`, `loss_function`
  and `optimizer`. Prints the mean loss per example after every epoch.

  Args:
    model: the network to train; it is moved to `cuda0` before training.
  """
  model = model.to(cuda0)  # to transfer model on gpu
  # Make sure the model is in training mode, e.g. when re-training after
  # generate_words() has switched it to eval mode.
  model.train()

  # Average over the actual number of examples instead of a hard-coded count,
  # so the reported loss stays correct when the corpus size changes.
  num_examples = len(dataset)

  for epoch in range(no_of_epoches):
    loss_per_epoch = 0.0
    for context, target in dataset:
      # Build the input and target tensors directly on the training device.
      context_ids = torch.tensor([word_to_ids[w] for w in context], dtype=torch.long, device=cuda0)
      target_ids = torch.tensor([word_to_ids[target]], dtype=torch.long, device=cuda0)

      model.zero_grad()
      output = model(context_ids)
      loss = loss_function(output, target_ids)
      loss.backward()
      optimizer.step()
      loss_per_epoch += loss.item()

    # Guard against an empty dataset to avoid dividing by zero.
    mean_loss = loss_per_epoch / num_examples if num_examples else 0.0
    print("EPOCH : "+str(epoch+1)+" | Loss : "+str(mean_loss))


In [17]:
#Hyperparameters
embedding_dimensions = 10 # one word is converted into a vector of 10 dimensions
context_size = sequence_length -1       # no of input words for training the model
learning_rate = 0.001     # step size for SGD
momentum = 0.9            # SGD momentum factor
no_of_epoches =7          # full passes over `dataset`; read as a global by train_model

#defining model
# The output layer has one unit per vocabulary word, hence len(unique_words).
model = NGramLanguageModeler(len(unique_words), embedding_dimensions, context_size)
# defining loss function as negative log likelihood loss
# (NLLLoss expects log-probabilities, which the model's log_softmax output provides)
loss_function = nn.NLLLoss()
# defining optimizer as stochastic gradient descent
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

#Train the model
# train_model picks up loss_function, optimizer and no_of_epoches from this cell's globals.
train_model(model)

EPOCH : 1 | Loss : 6.6577825699467255
EPOCH : 2 | Loss : 4.461004046727535
EPOCH : 3 | Loss : 2.59800486936427
EPOCH : 4 | Loss : 0.9943666979305117
EPOCH : 5 | Loss : 0.3737648253373562
EPOCH : 6 | Loss : 0.22689246281812933
EPOCH : 7 | Loss : 0.14445581031435067


### Generating text

In [0]:
def generate_words(N_words,model,input_text):
  """Greedily generate `N_words` words that continue `input_text`.

  At every step the model scores the current context window, the highest
  scoring word is emitted, and the window slides forward by one word.
  Reads the notebook globals `word_to_ids` and `cuda0`; the model is expected
  to already live on `cuda0`.

  Args:
    N_words: number of words to generate.
    model: trained language model mapping a tensor of context word ids to
      log-probabilities over the vocabulary.
    input_text: list of seed words (all must be keys of `word_to_ids`).
      The list is not modified.
  """
  print("Input sequence of given words : "," ".join(input_text))

  # Work on a copy: the seed is usually a context list taken straight from
  # `dataset`, and appending to it in place would corrupt that training example.
  window = list(input_text)
  # Reverse vocabulary built once, instead of scanning word_to_ids per word.
  ids_to_word = {index: word for word, index in word_to_ids.items()}
  generated = []

  # put model on evaluation mode
  model.eval()

  # Inference only: no gradients are needed.
  with torch.no_grad():
    for _ in range(N_words):
      input_text_ids = torch.tensor([word_to_ids[w] for w in window], dtype=torch.long, device=cuda0)
      output = model(input_text_ids)
      _, max_index = torch.max(output, 1) # picking the maximum probability word
      word = ids_to_word[max_index.item()]
      generated.append(word)
      # Slide the context window: drop the oldest word, add the new one.
      window = window[1:] + [word]

  # Each word is prefixed with a space, matching the original output format.
  print("Generated words :", "".join(" " + word for word in generated))


In [19]:
# random.randint is inclusive at BOTH ends, so the upper bound must be
# len(dataset) - 1; using len(dataset) could raise IndexError.
# The context is copied so the seed passed to generation is independent of
# the list stored inside `dataset`.
input_sequence_of_words = list(dataset[randint(0, len(dataset) - 1)][0])
N_words = 20 # number of words to be generated
generate_words(N_words,model,input_sequence_of_words)

Input sequence of given words :  take possession of the arsenal in the name of the
Generated words :  united states the soldiers would be allowed safe passage in any direction carrying any personal and public property besides munitions


**Note:** The training corpus used here is truncated to about 10K words (the first 10,001 tokens), so the vocabulary size is only around 2K (2,318 unique words). With a larger training corpus and vocabulary, the generated text will be more diverse.

Moreover, no pre-trained embeddings are used in this code. PyTorch's nn.Embedding layer learns the word vectors from scratch, jointly with the rest of the model.