<a href="https://colab.research.google.com/github/Shiva-Gangadhar/Next-Word-Predictor-Pytorch/blob/main/Next_word_predictor_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nltk



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk

In [3]:
with open("/content/dataset_book.txt","r", encoding = "utf-8") as file:
  document = file.read()

In [4]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
tokens = word_tokenize(document.lower())

In [6]:
vocab = {'<unk>':0}

for token in Counter(tokens).keys():
  if token not in vocab:
    vocab[token]=len(vocab)

In [7]:
input_sentences = document.split('\n')

In [8]:
def text_to_indices(sentence,vocab):
  numerical_sentences = []
  for token in sentence:
    if token in vocab:
      numerical_sentences.append(vocab[token])
    else:
      numerical_sentences.append(vocab['<unk>'])

  return numerical_sentences

In [9]:
input_numerical_sequences = []
for sentence in input_sentences:
  input_numerical_sequences.append(text_to_indices(word_tokenize(sentence.lower()), vocab))

In [10]:
len(input_numerical_sequences)

14413

In [11]:
training_sequence = []

for sentence in input_numerical_sequences:

  for i in range(1,len(sentence)):
    training_sequence.append(sentence[:i+1])


In [12]:
len(training_sequence)

134791

In [13]:
max_length_sequence = max(len(sequence) for sequence in training_sequence)
print(max_length_sequence)

23


In [14]:
padded_training_sequence = []

for sequence in training_sequence:

   padded_training_sequence.append([0]*(max_length_sequence-len(sequence)) + sequence)

In [15]:
len(padded_training_sequence[0])
padded_training_sequence[0]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3]

In [16]:
padded_training_sequence = torch.tensor(padded_training_sequence , dtype=torch.long)

In [17]:
x = padded_training_sequence[:, :-1]
y = padded_training_sequence[:, -1]

In [18]:
class CustomDataset(Dataset):

  def __init__(self,x,y):
    self.x=x
    self.y=y

  def __len__(self):
    return self.x.shape[0]

  def __getitem__(self, index):
    return self.x[index], self.y[index]

In [19]:
dataset = CustomDataset(x,y)

In [20]:
len(dataset)

134791

In [21]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [22]:
class LSTModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size,100)
    self.lstm = nn.LSTM(100, 150, batch_first=True)
    self.fc = nn.Linear(150, vocab_size)

  def forward(self,x):
    embedded = self.embedding(x)
    intermediate_hidden_states, (final_hidden_state, final_cell_state) = self.lstm(embedded)
    output = self.fc(final_cell_state.squeeze(0))
    return output

In [23]:
model = LSTModel(len(vocab))

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LSTModel(
  (embedding): Embedding(7718, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=7718, bias=True)
)

In [27]:
epochs = 100
learning_rate = 0.001

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [28]:
for epoch in range(epochs):

  total_loss=0

  for batch_x, batch_y in dataloader:

    batch_x, batch_y= batch_x.to(device), batch_y.to(device)

    optimizer.zero_grad()

    output = model(batch_x)

    loss = criterion(output, batch_y)

    loss.backward()

    optimizer.step()

    total_loss=total_loss+loss.item()

  print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")


Epoch: 1, Loss: 4248.4779
Epoch: 2, Loss: 3865.4048
Epoch: 3, Loss: 3821.7661
Epoch: 4, Loss: 3783.6457
Epoch: 5, Loss: 3781.4608
Epoch: 6, Loss: 3728.6767
Epoch: 7, Loss: 3716.2235
Epoch: 8, Loss: 3708.2423
Epoch: 9, Loss: 3688.0067
Epoch: 10, Loss: 3666.9441
Epoch: 11, Loss: 3665.4220
Epoch: 12, Loss: 3650.2961
Epoch: 13, Loss: 3630.7963
Epoch: 14, Loss: 3627.6593
Epoch: 15, Loss: 3620.7096
Epoch: 16, Loss: 3601.4918
Epoch: 17, Loss: 3602.9764
Epoch: 18, Loss: 3575.1214
Epoch: 19, Loss: 3566.6444
Epoch: 20, Loss: 3568.0377
Epoch: 21, Loss: 3577.1904
Epoch: 22, Loss: 3554.3042
Epoch: 23, Loss: 3557.4251
Epoch: 24, Loss: 3543.3826
Epoch: 25, Loss: 3552.0944
Epoch: 26, Loss: 3515.3904
Epoch: 27, Loss: 3521.7903
Epoch: 28, Loss: 3496.3696
Epoch: 29, Loss: 3508.8561
Epoch: 30, Loss: 3503.9106
Epoch: 31, Loss: 3491.5600
Epoch: 32, Loss: 3475.1335
Epoch: 33, Loss: 3494.6221
Epoch: 34, Loss: 3476.6356
Epoch: 35, Loss: 3490.9594
Epoch: 36, Loss: 3483.1051
Epoch: 37, Loss: 3483.9592
Epoch: 38,

In [43]:
def prediction(model, vocab, text):

  tokenized_text = word_tokenize(text.lower())

  numerical_text = text_to_indices(tokenized_text, vocab)


  padded_text = torch.tensor([0]*(max_length_sequence-len(numerical_text))+numerical_text,dtype=torch.long).unsqueeze(0).to(device)

  output = model(padded_text)

  value, index = torch.max(output, dim=1)

  return text + " " + list(vocab.keys())[index]

In [44]:
prediction(model, vocab, "I was seized with a")

'I was seized with a young'

In [50]:
import time

num_tokens = 10
input_text = "I do not know"

for i in range(num_tokens):
  output_text = prediction(model, vocab, input_text)
  print(output_text)
  input_text = output_text
  time.sleep(0.5)

I do not know what
I do not know what a
I do not know what a occasion
I do not know what a occasion required
I do not know what a occasion required .
I do not know what a occasion required . i
I do not know what a occasion required . i pity
I do not know what a occasion required . i pity you
I do not know what a occasion required . i pity you ,
I do not know what a occasion required . i pity you , my


In [46]:
dataloader1 = DataLoader(dataset, batch_size=32, shuffle=False)

In [47]:
def calculate_accuracy(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_x, batch_y in dataloader1:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            _, predicted = torch.max(outputs, dim=1)
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    accuracy = correct / total * 100
    return accuracy

accuracy = calculate_accuracy(model, dataloader, device)
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 82.35%
