In [3]:
!pip install nltk



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk

In [4]:
# Load the raw corpus text from disk
with open("document.txt", "r", encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()

# Trim leading/trailing whitespace
document = raw_text.strip()

print(document[:500])  # preview first 500 characters


Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.net


Title: The Adventures of Sherlock Holmes

Author: Arthur Conan Doyle

Release Date: November 29, 2002 [EBook #1661]
Last Updated: May 20, 2019

Language: English

Characte


In [5]:
# Tokenization setup: download the NLTK 'punkt' and 'punkt_tab' data packages.
# The value returned by the last call is what the cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# Lower-case the whole document, then split it into word-level tokens
lowercased_document = document.lower()
tokens = word_tokenize(lowercased_document)

In [68]:
# Build the vocabulary: index 0 is reserved for "<unk>"; every distinct token
# then receives the next free index, in order of first appearance.
vocab = {"<unk>": 0}
for token in dict.fromkeys(tokens):
  vocab.setdefault(token, len(vocab))


In [35]:
# Vocabulary size, counting the "<unk>" entry
len(vocab)

9398

In [36]:
# extract sentences
# NOTE(review): this splits on newline characters, so each element is one line
# of the file rather than a grammatical sentence — confirm this is intended.
input_sentences=document.split("\n")

In [37]:
def text_to_indices(sentence,vocab):
  """Map each token of ``sentence`` to its integer index in ``vocab``.

  Tokens that are not keys of ``vocab`` are mapped to ``vocab["<unk>"]``.
  Returns a list of indices in the same order as the input tokens.
  """
  # "<unk>" is only looked up when a token is actually missing from vocab
  return [vocab[tok] if tok in vocab else vocab["<unk>"] for tok in sentence]

In [38]:
# Tokenize each lower-cased line and convert its tokens to vocabulary indices
input_numerical_sentences = [
  text_to_indices(word_tokenize(line.lower()), vocab)
  for line in input_sentences
]

In [39]:
# For every numericalised line, emit each prefix of length >= 2:
# [t0, t1], [t0, t1, t2], ... up to the full line.
training_sequence = [
  indices[:end]
  for indices in input_numerical_sentences
  for end in range(2, len(indices) + 1)
]

In [40]:
len(training_sequence)  # number of prefix sequences; they have different lengths

120233

In [41]:
# Length of every training sequence; the cell displays the longest one
len_list = [len(seq) for seq in training_sequence]
max(len_list)

29

In [42]:
# Left-pad every sequence with index 0 so that all rows have the length of the
# longest sequence. The maximum is computed once up front instead of being
# recomputed for every sequence inside the loop.
# NOTE(review): index 0 is also the "<unk>" entry of vocab, so padding and
# unknown tokens share the same index.
max_sequence_length = max(len_list)
padded_training_sequence = [
  [0]*(max_sequence_length - len(sequence)) + sequence
  for sequence in training_sequence
]

In [43]:
# Sanity check: length of the first padded row
len(padded_training_sequence[0])

29

In [44]:
# Convert the list of equal-length rows into an integer (torch.long) tensor.
# NOTE: this rebinds the name from a Python list to a tensor.
padded_training_sequence=torch.tensor(padded_training_sequence,dtype=torch.long)

In [45]:
# Only the last expression of a cell is displayed, so the first line below
# produces no visible output; the cell shows the tensor's shape.
padded_training_sequence[0]
padded_training_sequence.shape

torch.Size([120233, 29])

In [46]:
# Inputs are all columns except the last; the target is the last column
X, y = padded_training_sequence[:, :-1], padded_training_sequence[:, -1]

In [47]:
# Display the input tensor (every column of the padded sequences but the last)
X

tensor([[   0,    0,    0,  ...,    0,    0,    1],
        [   0,    0,    0,  ...,    0,    1,    2],
        [   0,    0,    0,  ...,    1,    2,    3],
        ...,
        [   0,    0,    0,  ...,  120,  587, 1171],
        [   0,    0,    0,  ...,  587, 1171,  416],
        [   0,    0,    0,  ..., 1171,  416, 9111]])

In [48]:
# Display the target tensor (last column of the padded sequences)
y

tensor([   2,    3,    4,  ...,  416, 9111,   29])

In [49]:
class CustomDataset(Dataset):
  """Dataset pairing row ``i`` of ``X`` with element ``i`` of ``y``."""

  def __init__(self,X,y):
    # Keep references to the inputs and targets; nothing is copied here.
    self.X, self.y = X, y

  def __len__(self):
    """Number of samples, taken from the first dimension of ``X``."""
    num_samples = self.X.shape[0]
    return num_samples

  def __getitem__(self,index):
    """Return the ``(input, target)`` pair stored at ``index``."""
    features = self.X[index]
    target = self.y[index]
    return features, target

In [50]:
# Wrap the input/target tensors in a Dataset
dataset=CustomDataset(X,y)

In [51]:
# Number of (input, target) samples
len(dataset)

120233

In [52]:
# Inspect the first (input, target) pair
dataset[0]

(tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1]),
 tensor(2))

In [53]:
# Training loader: batches of 32 samples, reshuffled on every pass
dataloader=DataLoader(dataset,batch_size=32,shuffle=True)

In [55]:
class LSTMmodel(nn.Module):
  """Next-token classifier: embedding -> LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of rows in the embedding table and number of output logits.
    embedding_dim: size of each token embedding (default 100).
    hidden_dim: size of the LSTM hidden state (default 150).
    num_layers: number of stacked LSTM layers (default 1).
  """

  def __init__(self,vocab_size,embedding_dim=100,hidden_dim=150,num_layers=1):
    super().__init__()
    self.embedding=nn.Embedding(vocab_size,embedding_dim)
    self.lstm=nn.LSTM(embedding_dim,hidden_dim,num_layers=num_layers,batch_first=True)
    self.fc=nn.Linear(hidden_dim,vocab_size)

  def forward(self,x):
    """Return one row of vocabulary logits per input sequence.

    Args:
      x: integer tensor of token indices, laid out batch-first
        (the LSTM is built with ``batch_first=True``).

    Returns:
      Tensor of logits with ``vocab_size`` entries per sequence.
    """
    embedded = self.embedding(x)
    _, (final_hidden_state, _) = self.lstm(embedded)
    # final_hidden_state stacks one entry per layer along dim 0; take the last
    # layer explicitly so this also works when num_layers > 1.
    output = self.fc(final_hidden_state[-1])
    return output

In [56]:
# Instantiate the model with one output class per vocabulary entry
model=LSTMmodel(len(vocab))

In [57]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [58]:
# Move the model to the selected device; the returned module is displayed
model.to(device)

LSTMmodel(
  (embedding): Embedding(9398, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=9398, bias=True)
)

In [59]:
# Training hyperparameters
epochs, learning_rate = 50, 0.001

# Cross-entropy between the predicted logits and the target token indices
criterion = nn.CrossEntropyLoss()

# Adam optimizer over all model parameters
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [60]:
#training loop
for epoch in range(epochs):
  # Put the model in training mode at the start of every epoch. The accuracy
  # cell switches it to eval mode, so re-running this cell afterwards would
  # otherwise train in eval mode (on CUDA, cuDNN refuses RNN backward there).
  model.train()
  total_loss=0
  for batch_x, batch_y in dataloader:

    # Move the batch to the same device as the model
    batch_x, batch_y = batch_x.to(device), batch_y.to(device)

    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    output = model(batch_x)

    loss = criterion(output, batch_y)

    loss.backward()

    optimizer.step()

    # .item() extracts a Python float so no autograd graph is kept alive
    total_loss += loss.item()

  # Reported value is the sum of the per-batch losses over the epoch
  print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 20571.7046
Epoch: 2, Loss: 17531.6517
Epoch: 3, Loss: 15961.5994
Epoch: 4, Loss: 14631.7610
Epoch: 5, Loss: 13454.5656
Epoch: 6, Loss: 12409.0624
Epoch: 7, Loss: 11473.7189
Epoch: 8, Loss: 10638.0386
Epoch: 9, Loss: 9877.1972
Epoch: 10, Loss: 9196.3769
Epoch: 11, Loss: 8583.8864
Epoch: 12, Loss: 8041.6318
Epoch: 13, Loss: 7546.4237
Epoch: 14, Loss: 7102.5307
Epoch: 15, Loss: 6702.5762
Epoch: 16, Loss: 6338.6051
Epoch: 17, Loss: 6015.0155
Epoch: 18, Loss: 5724.5960
Epoch: 19, Loss: 5459.3006
Epoch: 20, Loss: 5212.7613
Epoch: 21, Loss: 4998.8776
Epoch: 22, Loss: 4798.3291
Epoch: 23, Loss: 4614.3467
Epoch: 24, Loss: 4445.4035
Epoch: 25, Loss: 4293.2603
Epoch: 26, Loss: 4154.9932
Epoch: 27, Loss: 4019.7238
Epoch: 28, Loss: 3912.9030
Epoch: 29, Loss: 3792.7168
Epoch: 30, Loss: 3691.3301
Epoch: 31, Loss: 3602.3376
Epoch: 32, Loss: 3515.5354
Epoch: 33, Loss: 3442.1902
Epoch: 34, Loss: 3357.0523
Epoch: 35, Loss: 3305.5139
Epoch: 36, Loss: 3223.8003
Epoch: 37, Loss: 3175.1759
Ep

In [61]:
#prediction
def prediction(model,vocab,text,device,max_len=61):
  """Append the model's most likely next token to ``text``.

  Args:
    model: module mapping a batch of token-index sequences to vocabulary logits.
    vocab: dict mapping token -> index; its key order defines index -> token.
    text: prompt string; it is lower-cased and tokenized before encoding.
    device: device the model lives on.
    max_len: length of the left-padded input window (default 61, the value
      previously hard-coded here).
      NOTE(review): the training inputs in this notebook have 28 columns;
      consider passing ``max_len=X.shape[1]`` so inference matches training.

  Returns:
    ``text`` followed by a space and the predicted token.

  Raises:
    ValueError: if ``max_len`` is smaller than 1.
  """
  if max_len < 1:
    raise ValueError("max_len must be at least 1")

  tokenized_text = word_tokenize(text.lower())

  numerical_text = text_to_indices(tokenized_text, vocab)

  # Keep only the most recent max_len tokens, then left-pad with index 0
  numerical_text = numerical_text[-max_len:]
  padding = [0] * (max_len - len(numerical_text))
  padded_text = torch.tensor(padding + numerical_text, dtype=torch.long).unsqueeze(0)

  padded_text = padded_text.to(device)

  # Run inference in eval mode without building an autograd graph, then
  # restore whatever mode the model was in so callers see no side effect.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      output = model(padded_text)
  finally:
    model.train(was_training)

  _, index = torch.max(output, dim=1)

  # Convert the 1-element tensor to a plain int before indexing the key list
  predicted_index = int(index.item())

  return text + " " + list(vocab)[predicted_index]

In [64]:
# Single-step demo: predict the next token for a short prompt
prediction(model, vocab, "He is",device)

'He is still'

In [65]:
import time

# Greedy generation: feed each extended text back in as the next prompt
num_tokens = 10
input_text = "He is"

for step in range(num_tokens):
  input_text = output_text = prediction(model, vocab, input_text,device)
  print(output_text)
  time.sleep(0.5)  # short pause between generated tokens

He is still
He is still with
He is still with sherlock
He is still with sherlock holmes
He is still with sherlock holmes .
He is still with sherlock holmes . “
He is still with sherlock holmes . “ is
He is still with sherlock holmes . “ is it
He is still with sherlock holmes . “ is it all
He is still with sherlock holmes . “ is it all over


In [66]:
# Second loader over the same dataset, without shuffling
dataloader1 = DataLoader(dataset, batch_size=32, shuffle=False)

In [67]:
# Function to calculate accuracy
def calculate_accuracy(model, dataloader, device):
    """Return the percentage of samples whose arg-max prediction equals the target.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        dataloader: iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        device: device the batches are moved to before the forward pass.

    Returns:
        Accuracy in percent (0-100) as a float; 0.0 if ``dataloader`` yields
        no samples.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        # Iterate the loader that was passed in, not a notebook-level global
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            _, predicted = torch.max(outputs, dim=1)
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # Guard against an empty loader instead of raising ZeroDivisionError
    if total == 0:
        return 0.0

    accuracy = correct / total * 100
    return accuracy
# Evaluate with the non-shuffled loader built for this purpose.
# NOTE: this is accuracy on the training data; no held-out split is used.
accuracy = calculate_accuracy(model, dataloader1, device)
print(f"Model Accuracy: {accuracy:.2f}%")

Model Accuracy: 85.27%
