In [1]:
text = """The quick brown fox jumps over the lazy dog.
This is a well-known sentence used for typing practice.
It contains every letter of the English alphabet,
which makes it a good example for demonstrating text processing techniques.
With enough training data, a model can learn to predict the next word in a sequence accurately.
Machine learning and natural language processing have advanced significantly, enabling the development of
sophisticated models that understand and generate human-like text. By feeding the LSTM model with this example text,
we can train it to predict the next word in a sequence. This is a useful exercise for understanding how recurrent neural networks
work and how they can be applied to language modeling tasks."""


In [2]:
text_list = text.split()

modified_list = []

for element in text_list:
  if element[-1] == ".":
    element = element[0:-1]
  modified_list.append(element)

print(modified_list)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'This', 'is', 'a', 'well-known', 'sentence', 'used', 'for', 'typing', 'practice', 'It', 'contains', 'every', 'letter', 'of', 'the', 'English', 'alphabet,', 'which', 'makes', 'it', 'a', 'good', 'example', 'for', 'demonstrating', 'text', 'processing', 'techniques', 'With', 'enough', 'training', 'data,', 'a', 'model', 'can', 'learn', 'to', 'predict', 'the', 'next', 'word', 'in', 'a', 'sequence', 'accurately', 'Machine', 'learning', 'and', 'natural', 'language', 'processing', 'have', 'advanced', 'significantly,', 'enabling', 'the', 'development', 'of', 'sophisticated', 'models', 'that', 'understand', 'and', 'generate', 'human-like', 'text', 'By', 'feeding', 'the', 'LSTM', 'model', 'with', 'this', 'example', 'text,', 'we', 'can', 'train', 'it', 'to', 'predict', 'the', 'next', 'word', 'in', 'a', 'sequence', 'This', 'is', 'a', 'useful', 'exercise', 'for', 'understanding', 'how', 'recurrent', 'neural', 'networks', 'work', 

In [3]:
set_chars = set(modified_list)

In [4]:
print(len(set_chars), len(modified_list))

85 118


In [5]:
vocab = sorted(set_chars)

word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for idx, word in enumerate(vocab)}

encode = lambda s: [word_to_idx[w] for w in s.split()]
decode = lambda l: ' '.join([idx_to_word[i] for i in l])

In [6]:
decode(encode("The quick brown fox"))

'The quick brown fox'

In [7]:
import torch

text = text.replace('.', '')

data = encode(text)

In [8]:
sequences = []
targets = []

temp = 0

sequence_length = 3

for i in range(0, len(data) - sequence_length):
  temp = i
  sequences.append(data[i : i + sequence_length])
  targets.append(data[i + sequence_length])

In [9]:
import torch.optim as optim
import torch.nn as nn

class nextwordlstm(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
    super(nextwordlstm, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)
  def forward(self, x):
    x = self.embedding(x)
    out, _ = self.lstm(x)
    out = self.fc(out[:, -1])
    return out

In [10]:
vocab_size = len(vocab)  # Size of the vocabulary
embedding_dim = 128      # Embedding dimensions for each word
hidden_dim = 256         # Hidden state dimensions
num_layers = 2           # Number of LSTM layers

model = nextwordlstm(vocab_size, embedding_dim, hidden_dim, num_layers)

print(model)

nextwordlstm(
  (embedding): Embedding(85, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=85, bias=True)
)


In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [12]:
num_epochs = 20
batch_size = 3


for epoch in range(num_epochs):
    for i in range(0, len(sequences), batch_size):
        inputs = torch.Tensor(sequences[i:i + batch_size]).to(torch.int)

        target_words = torch.Tensor(targets[i:i + batch_size]).to(torch.long)

        outputs = model(inputs)

        loss = criterion(outputs, target_words)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [1/20], Loss: 4.7200
Epoch [2/20], Loss: 5.0361
Epoch [3/20], Loss: 3.8997
Epoch [4/20], Loss: 0.9962
Epoch [5/20], Loss: 0.0012
Epoch [6/20], Loss: 0.0027
Epoch [7/20], Loss: 0.0022
Epoch [8/20], Loss: 0.0017
Epoch [9/20], Loss: 0.0014
Epoch [10/20], Loss: 0.0011
Epoch [11/20], Loss: 0.0010
Epoch [12/20], Loss: 0.0008
Epoch [13/20], Loss: 0.0007
Epoch [14/20], Loss: 0.0007
Epoch [15/20], Loss: 0.0006
Epoch [16/20], Loss: 0.0005
Epoch [17/20], Loss: 0.0005
Epoch [18/20], Loss: 0.0004
Epoch [19/20], Loss: 0.0004
Epoch [20/20], Loss: 0.0004


In [36]:
input_sequence = "By feeding the"


input_sequence = encode(input_sequence)
input_tensor = torch.Tensor(input_sequence).to(torch.int).unsqueeze(0)

with torch.no_grad():
    outputs = model(input_tensor)
    probabilities = nn.functional.softmax(outputs, dim=1)
    predicted_index = torch.argmax(probabilities, dim=1).item()

print(f'Predicted next word index: {predicted_index}')

Predicted next word index: 3


In [37]:
input_sequence

[0, 27, 68]

In [31]:
print(decode([0, 27, 68, 3]))

TypeError: 'NoneType' object is not iterable