### Next Word Prediction Model & how to build it using now Pytorch
1. We start collecting a diverse dataset of text documents.
2. Preprocess the data by cleaning and tokenizing it.
3. Prepare the data by creating input-output pairs.
4. Engineer features such as word embeddings.
5. Select an appropiate model like an LSTM or GPT.
6. Train the model on the dataset while adjusting the hyperparameters.
7. Improve the model by experimenting with different techniques and arquitectures.

In [1]:
# Import libraries
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.nn.functional import one_hot
from torch.utils.data import Dataset, DataLoader


In [2]:
# Read the text file
with open( '/home/xamanek/PythonProjects/ML_Transformers_001/Datasets/20240205b_sherlock-holm.es_stories_plain-text_advs.txt', 'r', encoding = 'utf-8' ) as file:
  text = file.read()

# Simple tokenizer function
tokenizer = get_tokenizer( 'basic_english' )

# Tokenize the text 
tokens = tokenizer( text )

# Build vocabulary 
def yield_tokens( data_iter ):
  for text in data_iter:
    yield tokenizer( text )

vocab = build_vocab_from_iterator( yield_tokens( [ text ] ), specials = [ "<unk>" ] )
vocab.set_default_index( vocab[ "<unk>" ] )

# Numericalize tokens 
numericalized_tokens = [ vocab[ token ] for token in tokens ]

In [3]:
# Create n-gram sequences
input_sequences = [] 

for line in text.split('\n'):
  token_list = [ vocab[ token ] for token in tokenizer( line ) ]
  for i in range( 1, len( token_list ) ):
    n_gram_sequence = token_list[ :i + 1 ]
    input_sequences.append( torch.tensor( n_gram_sequence ) )

# Pad sequences
max_sequence_len = max( len(seq) for seq in input_sequences )
input_sequences_padded = pad_sequence( 
  input_sequences, 
  batch_first = True, 
  padding_value = 0 
)

# Create input and target sequences
X = input_sequences_padded[ :, :-1 ]
y = input_sequences_padded[ :, -1 ]

# One-hot encode the target sequences
y_one_hot = one_hot( y, num_classes = len( vocab ) ).float()

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


In [5]:
class LSTMModel( nn.Module ):
  def __init__( self, vocab_size, embedding_dim, lstm_units, output_size ):
    super( LSTMModel, self ).__init__()
    self.embedding = nn.Embedding( vocab_size, embedding_dim )
    self.lstm = nn.LSTM( embedding_dim, lstm_units, batch_first = True )
    self.fc = nn.Linear( lstm_units, output_size )

  def forward( self, x ):
    x = self.embedding( x )
    _, (hidden, _) = self.lstm( x )
    x = self.fc( hidden[ -1 ] )
    return x
  
# Instantiate the model
model = LSTMModel( len( vocab ), 100, 1500, len( vocab ) ).to( device )

print( model )

LSTMModel(
  (embedding): Embedding(8377, 100)
  (lstm): LSTM(100, 1500, batch_first=True)
  (fc): Linear(in_features=1500, out_features=8377, bias=True)
)


In [6]:
class TextDataset( Dataset ):
  def __init__( self, sequences, targets ):
    self.sequences = sequences
    self.targets = targets

  def __len__( self ):
    return len( self.sequences )
  
  def __getitem__( self, idx ):
    sequence = self.sequences[ idx ]
    target = self.targets[ idx ]
    return sequence, target

In [7]:
dataset = TextDataset( X, y_one_hot )

In [9]:
# Loss and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam( model.parameters(), lr = 0.001 )

# Training loop
for epoch in range(50):
  total_loss = 0
  for batch in DataLoader( dataset, batch_size = 128, shuffle = True ):
    inputs, targets = batch

    # Move to device
    inputs, targets = inputs.to(device), targets.to(device)

    model.zero_grad()
    output = model( inputs )
    loss = loss_function( output, targets )
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
    
  print( f'Epoch {epoch + 1}, Loss: {total_loss / len(dataset)}, Perplexity: {np.exp( total_loss )}' )

Epoch 1, Loss: 1.8196415215484346e-06, Perplexity: 1.2259835766290064
Epoch 2, Loss: 1.6713863421536658e-06, Perplexity: 1.2058002535453238
Epoch 3, Loss: 1.5965744295668518e-06, Perplexity: 1.1957419152338284
Epoch 4, Loss: 1.8375160743032794e-06, Perplexity: 1.2284397120366486
Epoch 5, Loss: 1.5455572064553106e-06, Perplexity: 1.18893089392893
Epoch 6, Loss: 1.693082246037384e-06, Perplexity: 1.2087330271190317
Epoch 7, Loss: 1.5331565658420624e-06, Perplexity: 1.1872812239858006
Epoch 8, Loss: 1.5826129395763233e-06, Perplexity: 1.193874127108948
Epoch 9, Loss: 1.4525674600070327e-06, Perplexity: 1.1766160047883603
Epoch 10, Loss: 1.332992279677305e-06, Perplexity: 1.1609676199844143
Epoch 11, Loss: 1.3156537992329124e-06, Perplexity: 1.1587159359718915
Epoch 12, Loss: 8.842302124592415e-07, Perplexity: 1.104073335419174
Epoch 13, Loss: 7.147976119582842e-07, Perplexity: 1.0833251716822168
Epoch 14, Loss: 3.4676360313601398e-06, Perplexity: 1.4744244920838538
Epoch 15, Loss: 1.40548

In [None]:
# Set model to evaluation mode
model.eval()

# Initial text to generate next words
seed_text = "I don't know what to"
next_words = 5

for _ in range( next_words ):
  # Tokenize the seed text
  token_list = [ vocab[ token ] for token in tokenizer( seed_text ) ]

  # cut the sequence if it is longer than the maximum sequence length
  if len( token_list ) > max_sequence_len - 1:
    token_list = token_list[ -(max_sequence_len - 1) : ]

  # Pad the sequence
  token_tensor = torch.tensor( [ token_list ] ).to( device )
  token_tensor = pad_sequence( 
    token_tensor, 
    batch_first = True, 
    padding_value = 0
  )

  # Generate the next word
  # No need to track the gradients
  with torch.no_grad(): 
    # Get the index of the word with the highest probability
    predicted = model( token_tensor ).argmax( dim = 1 ).item()

  # find the word corresponding to the index 
  output_word = None 
  for word, index in vocab.items():
    if index == predicted:
      output_word = word
      break

  # if no word is found, terminate the loop
  if output_word is None:
    break


  # Add the predicted word to the seed text
  seed_text += ' ' + output_word

print( seed_text )

In [17]:
model.eval()

unique_tokens = sorted(set(token for token in tokenizer(text)))
stoi = {token: idx for idx, token in enumerate(unique_tokens)}
itos = {idx: token for token, idx in stoi.items()}

seed_text = "I don't know what to do with"
next_words = 10

for _ in range( next_words ):
  # Tokenize the seed thext
  token_list = [ 
    stoi[ token ] for token in tokenizer( seed_text ) 
    if token in stoi 
  ]

  # Cut the sequence if it is longer than the maximum sequence length
  if len( token_list ) > max_sequence_len -1: 
    token_list = token_list[ -( max_sequence_len - 1 ) : ]

  # Pad the sequence
  token_tensor = torch.tensor( [ token_list ] ).to( device )
  token_tensor = pad_sequence( 
      token_tensor, 
      batch_first = True, 
      padding_value = 0 
    )
  
  # Generate the next word, no need to track the gradients
  with torch.no_grad():
    # Get the index of the word with the highest probability
    predicted = model( token_tensor ).argmax( dim = 1 ).item()

  # use 'itos' to find the word corresponding to the index
  output_word = itos[ predicted ] if predicted < len( itos ) else None

  # if no word is found, terminate the loop
  if output_word is None:
    break

  # Add the predicted word to the seed text
  seed_text += ' ' + output_word

print( seed_text )

I don't know what to do with ! ! ! ! ! ! ! ! ! !
