# 1. Simple One-Layer LSTM

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Text preprocessing
text = "Tell me and I forget, teach me and I may remember, involve me and I learn"
words = text.lower().replace(',', '').split()
vocab = {word: i for i, word in enumerate(set(words))}
vocab_size = len(vocab)

# Prepare data for the model
inputs = []
targets = []
context_size = 3

for i in range(len(words) - context_size):
    input_idx = [vocab[words[j]] for j in range(i, i + context_size)]
    target_idx = vocab[words[i + context_size]]
    inputs.append(input_idx)
    targets.append(target_idx)

inputs_tensor = torch.tensor(inputs, dtype=torch.long)
targets_tensor = torch.tensor(targets, dtype=torch.long)

# Create dataset and DataLoader
dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Define an LSTM model
class SimpleLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SimpleLSTM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # LSTM layer, unidirectional and one layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Output layer
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        # Input shape: (batch_size, sequence_length)
        embeds = self.embeddings(inputs)  # (batch_size, sequence_length, embedding_dim)
        lstm_out, (h_n, c_n) = self.lstm(embeds)     # lstm_out: (batch_size, sequence_length, hidden_dim)
        # We use the last LSTM output as the representation of the sequence
        final_output = lstm_out[:, -1, :] # (batch_size, hidden_dim)
        out = self.fc(final_output)       # (batch_size, vocab_size)
        return out

# Model parameters
embedding_dim = 10
hidden_dim = 20

# Initialize the LSTM model, loss function, and optimizer
model = SimpleLSTM(vocab_size, embedding_dim, hidden_dim)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 300
for epoch in range(epochs):
    total_loss = 0
    for context, target in dataloader:
        model.zero_grad()
        log_probs = model(context)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 50 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / len(dataloader)}')

# Prediction function
def predict(text):
    words = text.lower().replace(',', '').split()
    input_idx = [vocab.get(word, 0) for word in words[-context_size:]]
    input_tensor = torch.tensor([input_idx], dtype=torch.long)
    with torch.no_grad():
        log_probs = model(input_tensor)
    return max(zip(log_probs[0].exp(), vocab.keys()), key=lambda p: p[0])[1]

# Test the prediction function
print(predict("teach me and"))  # Expected to output 'I', given the training context.


Epoch 0, Loss: 2.3626063466072083
Epoch 50, Loss: 1.2420078814029694
Epoch 100, Loss: 0.6485145222395658
Epoch 150, Loss: 0.38657374307513237
Epoch 200, Loss: 0.27861463837325573
Epoch 250, Loss: 0.2532136728987098
i


## 1.1. Test

In [24]:
# Improved Prediction Function to predict all possible next words with their probabilities
def predict_all_scores(text):
    """
    Predicts the probabilities of all possible next words based on the last few words of the input text.
    
    Args:
    text (str): A string of text from which the last few words are taken as context for prediction.
    
    Returns:
    dict: A dictionary of words and their associated probabilities, sorted by likelihood.
    """
    # Normalize the input text: convert to lowercase and remove commas for consistent preprocessing
    words = text.lower().replace(',', '').split()

    # Convert the last few words to indices using the vocabulary.
    # Use a default index (0) if a word is not found in the vocabulary.
    input_idx = [vocab.get(word, 0) for word in words[-context_size:]]

    # Create a tensor from the word indices, suitable for model input
    input_tensor = torch.tensor([input_idx], dtype=torch.long)

    # Disable gradient calculations for efficiency during inference
    with torch.no_grad():
        # Pass the tensor through the model to get log probabilities for each class (word)
        log_probs = model(input_tensor)

        # Apply softmax to convert log probabilities to actual probabilities
        probs = torch.softmax(log_probs, dim=1)

    # Convert the probabilities tensor to a list for easier processing
    all_probs = probs.squeeze().tolist()

    # Map each word in the vocabulary to its predicted probability
    predicted_words_scores = {word: prob for word, idx in vocab.items() if idx < len(all_probs) for prob in [all_probs[idx]]}

    # Sort the predictions by their probabilities in descending order
    predicted_words_scores = dict(sorted(predicted_words_scores.items(), key=lambda item: item[1], reverse=True))

    return predicted_words_scores

# Test the updated prediction function with different contexts
contexts = ["teach me and", "and I may", "and I do", "tell me next", "do not tell"]
for context in contexts:
    predicted_scores = predict_all_scores(context)
    print(f"Context: '{context}'")
    for word, score in predicted_scores.items():
        print(f'Word: "{word}", Score: {score:.4f}')
    print("\n")


Context: 'teach me and'
Word: "i", Score: 0.9890
Word: "and", Score: 0.0048
Word: "forget", Score: 0.0019
Word: "may", Score: 0.0019
Word: "learn", Score: 0.0014
Word: "tell", Score: 0.0004
Word: "remember", Score: 0.0003
Word: "me", Score: 0.0002
Word: "involve", Score: 0.0001
Word: "teach", Score: 0.0000


Context: 'and I may'
Word: "remember", Score: 0.9251
Word: "involve", Score: 0.0204
Word: "teach", Score: 0.0189
Word: "and", Score: 0.0113
Word: "forget", Score: 0.0076
Word: "may", Score: 0.0061
Word: "learn", Score: 0.0041
Word: "me", Score: 0.0035
Word: "i", Score: 0.0020
Word: "tell", Score: 0.0008


Context: 'and I do'
Word: "remember", Score: 0.2144
Word: "forget", Score: 0.2104
Word: "teach", Score: 0.1924
Word: "may", Score: 0.1814
Word: "learn", Score: 0.1023
Word: "i", Score: 0.0438
Word: "involve", Score: 0.0299
Word: "and", Score: 0.0102
Word: "me", Score: 0.0102
Word: "tell", Score: 0.0050


Context: 'tell me next'
Word: "i", Score: 0.9883
Word: "and", Score: 0.0037
W

## 1.2. Two-Layer LSTM

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Text preprocessing
text = "Tell me and I forget, teach me and I may remember, involve me and I learn"
words = text.lower().replace(',', '').split()
vocab = {word: i for i, word in enumerate(set(words))}
vocab_size = len(vocab)

# Prepare data for the model
inputs = []
targets = []
context_size = 3

for i in range(len(words) - context_size):
    input_idx = [vocab[words[j]] for j in range(i, i + context_size)]
    target_idx = vocab[words[i + context_size]]
    inputs.append(input_idx)
    targets.append(target_idx)

inputs_tensor = torch.tensor(inputs, dtype=torch.long)
targets_tensor = torch.tensor(targets, dtype=torch.long)

# Create dataset and DataLoader
dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Define a neural network model with two distinct LSTM layers
class DoubleLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(DoubleLSTM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # First LSTM layer
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Second LSTM layer
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        # Linear layer after the first LSTM
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        # Output linear layer after the second LSTM
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)  # Embed the input words
        lstm1_out, _ = self.lstm1(embeds)  # First LSTM layer output
        lstm1_out = self.fc1(lstm1_out[:, -1, :])  # Process output of first LSTM with a FC layer
        
        # Feed the output of the first FC layer into the second LSTM
        lstm2_out, _ = self.lstm2(lstm1_out.unsqueeze(1))
        final_output = self.fc2(lstm2_out[:, -1, :])  # Process output of second LSTM with the final FC layer
        
        return final_output

# Model parameters
embedding_dim = 10
hidden_dim = 20

# Initialize the model, loss function, and optimizer
model = DoubleLSTM(vocab_size, embedding_dim, hidden_dim)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 300
for epoch in range(epochs):
    total_loss = 0
    for context, target in dataloader:
        model.zero_grad()
        log_probs = model(context)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 50 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / len(dataloader)}')

# Prediction function
def predict(text):
    words = text.lower().replace(',', '').split()
    input_idx = [vocab.get(word, 0) for word in words[-context_size:]]
    input_tensor = torch.tensor([input_idx], dtype=torch.long)
    with torch.no_grad():
        log_probs = model(input_tensor)
    return max(zip(log_probs[0].exp(), vocab.keys()), key=lambda p: p[0])[1]

# Test the prediction function
print(predict("teach me and"))  # Expected to output 'I', given the training context.


Epoch 0, Loss: 2.2947487235069275
Epoch 50, Loss: 1.6331643462181091
Epoch 100, Loss: 0.5038040541112423
Epoch 150, Loss: 0.3299543224275112
Epoch 200, Loss: 0.506902676075697
Epoch 250, Loss: 0.25374281127005816
i


In [26]:
# Improved Prediction Function to predict all possible next words with their probabilities
def predict_all_scores(text):
    """
    Predicts the probabilities of all possible next words based on the last few words of the input text.
    
    Args:
    text (str): A string of text from which the last few words are taken as context for prediction.
    
    Returns:
    dict: A dictionary of words and their associated probabilities, sorted by likelihood.
    """
    # Normalize the input text: convert to lowercase and remove commas for consistent preprocessing
    words = text.lower().replace(',', '').split()

    # Convert the last few words to indices using the vocabulary.
    # Use a default index (0) if a word is not found in the vocabulary.
    input_idx = [vocab.get(word, 0) for word in words[-context_size:]]

    # Create a tensor from the word indices, suitable for model input
    input_tensor = torch.tensor([input_idx], dtype=torch.long)

    # Disable gradient calculations for efficiency during inference
    with torch.no_grad():
        # Pass the tensor through the model to get log probabilities for each class (word)
        log_probs = model(input_tensor)

        # Apply softmax to convert log probabilities to actual probabilities
        probs = torch.softmax(log_probs, dim=1)

    # Convert the probabilities tensor to a list for easier processing
    all_probs = probs.squeeze().tolist()

    # Map each word in the vocabulary to its predicted probability
    predicted_words_scores = {word: prob for word, idx in vocab.items() if idx < len(all_probs) for prob in [all_probs[idx]]}

    # Sort the predictions by their probabilities in descending order
    predicted_words_scores = dict(sorted(predicted_words_scores.items(), key=lambda item: item[1], reverse=True))

    return predicted_words_scores

# Test the updated prediction function with different contexts
contexts = ["teach me and", "and I may", "and I do", "tell me next", "do not tell"]
for context in contexts:
    predicted_scores = predict_all_scores(context)
    print(f"Context: '{context}'")
    for word, score in predicted_scores.items():
        print(f'Word: "{word}", Score: {score:.4f}')
    print("\n")


Context: 'teach me and'
Word: "i", Score: 0.9759
Word: "and", Score: 0.0097
Word: "forget", Score: 0.0031
Word: "learn", Score: 0.0030
Word: "may", Score: 0.0027
Word: "me", Score: 0.0020
Word: "teach", Score: 0.0017
Word: "remember", Score: 0.0016
Word: "tell", Score: 0.0003
Word: "involve", Score: 0.0001


Context: 'and I may'
Word: "remember", Score: 0.9536
Word: "involve", Score: 0.0221
Word: "learn", Score: 0.0048
Word: "may", Score: 0.0041
Word: "i", Score: 0.0041
Word: "and", Score: 0.0039
Word: "forget", Score: 0.0035
Word: "me", Score: 0.0025
Word: "tell", Score: 0.0011
Word: "teach", Score: 0.0003


Context: 'and I do'
Word: "i", Score: 0.2717
Word: "forget", Score: 0.2306
Word: "learn", Score: 0.2054
Word: "may", Score: 0.1862
Word: "me", Score: 0.0598
Word: "remember", Score: 0.0194
Word: "teach", Score: 0.0180
Word: "and", Score: 0.0053
Word: "tell", Score: 0.0022
Word: "involve", Score: 0.0013


Context: 'tell me next'
Word: "i", Score: 0.9759
Word: "and", Score: 0.0096
W