# 1. Simple ANN

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Set seeds for reproducibility
torch.manual_seed(1)  # Sets the seed for PyTorch random number generators. This is important to ensure that the behavior of any operation that introduces randomness, such as the initialization of weights in neural networks and the shuffling of data in DataLoaders, is consistent across runs. Using a fixed seed value like 42 ensures that every execution of this code will produce the same results given the same inputs and model configuration. This is essential for debugging and comparing the performance impacts of any changes in a controlled manner.

# Text preprocessing: Convert the entire string to lowercase and split it into individual words, removing commas.
# This normalization helps ensure the model treats variations of the same word (e.g., with and without punctuation) as the same word.
text = "Tell me and I forget, teach me and I may remember, involve me and I learn"
words = text.lower().replace(',', '').split()

# Vocabulary creation: Map each unique word to a unique index. This numerical representation of words is necessary
# because models can only process numbers, not text.
vocab = {word: i for i, word in enumerate(set(words))}
vocab_size = len(vocab)  # The size of the vocabulary

# Prepare the input and target pairs for training the model.
# The input will be sequences of words (context), and the target will be the word that follows the context.
inputs = []
targets = []
context_size = 3  # The number of words considered as context for predicting the next word.

# Generate sequences of contexts and their corresponding target words.
for i in range(len(words) - context_size):
    input_idx = [vocab[words[j]] for j in range(i, i + context_size)]  # Indices of the context words
    target_idx = vocab[words[i + context_size]]  # Index of the target word
    inputs.append(input_idx)
    targets.append(target_idx)

# Convert the lists of indices into PyTorch tensors, which are optimized for performance in training.
inputs_tensor = torch.tensor(inputs, dtype=torch.long)
targets_tensor = torch.tensor(targets, dtype=torch.long)

# Packaging the tensors into a dataset and a DataLoader for efficient batching during training.
dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Define the neural network model structure.
class SimpleNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SimpleNN, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)  # Converts word indices to embeddings.
        self.fc1 = nn.Linear(embedding_dim * context_size, 50)     # First fully connected layer.
        self.fc2 = nn.Linear(50, 20)                               # Second fully connected layer.
        self.fc3 = nn.Linear(20, vocab_size)                       # Output layer that predicts the next word.

    def forward(self, inputs):
        # Define the forward pass through the network.
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))  # Flatten the embeddings.
        out = torch.relu(self.fc1(embeds))                           # Apply ReLU activation function for non-linearity.
        out = torch.relu(self.fc2(out))                              # Another ReLU activation for hidden layer.
        out = self.fc3(out)                                          # Output layer; no activation (logits).
        return out

# Parameters for the model.
embedding_dim = 10  # Size of the embedding vectors. Smaller for simplicity in this example.

# Initialize the model, loss function, and optimizer.
model = SimpleNN(vocab_size, embedding_dim)
loss_function = nn.CrossEntropyLoss()  # Suitable for classification tasks with multiple classes.
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with a learning rate of 0.001.

# Training loop: Iterate over the data multiple times (epochs) to optimize the model parameters.
epochs = 300
for epoch in range(epochs):
    total_loss = 0
    for context, target in dataloader:
        model.zero_grad()            # Clear old gradients from the last step.
        log_probs = model(context)   # Calculate the log probabilities of the next word.
        loss = loss_function(log_probs, target)  # Compute the cross-entropy loss.
        loss.backward()              # Perform backpropagation to calculate gradients.
        optimizer.step()             # Update the weights.
        total_loss += loss.item()    # Accumulate the loss for monitoring.
    if epoch % 50 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / len(dataloader)}')  # Print loss every 50 epochs for monitoring.

# Function to predict the next word given a string of text
def predict(text):
    """
    Predicts the next word based on the last few words of the input text using the trained model.
    This function preprocesses the text to fit the model's training setup, then performs a forward pass to predict.
    """
    words = text.lower().replace(',', '').split()
    input_idx = [vocab.get(word, 0) for word in words[-context_size:]]  # Convert last few words to indices.
    input_tensor = torch.tensor([input_idx], dtype=torch.long)
    with torch.no_grad():
        log_probs = model(input_tensor)
    return max(zip(log_probs[0].exp(), vocab.keys()), key=lambda p: p[0])[1]  # Return the most probable word.

# Test the prediction function
print(predict("teach me and"))  # Expected to output 'I', given the training context.


Epoch 0, Loss: 2.4476407170295715
Epoch 50, Loss: 0.38067219592630863
Epoch 100, Loss: 0.2197433365508914
Epoch 150, Loss: 0.21372364863054827
Epoch 200, Loss: 0.3830622520763427
Epoch 250, Loss: 0.20904743735445663
i


## 1.1. Testing the prediction function with various input scenarios

In [2]:
# Test case with a context seen during training
# "and I may" is a context similar to the training data, so the model should predict accurately
print(predict("and I may"))  # Expected output: 'remember', as it follows this context in the training data

remember


In [3]:
# Test case with a partially unseen context
# "and I do" includes words seen during training but not this exact sequence
# The model might struggle or default to a more frequent or statistically likely word
print(predict("and I do"))   # Output might be less accurate due to unseen sequence

remember


In [4]:
# Test case with entirely unseen context
# "tell me next" includes 'next' which never appears in the training data
# This showcases how the model handles completely novel input
print(predict("tell me next"))  # Output is unpredictable, illustrating limitations with unseen data

i


In [5]:
# Test case with another unseen context
# "do not tell" is not only a completely unseen phrase but also contains "not," which is not in the training data
# This tests the model's handling of completely novel words and sequences
print(predict("do not tell"))   # The prediction is likely to be unreliable due to the novel context

involve


## 1.2. Enhanced prediction function with detailed probability outputs

In [6]:
# Define the predict function to return all words and their probabilities
def predict_all_scores(text):
    # Convert the input text to lowercase and split it into words, removing any commas
    words = text.lower().replace(',', '').split()

    # Convert words to indices. Use a default index (0) if a word is not found in the vocabulary.
    # This handling is basic and assumes '0' index won't cause misinterpretations, typically it should be handled more carefully.
    input_idx = [vocab.get(word, 0) for word in words[-context_size:]]

    # Convert the list of indices into a tensor suitable for model input
    input_tensor = torch.tensor([input_idx], dtype=torch.long)

    # Disable gradient calculations, useful for inference to save memory and computations
    with torch.no_grad():
        # Pass the input tensor through the model to get the log probabilities of each class (word in this case)
        log_probs = model(input_tensor)

        # Apply softmax to the log probabilities to convert them to actual probabilities
        # The softmax function converts raw model outputs into a probability distribution
        probs = torch.softmax(log_probs, dim=1)

    # Extract the probabilities for each word in the vocabulary
    all_probs = probs.squeeze().tolist()  # Convert the tensor of probabilities to a list

    # Create a dictionary mapping each word to its predicted probability
    predicted_words_scores = {word: prob for word, idx in vocab.items() for prob in [all_probs[idx]]}

    # Sort the dictionary by probabilities in descending order to see the most likely words first
    predicted_words_scores = dict(sorted(predicted_words_scores.items(), key=lambda item: item[1], reverse=True))

    # Return the sorted dictionary of words and their associated probabilities
    return predicted_words_scores

# Test the updated prediction function with different contexts
# This will print out all words in the vocabulary with their associated probability scores

contexts = ["teach me and", "and I may", "and I do", "tell me next", "do not tell"]
for context in contexts:
    predicted_scores = predict_all_scores(context)
    print(f"Context: '{context}'")
    for word, score in predicted_scores.items():
        print(f'Word: "{word}", Score: {score:.4f}')
    print("\n")

Context: 'teach me and'
Word: "i", Score: 0.9997
Word: "teach", Score: 0.0001
Word: "remember", Score: 0.0000
Word: "me", Score: 0.0000
Word: "involve", Score: 0.0000
Word: "forget", Score: 0.0000
Word: "may", Score: 0.0000
Word: "learn", Score: 0.0000
Word: "and", Score: 0.0000
Word: "tell", Score: 0.0000


Context: 'and I may'
Word: "remember", Score: 0.9979
Word: "teach", Score: 0.0008
Word: "me", Score: 0.0005
Word: "i", Score: 0.0003
Word: "involve", Score: 0.0003
Word: "and", Score: 0.0001
Word: "forget", Score: 0.0000
Word: "tell", Score: 0.0000
Word: "learn", Score: 0.0000
Word: "may", Score: 0.0000


Context: 'and I do'
Word: "remember", Score: 0.9979
Word: "teach", Score: 0.0008
Word: "me", Score: 0.0005
Word: "i", Score: 0.0003
Word: "involve", Score: 0.0003
Word: "and", Score: 0.0001
Word: "forget", Score: 0.0000
Word: "tell", Score: 0.0000
Word: "learn", Score: 0.0000
Word: "may", Score: 0.0000


Context: 'tell me next'
Word: "i", Score: 0.8529
Word: "remember", Score: 0.1

# 2. Simple One-Layer RNN

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Set seeds for reproducibility
torch.manual_seed(1)  # Sets the seed for PyTorch random number generators. This is important to ensure that the behavior of any operation that introduces randomness, such as the initialization of weights in neural networks and the shuffling of data in DataLoaders, is consistent across runs. Using a fixed seed value like 42 ensures that every execution of this code will produce the same results given the same inputs and model configuration. This is essential for debugging and comparing the performance impacts of any changes in a controlled manner.

# Text preprocessing
text = "Tell me and I forget, teach me and I may remember, involve me and I learn"
words = text.lower().replace(',', '').split()
vocab = {word: i for i, word in enumerate(set(words))}
vocab_size = len(vocab)

# Prepare data for the model
inputs = []
targets = []
context_size = 3

for i in range(len(words) - context_size):
    input_idx = [vocab[words[j]] for j in range(i, i + context_size)]
    target_idx = vocab[words[i + context_size]]
    inputs.append(input_idx)
    targets.append(target_idx)

inputs_tensor = torch.tensor(inputs, dtype=torch.long)
targets_tensor = torch.tensor(targets, dtype=torch.long)

# Create dataset and DataLoader
dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Define an RNN model
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SimpleRNN, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # RNN layer, unidirectional and one layer
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        # Output layer
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        # Input shape: (batch_size, sequence_length)
        embeds = self.embeddings(inputs)  # (batch_size, sequence_length, embedding_dim)
        rnn_out, _ = self.rnn(embeds)     # (batch_size, sequence_length, hidden_dim)
        # We use the last RNN output as the representation of the sequence
        final_output = rnn_out[:, -1, :] # (batch_size, hidden_dim)
        out = self.fc(final_output)       # (batch_size, vocab_size)
        return out

# Model parameters
embedding_dim = 10
hidden_dim = 20

# Initialize the RNN model, loss function, and optimizer
model = SimpleRNN(vocab_size, embedding_dim, hidden_dim)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 300
for epoch in range(epochs):
    total_loss = 0
    for context, target in dataloader:
        model.zero_grad()
        log_probs = model(context)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 50 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / len(dataloader)}')
        
# Prediction function
def predict(text):
    words = text.lower().replace(',', '').split()
    input_idx = [vocab.get(word, 0) for word in words[-context_size:]]
    input_tensor = torch.tensor([input_idx], dtype=torch.long)
    with torch.no_grad():
        log_probs = model(input_tensor)
    return max(zip(log_probs[0].exp(), vocab.keys()), key=lambda p: p[0])[1]

# Test the prediction function
print(predict("teach me and"))  # Expected to output 'I', given the training context.


Epoch 0, Loss: 2.3606629967689514
Epoch 50, Loss: 0.936677560210228
Epoch 100, Loss: 0.3950011841952801
Epoch 150, Loss: 0.29567510448396206
Epoch 200, Loss: 0.4735059132799506
Epoch 250, Loss: 0.24506466882303357
i


## 2.1. Testing the prediction function with various input scenarios

In [12]:
# Test case with a context seen during training
# "and I may" is a context similar to the training data, so the model should predict accurately
print(predict("and I may"))  # Expected output: 'remember', as it follows this context in the training data

remember


In [13]:
# Test case with a partially unseen context
# "and I do" includes words seen during training but not this exact sequence
# The model might struggle or default to a more frequent or statistically likely word
print(predict("and I do"))   # Output might be less accurate due to unseen sequence

remember


In [14]:
# Test case with entirely unseen context
# "tell me next" includes 'next' which never appears in the training data
# This showcases how the model handles completely novel input
print(predict("tell me next"))  # Output is unpredictable, illustrating limitations with unseen data

i


In [15]:
# Test case with another unseen context
# "do not tell" is not only a completely unseen phrase but also contains "not," which is not in the training data
# This tests the model's handling of completely novel words and sequences
print(predict("do not tell"))   # The prediction is likely to be unreliable due to the novel context

involve


## 2.2. Enhanced prediction function with detailed probability outputs

In [16]:
# Improved Prediction Function
def predict_all_scores(text):
    """
    Predicts the probabilities of all possible next words based on the input text.
    
    Args:
    text (str): A string of text from which the last few words are taken as context for prediction.
    
    Returns:
    dict: A dictionary of words and their associated probabilities, sorted by likelihood.
    """
    # Normalize the input text: convert to lowercase and remove commas for consistent preprocessing
    words = text.lower().replace(',', '').split()

    # Convert the last few words to indices using the vocabulary.
    # Use a default index (0) if a word is not found in the vocabulary.
    input_idx = [vocab.get(word, 0) for word in words[-context_size:]]

    # Create a tensor from the word indices, suitable for model input
    input_tensor = torch.tensor([input_idx], dtype=torch.long)

    # Disable gradient calculations for efficiency during inference
    with torch.no_grad():
        # Pass the tensor through the model to get log probabilities for each class (word)
        log_probs = model(input_tensor)

        # Apply softmax to convert log probabilities to actual probabilities
        probs = torch.softmax(log_probs, dim=1)

    # Convert the probabilities tensor to a list for easier processing
    all_probs = probs.squeeze().tolist()

    # Map each word in the vocabulary to its predicted probability
    predicted_words_scores = {word: prob for word, idx in vocab.items() for prob in [all_probs[idx]]}

    # Sort the predictions by their probabilities in descending order
    predicted_words_scores = dict(sorted(predicted_words_scores.items(), key=lambda item: item[1], reverse=True))

    return predicted_words_scores

# Test the updated prediction function with different contexts
contexts = ["teach me and", "and I may", "and I do", "tell me next", "do not tell"]
for context in contexts:
    predicted_scores = predict_all_scores(context)
    print(f"Context: '{context}'")
    for word, score in predicted_scores.items():
        print(f'Word: "{word}", Score: {score:.4f}')
    print("\n")


Context: 'teach me and'
Word: "i", Score: 0.9861
Word: "learn", Score: 0.0031
Word: "forget", Score: 0.0027
Word: "may", Score: 0.0023
Word: "me", Score: 0.0017
Word: "tell", Score: 0.0013
Word: "involve", Score: 0.0012
Word: "and", Score: 0.0011
Word: "remember", Score: 0.0005
Word: "teach", Score: 0.0000


Context: 'and I may'
Word: "remember", Score: 0.9604
Word: "teach", Score: 0.0101
Word: "me", Score: 0.0092
Word: "learn", Score: 0.0071
Word: "and", Score: 0.0057
Word: "tell", Score: 0.0025
Word: "may", Score: 0.0025
Word: "forget", Score: 0.0016
Word: "i", Score: 0.0007
Word: "involve", Score: 0.0002


Context: 'and I do'
Word: "remember", Score: 0.9604
Word: "teach", Score: 0.0101
Word: "me", Score: 0.0092
Word: "learn", Score: 0.0071
Word: "and", Score: 0.0057
Word: "tell", Score: 0.0025
Word: "may", Score: 0.0025
Word: "forget", Score: 0.0016
Word: "i", Score: 0.0007
Word: "involve", Score: 0.0002


Context: 'tell me next'
Word: "i", Score: 0.8843
Word: "and", Score: 0.0451
W

## 2.3. Two-Layer RNN

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Set seeds for reproducibility
torch.manual_seed(1)  # Sets the seed for PyTorch random number generators. This is important to ensure that the behavior of any operation that introduces randomness, such as the initialization of weights in neural networks and the shuffling of data in DataLoaders, is consistent across runs. Using a fixed seed value like 42 ensures that every execution of this code will produce the same results given the same inputs and model configuration. This is essential for debugging and comparing the performance impacts of any changes in a controlled manner.

# Text preprocessing
text = "Tell me and I forget, teach me and I may remember, involve me and I learn"
words = text.lower().replace(',', '').split()
vocab = {word: i for i, word in enumerate(set(words))}
vocab_size = len(vocab)

# Prepare data for the model
inputs = []
targets = []
context_size = 3

for i in range(len(words) - context_size):
    input_idx = [vocab[words[j]] for j in range(i, i + context_size)]
    target_idx = vocab[words[i + context_size]]
    inputs.append(input_idx)
    targets.append(target_idx)

inputs_tensor = torch.tensor(inputs, dtype=torch.long)
targets_tensor = torch.tensor(targets, dtype=torch.long)

# Create dataset and DataLoader
dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Define a multi-layer RNN model
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(SimpleRNN, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # RNN with multiple layers
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        # Output layer
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        # Input shape: (batch_size, sequence_length)
        embeds = self.embeddings(inputs)  # (batch_size, sequence_length, embedding_dim)
        rnn_out, _ = self.rnn(embeds)     # (batch_size, sequence_length, hidden_dim)
        # We use the last RNN output as the representation of the sequence
        final_output = rnn_out[:, -1, :] # (batch_size, hidden_dim)
        out = self.fc(final_output)       # (batch_size, vocab_size)
        return out

# Model parameters
embedding_dim = 10
hidden_dim = 20
num_layers = 2  # Number of RNN layers

# Initialize the RNN model with multiple layers, loss function, and optimizer
model = SimpleRNN(vocab_size, embedding_dim, hidden_dim, num_layers)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 300
for epoch in range(epochs):
    total_loss = 0
    for context, target in dataloader:
        model.zero_grad()
        log_probs = model(context)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 50 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / len(dataloader)}')

# Prediction function
def predict(text):
    words = text.lower().replace(',', '').split()
    input_idx = [vocab.get(word, 0) for word in words[-context_size:]]
    input_tensor = torch.tensor([input_idx], dtype=torch.long)
    with torch.no_grad():
        log_probs = model(input_tensor)
    return max(zip(log_probs[0].exp(), vocab.keys()), key=lambda p: p[0])[1]

# Test the prediction function
print(predict("teach me and"))  # Expected to output 'I', given the training context.


Epoch 0, Loss: 2.298316478729248
Epoch 50, Loss: 0.705595888197422
Epoch 100, Loss: 0.3814241513609886
Epoch 150, Loss: 0.2792146746069193
Epoch 200, Loss: 0.2478445852175355
Epoch 250, Loss: 0.43327733781188726
i


In [18]:
# Improved Prediction Function
def predict_all_scores(text):
    """
    Predicts the probabilities of all possible next words based on the input text.
    
    Args:
    text (str): A string of text from which the last few words are taken as context for prediction.
    
    Returns:
    dict: A dictionary of words and their associated probabilities, sorted by likelihood.
    """
    # Normalize the input text: convert to lowercase and remove commas for consistent preprocessing
    words = text.lower().replace(',', '').split()

    # Convert the last few words to indices using the vocabulary.
    # Use a default index (0) if a word is not found in the vocabulary.
    input_idx = [vocab.get(word, 0) for word in words[-context_size:]]

    # Create a tensor from the word indices, suitable for model input
    input_tensor = torch.tensor([input_idx], dtype=torch.long)

    # Disable gradient calculations for efficiency during inference
    with torch.no_grad():
        # Pass the tensor through the model to get log probabilities for each class (word)
        log_probs = model(input_tensor)

        # Apply softmax to convert log probabilities to actual probabilities
        probs = torch.softmax(log_probs, dim=1)

    # Convert the probabilities tensor to a list for easier processing
    all_probs = probs.squeeze().tolist()

    # Map each word in the vocabulary to its predicted probability
    predicted_words_scores = {word: prob for word, idx in vocab.items() for prob in [all_probs[idx]]}

    # Sort the predictions by their probabilities in descending order
    predicted_words_scores = dict(sorted(predicted_words_scores.items(), key=lambda item: item[1], reverse=True))

    return predicted_words_scores

# Test the updated prediction function with different contexts
contexts = ["teach me and", "and I may", "and I do", "tell me next", "do not tell"]
for context in contexts:
    predicted_scores = predict_all_scores(context)
    print(f"Context: '{context}'")
    for word, score in predicted_scores.items():
        print(f'Word: "{word}", Score: {score:.4f}')
    print("\n")


Context: 'teach me and'
Word: "i", Score: 0.9907
Word: "involve", Score: 0.0019
Word: "teach", Score: 0.0015
Word: "me", Score: 0.0014
Word: "tell", Score: 0.0013
Word: "forget", Score: 0.0011
Word: "may", Score: 0.0009
Word: "remember", Score: 0.0007
Word: "learn", Score: 0.0003
Word: "and", Score: 0.0001


Context: 'and I may'
Word: "remember", Score: 0.9600
Word: "and", Score: 0.0152
Word: "teach", Score: 0.0132
Word: "me", Score: 0.0031
Word: "tell", Score: 0.0025
Word: "forget", Score: 0.0018
Word: "may", Score: 0.0015
Word: "i", Score: 0.0012
Word: "learn", Score: 0.0008
Word: "involve", Score: 0.0006


Context: 'and I do'
Word: "remember", Score: 0.9600
Word: "and", Score: 0.0152
Word: "teach", Score: 0.0132
Word: "me", Score: 0.0031
Word: "tell", Score: 0.0025
Word: "forget", Score: 0.0018
Word: "may", Score: 0.0015
Word: "i", Score: 0.0012
Word: "learn", Score: 0.0008
Word: "involve", Score: 0.0006


Context: 'tell me next'
Word: "i", Score: 0.9802
Word: "forget", Score: 0.004