# Simple Language Model with Transformer
We will now explore using a simple version of a Transformer model from PyTorch, a popular deep learning framework.



# Preliminaries

In [None]:
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import requests

# URL to the raw text file on GitHub
url = 'https://raw.githubusercontent.com/RDGopal/IB9CW0-Text-Analytics/main/Data/tinyshakespeare.txt'

# Use requests to get the content of the file. The timeout keeps the cell from
# hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)

# Stop immediately when the download failed: every later step needs the text,
# and continuing would only surface as a confusing NameError on `s_text`.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to retrieve the file. Status code: {response.status_code}"
    )
s_text = response.text

# NOTE(review): recent NLTK releases appear to load the word tokenizer from the
# 'punkt_tab' resource rather than 'punkt' -- confirm against the installed
# version. Requesting it here is harmless when it is not needed.
nltk.download('punkt_tab')

# Tokenize the text
tokens = word_tokenize(s_text)

# Organize the tokens into sentences, Word2Vec needs data in the format of list of lists of tokens.
# The corpus is simply cut into consecutive chunks of 100 tokens.
sentences = [tokens[i:i+100] for i in range(0, len(tokens), 100)]

# Train the CBOW model (sg=0) with 5-dimensional vectors; min_count=1 keeps every token
word2vec_model = Word2Vec(sentences, vector_size=5, window=5, min_count=1, sg=0)

In [None]:
# Generate training data for the GPT model
def generate_training_data(tokens, model_wv, context_size):
    """Build (context window, next word) embedding pairs from a token stream.

    Tokens that are missing from ``model_wv.key_to_index`` are dropped first.
    Every run of ``context_size`` consecutive remaining words then becomes one
    input row, paired with the embedding of the word that follows it.

    Args:
        tokens: Iterable of word strings.
        model_wv: Embedding lookup exposing ``key_to_index`` and
            ``model_wv[word]``.
        context_size: Number of words in each context window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with one entry per window: ``X`` holds
        the stacked context embeddings and ``y`` the target embedding. Both are
        empty when there are not more than ``context_size`` known tokens.
    """
    # Look each embedding up exactly once; the windows below only slice this
    # list instead of repeating the lookup for every window a word appears in.
    embeddings = [model_wv[word] for word in tokens if word in model_wv.key_to_index]

    num_windows = len(embeddings) - context_size
    X = [embeddings[start:start + context_size] for start in range(num_windows)]
    # The target of window `start` is the word at `start + context_size`,
    # i.e. the targets are simply the embeddings shifted by one window length.
    y = embeddings[context_size:] if num_windows > 0 else []
    return np.array(X), np.array(y)

# Each training example uses a window of 5 consecutive words as context
context_size = 5
X, y = generate_training_data(
    tokens=tokens,
    model_wv=word2vec_model.wv,
    context_size=context_size,
)

# Convert both NumPy arrays into float32 tensors for PyTorch
X_train_tensor, y_train_tensor = (
    torch.tensor(array, dtype=torch.float32) for array in (X, y)
)

# GPT Model

The structure of the model is as follows:

![picture](https://drive.google.com/uc?export=view&id=1o45KgGRYze9_G7jI65q4vA5PKTBsGyzc)


Key Elements of the Model are the following.

**Position Embeddings:**

Adds position information to the embeddings, crucial for maintaining sequence order understanding within the transformer.

**Transformer Encoder:**

Processes the entire input sequence with self-attention and feed-forward layers, allowing the model to understand and utilize contextual information from the entire sequence.

**Output Layer:**

Maps the transformer's output (specifically from the last token) back to the embedding dimension. This output is intended to be a prediction of the embedding of the next word.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class EmbeddingPredictorGPT(nn.Module):
    """Transformer encoder that turns a window of embeddings into one predicted embedding.

    The input embeddings receive a learned positional offset, pass through a
    stack of ``nn.TransformerEncoderLayer`` blocks, and the encoder output at
    the last position is projected by a linear layer to an ``embed_dim``-sized
    vector.

    Args:
        embed_dim: Size of each input (and output) embedding.
        num_heads: Number of attention heads in every encoder layer.
        num_layers: Number of stacked encoder layers.
        max_seq_length: Number of positions for which a positional offset is stored.
    """

    def __init__(self, embed_dim, num_heads, num_layers, max_seq_length):
        super().__init__()
        self.embed_dim = embed_dim
        self.max_seq_length = max_seq_length

        # One learnable offset vector per position, initialised to zero
        self.position_embeddings = nn.Parameter(
            torch.zeros(1, max_seq_length, embed_dim)
        )

        # Encoder stack; the feed-forward width is four times the embedding size
        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=4 * embed_dim,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)

        # Projects the encoder output back to an embedding-sized prediction
        self.output_layer = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Predict the next embedding for ``x`` of shape (batch_size, seq_length, embed_dim)."""
        # Only the offsets for the positions actually present are added
        positions = self.position_embeddings[:, :x.size(1), :]
        hidden = self.transformer(x + positions)

        # The prediction is read from the last position of the sequence
        last_token = hidden[:, -1, :]
        return self.output_layer(last_token)


# Loss Function

We will construct a loss function based on the cosine similarity between the predicted embedding and the target embedding. The loss is defined as 1 − cosine similarity, so it ranges from 0 (the two embeddings point in the same direction) to 2 (they point in opposite directions). The closer the final training loss is to 0, the better the model has fit the training data.

**The training takes a long while to complete.**

*I have saved the trained model as `language_model_3.pth`. You can load the trained model to run inference (i.e. next word prediction and text generation).*

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# embed_dim=5 matches the Word2Vec vector_size and max_seq_length=5 matches the
# context_size used to build the training windows.
model = EmbeddingPredictorGPT(embed_dim=5, num_heads=1, num_layers=3, max_seq_length=5)

# Cosine similarity taken along dim=1, i.e. one value per row of a 2-D batch
cosine_similarity = nn.CosineSimilarity(dim=1)


def criterion(output, target):
    """Return the mean cosine dissimilarity between ``output`` and ``target``.

    Each row contributes ``1 - cosine_similarity``: 0 when the two vectors
    point the same way, 1 when they are orthogonal and 2 when they are
    opposite. The per-row values are averaged into a single scalar loss.
    """
    dissimilarity = 1 - cosine_similarity(output, target)
    return dissimilarity.mean()

# Adam updates every parameter of the model with a fixed learning rate
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
# Full-batch training: each epoch pushes the whole training tensor through the
# model in a single forward pass and takes one optimizer step.
for epoch in range(200):  # epochs
    optimizer.zero_grad()  # clear gradients left over from the previous step
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    print(f"Outputs shape: {outputs.shape}, Loss value: {loss}")  # Debugging output shapes and loss
    loss.backward()
    optimizer.step()

    # Compact progress line on every tenth epoch
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')


# Save the entire model
# The module object itself is serialized (not only its state_dict), so the
# EmbeddingPredictorGPT class must be defined when the file is loaded again.
torch.save(model, 'language_model_3.pth')


Load the trained model as follows.

In [None]:
# Later you can load it with
# The file holds the whole pickled model object rather than a state_dict, so
# weights_only has to be switched off explicitly: PyTorch 2.6 and later default
# it to True and would refuse to unpickle the model. Unpickling can execute
# arbitrary code, so only load checkpoint files that come from a trusted source.
gpt_model = torch.load('language_model_3.pth', weights_only=False)
gpt_model.eval()  # Set the model to evaluation mode if you're loading for inference

Compute the loss value for the prediction model on the training data

In [None]:
import torch
import torch.nn as nn

# Row-wise cosine similarity (computed along dim=1)
cosine_similarity = nn.CosineSimilarity(dim=1)


def cosine_dissimilarity(output, target):
    """Return ``1 - cosine_similarity`` for each row of ``output`` and ``target``.

    No reduction is applied: the result holds one dissimilarity value per row.
    """
    similarity = cosine_similarity(output, target)
    return 1 - similarity

# Pure evaluation: gradient tracking is switched off for the forward pass
with torch.no_grad():
    # Predictions of the loaded model for every training window
    training_outputs = gpt_model(X_train_tensor)

    # Average the per-example cosine dissimilarity into one scalar
    training_loss = cosine_dissimilarity(training_outputs, y_train_tensor).mean()

# Report the resulting loss
print(f'Training Loss: {training_loss.item()}')


# Number of parameters
The following code computes the number of trainable parameters in our model.


In [None]:
def count_trainable_parameters(model):
    """Return the total number of elements in the parameters of ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are not counted
        if param.requires_grad:
            total += param.numel()
    return total

# Count the parameters of the loaded model and report the total
trainable_parameters = count_trainable_parameters(gpt_model)
print(f'Number of trainable parameters: {trainable_parameters}')

# Next Word Prediction
* Word2Vec Model (word2vec_model): This model is trained on the sentences extracted from the text and is used to generate the word embeddings.

* EmbeddingPredictorGPT Model (gpt_model): This is the transformer-based model trained on sequences of embeddings to predict the next word's embedding.

* Predict Next Words Function: This function takes an input text, processes it through the trained GPT model (gpt_model), and uses the Word2Vec model (word2vec_model.wv) to find the closest words to the predicted embedding.

In [None]:
def predict_next_words(input_text, model, model_wv, num_predictions=5, context_size=5):
    """Return the vocabulary words closest to the embedding predicted for the next word.

    The text is tokenized, out-of-vocabulary tokens are dropped, and the last
    ``context_size`` remaining words are fed to ``model``. The predicted
    embedding is compared with every embedding in ``model_wv`` by cosine
    similarity.

    Args:
        input_text: Text whose continuation should be predicted.
        model: Trained model mapping a (1, seq_length, embed_dim) tensor to a
            (1, embed_dim) prediction.
        model_wv: Embedding lookup exposing ``key_to_index``, ``index_to_key``
            and ``model_wv[word]``.
        num_predictions: Number of candidate words to return; capped at the
            vocabulary size.
        context_size: Maximum number of trailing words used as context.

    Returns:
        List of words ordered from most to least similar.

    Raises:
        ValueError: If ``context_size`` is smaller than 1 or no word of
            ``input_text`` is in the vocabulary.
    """
    if context_size < 1:
        raise ValueError("context_size must be at least 1")

    tokens = word_tokenize(input_text)
    # Drop unknown words before taking the window, the same order used when
    # the training windows are built, so unknown words do not shrink the context.
    known_words = [word for word in tokens if word in model_wv.key_to_index]
    if not known_words:
        raise ValueError("input_text contains no word from the model vocabulary")
    last_words = known_words[-context_size:]

    input_embeddings = np.array([model_wv[word] for word in last_words])
    input_tensor = torch.tensor(input_embeddings, dtype=torch.float32).unsqueeze(0)  # add batch dimension

    model.eval()
    with torch.no_grad():
        predicted_embedding = model(input_tensor)[0]

    # Stack the vocabulary into one NumPy array first: building a tensor
    # directly from a list of arrays is very slow for a large vocabulary.
    vocabulary = model_wv.index_to_key
    all_embeddings = torch.tensor(
        np.array([model_wv[word] for word in vocabulary]), dtype=torch.float32
    )
    cos = torch.nn.CosineSimilarity(dim=1)
    similarities = cos(predicted_embedding.unsqueeze(0), all_embeddings)

    # topk raises when asked for more entries than exist
    k = min(num_predictions, len(vocabulary))
    top_indices = similarities.topk(k).indices.tolist()
    return [vocabulary[idx] for idx in top_indices]


In [None]:
# Example usage
# Ask the loaded model for the default number of candidate next words (5)
input_text = "to be or not" # @param {type:"string"}
predicted_words = predict_next_words(input_text, gpt_model, word2vec_model.wv)

In [None]:
# Evaluating the bare name shows the list of predicted words as the cell output
predicted_words

# Text Generation
* Temperature: The similarity scores are divided by this value before the softmax is applied. A lower temperature sharpens the resulting distribution, making the model more confident (less random), while a higher temperature flattens it, increasing diversity.

* Probabilistic Choice: Instead of selecting the word with the highest cosine similarity, the function now converts similarities into probabilities (using softmax) and samples from these probabilities, which introduces randomness into the word selection process.

In [None]:
import torch
import torch.nn.functional as F
import numpy as np

def generate_text(model, model_wv, starting_sequence, num_words=5, temperature=1.0, context_size=5):
    """
    Generate text using the trained GPT model and Word2Vec embeddings with added randomness.

    The starting sequence is split on whitespace and words missing from the
    vocabulary are ignored as context. At every step the last ``context_size``
    context words are fed to the model, the predicted embedding is compared
    with every vocabulary embedding by cosine similarity, and the next word is
    sampled from the softmax of ``similarities / temperature``.

    Args:
        model (torch.nn.Module): The trained GPT model.
        model_wv (gensim.models.keyedvectors.KeyedVectors): Word2Vec model's embeddings.
        starting_sequence (str): The initial text sequence to start text generation.
        num_words (int): Number of words to generate.
        temperature (float): Temperature parameter to control randomness of predictions.
            A value of 0 selects the most similar word deterministically.
        context_size (int): Maximum number of trailing words used as context.

    Returns:
        str: The generated text (the starting sequence followed by the new words).

    Raises:
        ValueError: If ``context_size`` is smaller than 1, or if words must be
            generated but no word of ``starting_sequence`` is in the vocabulary.
    """
    if context_size < 1:
        raise ValueError("context_size must be at least 1")

    model.eval()  # Set the model to evaluation mode

    if num_words <= 0:
        return starting_sequence

    # Only in-vocabulary words can be embedded. Filtering before windowing
    # keeps unknown words from shrinking the context.
    context = [word for word in starting_sequence.split() if word in model_wv.key_to_index]
    if not context:
        raise ValueError("starting_sequence contains no word from the model vocabulary")

    # The vocabulary matrix does not change between steps, so build it once.
    # Stacking through NumPy avoids the slow list-of-arrays tensor construction.
    vocabulary = model_wv.index_to_key
    all_embeddings = torch.tensor(
        np.array([model_wv[word] for word in vocabulary]), dtype=torch.float32
    )

    generated = []
    with torch.no_grad():
        for _ in range(num_words):
            window = context[-context_size:]
            input_embeddings = np.array([model_wv[word] for word in window])
            input_tensor = torch.tensor(input_embeddings, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

            # Predict the next word's embedding
            predicted_embedding = model(input_tensor)[0]

            # Similarity of the prediction to every word in the vocabulary
            similarities = F.cosine_similarity(
                predicted_embedding.unsqueeze(0), all_embeddings, dim=1
            )

            if temperature == 0:
                # Dividing by zero would yield inf/nan probabilities and make
                # multinomial fail; the zero-temperature limit is a greedy pick.
                next_word_idx = int(torch.argmax(similarities))
            else:
                # Apply softmax to get probabilities, then sample one index
                probabilities = F.softmax(similarities / temperature, dim=0)
                next_word_idx = torch.multinomial(probabilities, 1).item()

            next_word = vocabulary[next_word_idx]
            generated.append(next_word)
            context.append(next_word)

    # Join once instead of growing the string inside the loop
    return ' '.join([starting_sequence, *generated])

In [None]:
# Example usage
starting_sequence = "to be or not to" # @param {type:"string"}
num_generated_words = 20
# Temperature - Lower for more conservative predictions, higher for more diversity
temperature = 0.48  # @param {type:"slider", min:0, max:1, step:0.01}
# NOTE(review): alpha is not referenced anywhere else in the visible source
alpha = 0.5

# Extend the starting sequence by num_generated_words sampled words
generated_text = generate_text(gpt_model, word2vec_model.wv, starting_sequence, num_generated_words, temperature)

In [None]:
# Evaluating the bare name shows the generated string as the cell output
generated_text