<a href="https://colab.research.google.com/github/RDGopal/IB9CW0-Text-Analytics/blob/main/Simple_Language_Model_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple Language Model with Positional Embeddings
We will expand the previous simple language model by adding positional embeddings. First we go through the preliminaries to get the model ready to train.

In [None]:
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import requests

# URL to the raw text file on GitHub
url = 'https://raw.githubusercontent.com/RDGopal/IB9CW0-Text-Analytics/main/Data/tinyshakespeare.txt'

# Download the corpus. The timeout keeps the cell from hanging indefinitely
# on a stalled connection.
response = requests.get(url, timeout=30)

# Stop here on failure: every later step needs s_text, so carrying on would
# only surface as a confusing NameError further down.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to retrieve the file. Status code: {response.status_code}"
    )
s_text = response.text

# Print the first 500 characters
print(s_text[:500])

# Tokenize the text
tokens = word_tokenize(s_text)

# Word2Vec needs a list of token lists. The corpus is treated as one long
# token stream and cut into consecutive chunks of 100 tokens (the last chunk
# may be shorter); these chunks play the role of "sentences".
sentences = [tokens[i:i+100] for i in range(0, len(tokens), 100)]

# Train the CBOW model (sg=0) with 5-dimensional vectors; min_count=1 keeps
# every token that occurs in the corpus.
word2vec_model = Word2Vec(sentences, vector_size=5, window=5, min_count=1, sg=0)

# Tokenizer
Once we get the embeddings, we will store the words (tokens), token ids, and embeddings in a dataframe.

Note that the number of distinct words (tokens) is 14310. This is the size of our vocabulary.

In [None]:
import pandas as pd
# Tabulate the vocabulary: one row per token holding the token itself, its
# position in the Word2Vec vocabulary (used as the token id), and its
# embedding converted from a NumPy array to a plain list.
data = {
    'word': list(word2vec_model.wv.index_to_key),
    'token_id': list(range(len(word2vec_model.wv.index_to_key))),
    'embedding': [
        word2vec_model.wv[token].tolist()
        for token in word2vec_model.wv.index_to_key
    ],
}

df = pd.DataFrame(data)
print(df)

## Positional Embeddings
Positional embeddings are used primarily to incorporate information about the position of tokens in the input sequence into the model. The idea is to add a vector to each token's embedding that represents its position in the sequence, ensuring that the order of tokens contributes to the model's understanding.

In the original Transformer architecture, positional embeddings are computed using sine and cosine functions of different frequencies:

$$
\begin{aligned}
\text{PE}(pos, 2i) &= \sin\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right) \\
\text{PE}(pos, 2i+1) &= \cos\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)
\end{aligned}
$$


Where:

$pos$ is the position of the token in the sequence.

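$i$ indexes the sine/cosine pair, so $2i$ and $2i+1$ are the even and odd dimensions of the embedding and both use the same frequency.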

$d_{\text{model}}$ is the dimensionality of the token embeddings.

This formula helps the model to differentiate positions by providing a unique signal for each position, and the repeating patterns allow the model to generalize to sequence lengths that it has not seen before.

In [None]:
import numpy as np

def get_positional_embeddings(sequence_length, embedding_dim):
    """Return the sinusoidal positional-embedding matrix.

    Implements the Transformer formulation

        PE(pos, 2i)   = sin(pos / 10000 ** (2i / embedding_dim))
        PE(pos, 2i+1) = cos(pos / 10000 ** (2i / embedding_dim))

    Here ``i`` is the index of the sine/cosine *pair*, so columns ``2i`` and
    ``2i + 1`` share one frequency. For a column ``j`` the pair index is
    ``j // 2``; using ``j`` itself in the exponent would give every column
    its own frequency and make the frequencies fall off twice as fast.

    NOTE: a network trained on positional embeddings computed differently
    must be retrained (or fed the values it was trained with), because these
    values are added directly to its inputs.

    Parameters
    ----------
    sequence_length : int
        Number of positions (rows) to generate.
    embedding_dim : int
        Size of each embedding (columns).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(sequence_length, embedding_dim)``; even columns
        hold sines and odd columns hold cosines.
    """
    positions = np.arange(sequence_length, dtype=float)[:, np.newaxis]
    columns = np.arange(embedding_dim)

    # 2i for each column: 0, 0, 2, 2, 4, 4, ...
    exponents = (2 * (columns // 2)) / embedding_dim
    angles = positions / np.power(10000.0, exponents)

    positional_embeddings = np.zeros((sequence_length, embedding_dim))
    positional_embeddings[:, 0::2] = np.sin(angles[:, 0::2])
    positional_embeddings[:, 1::2] = np.cos(angles[:, 1::2])
    return positional_embeddings

You can view positional embeddings as follows.

In [None]:
# Print plain decimals rounded to three places instead of scientific notation
np.set_printoptions(precision=3, suppress=True)

# Positional embeddings for 5 positions with 5 values per position
pos_embeddings_matrix = get_positional_embeddings(sequence_length=5, embedding_dim=5)
print(pos_embeddings_matrix)

# Training Data
Our main objective is to predict the next word (token) based on the previous 5 words (tokens). Thus, our context length is 5.

We will prepare the training data such that the inputs are 5 consecutive words (tokens) and the output to be predicted is the 6th word (token). If the input has fewer than 5 words (tokens), we will pad it with \<pad>.

In [None]:
def generate_training_data(sentences, model_wv, window_size=5):
    """Build (context, next-word) training pairs from tokenised sentences.

    A window of ``window_size`` consecutive known words slides over each
    sentence. The input is the words' embeddings plus the positional
    embeddings, flattened into one vector; the target is the embedding of
    the word that follows the window.

    Only complete windows that have a following word are used, so a sentence
    with ``n`` known words yields ``n - window_size`` examples (none when
    ``n <= window_size``) and no padding is ever required here.

    Parameters
    ----------
    sentences : iterable of list of str
        Tokenised sentences.
    model_wv : word-vector lookup
        Must support ``word in model_wv``, ``model_wv[word]`` and
        ``model_wv.vector_size`` (the caller in this notebook passes
        ``word2vec_model.wv``).
    window_size : int, default 5
        Number of context words per example.

    Returns
    -------
    X : numpy.ndarray
        One flattened ``window_size * vector_size`` input per row.
    y : numpy.ndarray
        Target embedding for each row of ``X``.
    sequence_texts : list of str
        The context words of each example, joined by spaces.
    next_words : list of str
        The target word of each example.
    """
    embedding_dim = model_wv.vector_size
    positional_embeddings = get_positional_embeddings(window_size, embedding_dim)

    X, y = [], []
    sequence_texts = []  # Context words, kept for inspection
    next_words = []  # Target words, kept for inspection

    for sentence in sentences:
        # Filter once so the words and their vectors stay aligned.
        word_sentence = [word for word in sentence if word in model_wv]
        embedded_sentence = [model_wv[word] for word in word_sentence]

        # Each start index needs window_size context words plus one target.
        for start in range(len(word_sentence) - window_size):
            end_ix = start + window_size
            context = np.array(embedded_sentence[start:end_ix])

            X.append((context + positional_embeddings).flatten())
            y.append(embedded_sentence[end_ix])
            sequence_texts.append(' '.join(word_sentence[start:end_ix]))
            next_words.append(word_sentence[end_ix])

    return np.array(X), np.array(y), sequence_texts, next_words

# Turn the 100-token chunks into (context, next word) examples using the
# trained word vectors.
X_train, y_train, train_sequences, train_next_words = generate_training_data(
    sentences=sentences,
    model_wv=word2vec_model.wv,
)

# One row per training example: the readable words next to the numeric
# vectors that are actually fed to the network.
train_df = pd.DataFrame(
    data={
        'Sequence': train_sequences,
        'Next Word': train_next_words,
        'X_train (Flattened Embeddings)': list(X_train),
        'y_train (Embedding)': list(y_train),
    }
)


In [None]:
# Preview the first few training examples
train_df.head()

Our training dataset has 241,779 data points. Each data point has 6 words (tokens), and thus the total number of words (tokens) for training is 241,779 * 6 = 1,450,674.

In [None]:
# (number of training examples, number of columns) of the training table
train_df.shape

# Neural Network Design
The neural network we will train has the same structure as before, with the only exception that the positional embeddings are added to the embeddings of the input words (tokens).

![picture](https://drive.google.com/uc?export=view&id=1TA3f22fqppMMSwKYze-cAOWKpv7Vx3TB)




In [None]:
from keras.models import Sequential
from keras.layers import Dense

def build_model(input_dim, hidden_neurons, output_dim):
    """Create and compile a feed-forward network with one hidden layer.

    Parameters
    ----------
    input_dim : int
        Length of each input vector.
    hidden_neurons : int
        Number of units in the ReLU hidden layer.
    output_dim : int
        Length of the output vector.

    Returns
    -------
    A compiled Keras ``Sequential`` model using the Adam optimizer and
    mean-squared-error loss.
    """
    model = Sequential()
    # Hidden layer; input_dim fixes the length of the vectors it accepts.
    model.add(Dense(hidden_neurons, input_dim=input_dim, activation='relu'))
    # Linear output so the network emits a raw embedding vector.
    model.add(Dense(output_dim, activation='linear'))
    model.compile(loss='mse', optimizer='adam')
    return model


# Build and train the model
**Takes long to train - several hours on my machine**

**I have saved the trained model (`language_model_2.h5`). To run the trained model, you simply have to load the saved model and run it.**

In [None]:
# 25 inputs (5 context words x 5 embedding values), 10 hidden units and
# 5 outputs (one embedding)
nn_model = build_model(input_dim=25, hidden_neurons=10, output_dim=5)

# Fit on the full training set for 20 epochs, one example per batch
nn_model.fit(x=X_train, y=y_train, batch_size=1, epochs=20)

from keras.models import load_model

# Write the trained network to 'language_model_2.h5' in the working directory
nn_model.save('language_model_2.h5')

Save the model for later use

# Load the trained model

In [None]:
from keras.models import load_model

# Restore the network previously written to 'language_model_2.h5'
loaded_model = load_model('language_model_2.h5')

Compute the loss function over the training data.

In [None]:
# The evaluation inputs must be prepared exactly as they were for training;
# here the training set itself is scored.
loss = loaded_model.evaluate(x=X_train, y=y_train)
print(f"Loss: {loss}")

The above results indicate that this is a better language model than the earlier version (v1).

# Next Word Prediction
Once the model is trained, we can use it for the next word (token) prediction.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def predict_next_words(model, input_sequence, word_vectors, top_n=5):
    """Return the ``top_n`` vocabulary words closest to the model's prediction.

    The model predicts an embedding for ``input_sequence``; that embedding is
    compared by cosine similarity with every row of ``word_vectors.vectors``.

    Returns a list of ``(word, similarity)`` tuples, most similar first.
    """
    # The model works on batches, so wrap the single input in a batch of one.
    batch = np.array([input_sequence])
    predicted_embedding = model.predict(batch)[0]

    # Similarity of the predicted embedding to each word in the vocabulary
    similarities = cosine_similarity([predicted_embedding], word_vectors.vectors)[0]

    # Sorting the negated scores puts the highest similarity first
    ranked_indices = np.argsort(-similarities)

    closest_words = []
    for index in ranked_indices[:top_n]:
        closest_words.append((word_vectors.index_to_key[index], similarities[index]))
    return closest_words


Run the prediction model

In [None]:
# Test the loaded model
test_sequence = "who for many years" # @param {type:"string"}
test_tokens = word_tokenize(test_sequence)
test_embedded = [word2vec_model.wv[word] for word in test_tokens if word in word2vec_model.wv]
test_input = np.concatenate(test_embedded[:5])  # Simplified example

vector_size = 5 # the model expects 5 words in the prompt

# Ensure there are exactly 5 embeddings, pad if fewer
if len(test_embedded) < 5:
    # Pad with zero-filled vectors
    test_embedded += [np.zeros(vector_size) for _ in range(5 - len(test_embedded))]

# Flatten the list of embeddings to match input shape, and ensure it's truncated to exactly 5 words
test_input = np.concatenate(test_embedded[:5])

predicted_words = predict_next_words(loaded_model, test_input, word2vec_model.wv)


In [None]:
# List the candidate words only; the similarity scores are not shown
print("Predicted next words:")
for word, _similarity in predicted_words:
    print(word)

# Text Generation with Randomness
To make the outputted text more interesting, we will inject a bit of randomness.

Text generation function

In [None]:
def predict_next_words_with_probabilities(model, input_sequence, word_vectors, top_n=20):
    """Return the ``top_n`` candidate next words with sampling probabilities.

    The model's predicted embedding is compared by cosine similarity with
    every row of ``word_vectors.vectors``. The ``top_n`` highest scores are
    turned into probabilities with a softmax taken over those scores only.

    Returns a list of ``(word, probability)`` tuples, most similar first.
    """
    # The model works on batches, so wrap the single input in a batch of one.
    batch = np.array([input_sequence])
    predicted_embedding = model.predict(batch)[0]

    # Similarity of the predicted embedding to each word in the vocabulary
    similarities = cosine_similarity([predicted_embedding], word_vectors.vectors)[0]

    # Indices and scores of the top_n most similar words
    best_indices = np.argsort(-similarities)[:top_n]
    best_scores = similarities[best_indices]

    # Softmax over the selected scores
    top_probabilities = np.exp(best_scores) / np.sum(np.exp(best_scores))

    # Renormalise so the probabilities sum to 1
    top_probabilities /= top_probabilities.sum()

    candidates = []
    for rank, index in enumerate(best_indices):
        candidates.append((word_vectors.index_to_key[index], top_probabilities[rank]))
    return candidates



In [None]:
def generate_text(model, initial_text, word_vectors, num_words, vector_size=5, context_length=5):
    """Extend ``initial_text`` by sampling ``num_words`` words from the model.

    At each step the most recent ``context_length`` embeddings (zero-padded
    at the end when fewer are available) are combined with the positional
    embeddings, flattened and passed to
    ``predict_next_words_with_probabilities``. The next word is drawn at
    random from the returned candidates according to their probabilities.

    Parameters
    ----------
    model : trained network exposing ``predict``
    initial_text : str
        Prompt to continue. Its tokens always appear in the output; tokens
        missing from ``word_vectors`` are left out of the model's context.
    word_vectors : word-vector lookup
        Must support ``word in word_vectors`` and ``word_vectors[word]``.
    num_words : int
        Number of words to generate.
    vector_size : int, default 5
        Size of each word embedding.
    context_length : int, default 5
        Number of context words the model expects.

    Returns
    -------
    str
        The prompt tokens followed by the generated words, space-separated.

    Raises
    ------
    ValueError
        If ``context_length`` is smaller than 1.
    """
    if context_length < 1:
        raise ValueError("context_length must be at least 1")

    tokens = word_tokenize(initial_text)
    current_embeddings = [word_vectors[word] for word in tokens if word in word_vectors]

    generated_words = tokens.copy()

    # The network was trained on word embeddings PLUS positional embeddings,
    # so the same signal is added at generation time. It is identical at
    # every step and therefore computed once.
    positional_embeddings = get_positional_embeddings(context_length, vector_size)

    for _ in range(num_words):
        window = current_embeddings[-context_length:]
        padding = [np.zeros(vector_size) for _ in range(context_length - len(window))]

        input_sequence = (np.array(window + padding) + positional_embeddings).flatten()

        next_word_options = predict_next_words_with_probabilities(model, input_sequence, word_vectors)

        words, probabilities = zip(*next_word_options)

        # np.random.choice requires p to sum to 1; renormalise to absorb
        # any floating-point drift.
        probabilities = np.array(probabilities)
        probabilities /= probabilities.sum()

        next_word = np.random.choice(words, p=probabilities)

        generated_words.append(next_word)
        current_embeddings.append(word_vectors[next_word])

    return ' '.join(generated_words)



In [None]:
%%capture

# Prompt and length are Colab form fields; the generated passage is stored
# in generated_text.
initial_text = "who for many years" # @param {type:"string"}
num_words_to_generate = 40 # @param {type:"integer"}
generated_text = generate_text(
    model=loaded_model,
    initial_text=initial_text,
    word_vectors=word2vec_model.wv,
    num_words=num_words_to_generate,
)

In [None]:
# Display the generated passage
generated_text

While this is a better language model than the earlier version, given that this is a 'toy' example, the outputs generated are still not very impressive.