<a href="https://colab.research.google.com/github/Rajesh24mcs115/24mcs115-Exp7-RajesKumarPal/blob/main/Experiment_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk

# Fetch the corpus and the tokenizer resources used by the cells below
for resource in ('gutenberg', 'punkt', 'punkt_tab'):
    nltk.download(resource)

from nltk.corpus import gutenberg

# Read every document of the Gutenberg corpus, join them with newlines,
# then lowercase the combined string.
texts = list(map(gutenberg.raw, gutenberg.fileids()))
text = "\n".join(texts)
text = text.lower()
print("Total length of Gutenberg corpus (characters):", len(text))


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Total length of Gutenberg corpus (characters): 11793335


In [2]:
# Split the corpus into word-level tokens; punctuation marks are kept as
# tokens of their own rather than being dropped.
tokens = nltk.tokenize.word_tokenize(text)
print("Total tokens:", len(tokens))


Total tokens: 2539731


In [3]:
from collections import Counter

# Target vocabulary size (around 300 tokens as required)
vocab_size = 300

# Keep only the vocab_size most frequent tokens, most frequent first
counter = Counter(tokens)
most_common = counter.most_common(vocab_size)
vocab = [pair[0] for pair in most_common]

# Lookup tables in both directions: index -> token and token -> index
idx2word = dict(enumerate(vocab))
word2idx = {word: idx for idx, word in idx2word.items()}

print("Vocabulary Size:", len(vocab))
print("Sample vocabulary:", vocab[:20])


Vocabulary Size: 300
Sample vocabulary: [',', 'the', 'and', '.', 'of', 'to', 'a', 'in', 'i', 'that', ';', 'he', 'it', 'his', 'for', 'was', 'not', 'with', "''", 'is']


In [4]:
# Extract all 4-grams (each 4-gram is a sequence of 4 adjacent tokens),
# keeping only those in which every token belongs to the vocabulary.
#
# Membership is tested against a set and computed once per token: testing
# `word in vocab` against the list form scans the list linearly for every
# one of the roughly 4 * len(tokens) lookups.
vocab_set = set(vocab)
in_vocab = [token in vocab_set for token in tokens]

fourgrams = [
    tokens[i:i + 4]
    for i in range(len(tokens) - 3)
    if all(in_vocab[i:i + 4])
]

print("Total 4-grams extracted:", len(fourgrams))
total_required = 400000 + 50000 + 50000  # Target: 500K total 4-grams
if len(fourgrams) < total_required:
    print("Warning: Not enough 4-grams available. The available data will be used for splitting.")


Total 4-grams extracted: 563786


In [5]:
import numpy as np

# Encode every 4-gram as vocabulary indices: the first three tokens form the
# model input and the fourth token is the target (label).
inputs = np.array([[word2idx[word] for word in gram[:3]] for gram in fourgrams])
labels = np.array([word2idx[gram[3]] for gram in fourgrams])
print("Input shape:", inputs.shape)
print("Labels shape:", labels.shape)


Input shape: (563786, 3)
Labels shape: (563786,)


In [6]:
import random

# Shuffle with a dedicated, seeded generator so that the membership of the
# train/validation/test sets is identical on every run of the notebook.
SPLIT_SEED = 42

num_samples = len(inputs)
indices = list(range(num_samples))
random.Random(SPLIT_SEED).shuffle(indices)

# 80% training, 10% validation, 10% test
train_end = int(0.8 * num_samples)
val_end = int(0.9 * num_samples)

X_train = inputs[indices[:train_end]]
y_train = labels[indices[:train_end]]
X_val = inputs[indices[train_end:val_end]]
y_val = labels[indices[train_end:val_end]]
X_test = inputs[indices[val_end:]]
y_test = labels[indices[val_end:]]

# The three slices must partition the data set exactly.
assert len(X_train) + len(X_val) + len(X_test) == num_samples

print("Training samples:", len(X_train))
print("Validation samples:", len(X_val))
print("Test samples:", len(X_test))


Training samples: 451028
Validation samples: 56379
Test samples: 56379


In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten, Dropout, Input
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization

embedding_dim = 50  # Width of each learned token vector

# ------------------ RNN Model (LSTM) ------------------
# Layers are stacked one at a time; Embedding is created without the
# deprecated `input_length` argument.
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="rnn_embedding"))
rnn_model.add(LSTM(128, return_sequences=False))
rnn_model.add(Dense(128, activation='relu'))
rnn_model.add(Dense(vocab_size, activation='softmax'))

# Build explicitly for inputs of shape (None, 3) so the parameters exist
# before summary() is printed.
rnn_model.build(input_shape=(None, 3))
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
rnn_model.summary()

# ------------------ Transformer Model ------------------
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input. Also used as the
            `key_dim` of the attention layer and as the output width of the
            feed-forward network, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate used after attention and after the feed-forward net.
        **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`),
            forwarded to the base class.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential([Dense(ff_dim, activation="relu"), Dense(embed_dim)])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        """Apply the block to `inputs` and return a tensor of the same shape."""
        # Self-attention: the sequence is both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

# Create the layers first, then wire them together with the functional API.
token_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="transformer_embedding")
encoder_block = TransformerBlock(embed_dim=embedding_dim, num_heads=4, ff_dim=128)
classifier_head = Dense(vocab_size, activation="softmax")

# Three token indices in -> embeddings -> transformer block -> flattened
# features -> probability distribution over the vocabulary.
input_layer = Input(shape=(3,))
embedding_layer = token_embedding(input_layer)
transformer_block = encoder_block(embedding_layer)
flatten = Flatten()(transformer_block)
output_layer = classifier_head(flatten)

transformer_model = Model(inputs=input_layer, outputs=output_layer)
transformer_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
transformer_model.summary()


In [8]:
epochs = 10       # Number of passes over the training set; adjust as needed
batch_size = 128  # Samples per gradient update

# Both models are fitted with the same settings and validation data.
fit_options = dict(epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val))

print("\nTraining RNN Model...")
rnn_model.fit(X_train, y_train, **fit_options)

print("\nTraining Transformer Model...")
transformer_model.fit(X_train, y_train, **fit_options)



Training RNN Model...
Epoch 1/10
[1m3524/3524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 12ms/step - accuracy: 0.1619 - loss: 4.2255 - val_accuracy: 0.2481 - val_loss: 3.4526
Epoch 2/10
[1m3524/3524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 12ms/step - accuracy: 0.2552 - loss: 3.3847 - val_accuracy: 0.2645 - val_loss: 3.3075
Epoch 3/10
[1m3524/3524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 12ms/step - accuracy: 0.2696 - loss: 3.2518 - val_accuracy: 0.2743 - val_loss: 3.2387
Epoch 4/10
[1m3524/3524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 12ms/step - accuracy: 0.2805 - loss: 3.1753 - val_accuracy: 0.2811 - val_loss: 3.1964
Epoch 5/10
[1m3524/3524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 12ms/step - accuracy: 0.2857 - loss: 3.1273 - val_accuracy: 0.2843 - val_loss: 3.1725
Epoch 6/10
[1m3524/3524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 12ms/step - accuracy: 0.2909 - loss: 3.0808 - val_accuracy: 0.2868 

<keras.src.callbacks.history.History at 0x796a2014dbd0>

In [9]:
# Held-out loss and accuracy of the RNN model
rnn_metrics = rnn_model.evaluate(X_test, y_test)
rnn_loss, rnn_acc = rnn_metrics
print("RNN Model - Test Loss:", rnn_loss, "Test Accuracy:", rnn_acc)

# Held-out loss and accuracy of the Transformer model
transformer_metrics = transformer_model.evaluate(X_test, y_test)
transformer_loss, transformer_acc = transformer_metrics
print("Transformer Model - Test Loss:", transformer_loss, "Test Accuracy:", transformer_acc)


[1m1762/1762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.2880 - loss: 3.1390
RNN Model - Test Loss: 3.1303884983062744 Test Accuracy: 0.28923889994621277
[1m1762/1762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.2819 - loss: 3.1868
Transformer Model - Test Loss: 3.1814510822296143 Test Accuracy: 0.2809556722640991


In [12]:
def predict_next_word(model, word_sequence):
    """Predict the most likely next token for a sequence of context words.

    Args:
        model: Model whose `predict` maps a (1, len(word_sequence)) array of
            token indices to one row of per-token scores.
        word_sequence: Iterable of context words.

    Returns:
        The vocabulary token with the highest predicted score.

    Warns:
        UserWarning: if a word is missing from `word2idx`. There is no
            dedicated unknown-word index, so such a word falls back to
            index 0, which is a real vocabulary token; the prediction is then
            made for a different context than the one requested.
    """
    import warnings

    words = list(word_sequence)
    unknown = [word for word in words if word not in word2idx]
    if unknown:
        warnings.warn(
            f"Words not in vocabulary are replaced by {idx2word[0]!r}: {unknown}",
            stacklevel=2,
        )

    # Convert words to indices; default to index 0 if a word is not found
    seq = np.array([word2idx.get(word, 0) for word in words]).reshape(1, -1)
    # verbose=0 suppresses the progress bar printed for every single call
    pred_probs = model.predict(seq, verbose=0)
    predicted_index = int(np.argmax(pred_probs, axis=1)[0])
    return idx2word[predicted_index]

# Three-word contexts for which each model predicts the following word
sequences = [
    ["government", "of", "united"],
    ["city", "of", "new"],
    ["life", "in", "the"],
    ["he", "is", "the"],
    ["at", "the", "end"],
    ["in", "the", "middle"],
    ["this", "is", "a"],
    ["one", "of", "the"],
    ["it", "was", "a"]
]

print("\nNext-word Predictions:")
for seq in sequences:
    # Query the RNN first, then the Transformer, with the same context
    next_word_rnn, next_word_trans = (
        predict_next_word(model, seq) for model in (rnn_model, transformer_model)
    )
    print(f"Input: {seq}")
    print(f"  RNN Prediction: {next_word_rnn}")
    print(f"  Transformer Prediction: {next_word_trans}")



Next-word Predictions:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Input: ['government', 'of', 'united']
  RNN Prediction: as
  Transformer Prediction: and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Input: ['city', 'of', 'new']
  RNN Prediction: and
  Transformer Prediction: and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Input: ['life', 'in', 'the']
  RNN Prediction: world
  Transformer Prediction: world
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Input: ['he', 'is', 'the']
  RNN Prediction: very
  Transformer Prediction: lord
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46

In [17]:
# Pull the trained embedding matrix (first weight of the named Embedding
# layer) out of each model.
rnn_embedding_layer = rnn_model.get_layer("rnn_embedding")
rnn_embeddings = rnn_embedding_layer.get_weights()[0]
transformer_embedding_layer = transformer_model.get_layer("transformer_embedding")
transformer_embeddings = transformer_embedding_layer.get_weights()[0]

def cosine_similarity(vec1, vec2, epsilon=1e-10):
    """Return the cosine similarity of two vectors.

    `epsilon` is added to the product of the norms so that a zero vector
    yields 0 instead of a division by zero.
    """
    numerator = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2) + epsilon
    return numerator / denominator

def find_nearest_words(target_word, embeddings, word2idx, idx2word, top_n=5):
    """Return the words whose embeddings are closest to that of `target_word`.

    Args:
        target_word: Word whose neighbours are wanted.
        embeddings: 2-D array-like with one embedding row per vocabulary index.
        word2idx: Mapping from word to row index.
        idx2word: Mapping from row index to word.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of `(word, similarity)` tuples ordered by descending cosine
        similarity, or a message string when `target_word` is not in
        `word2idx`.
    """
    if target_word not in word2idx:
        return f"Word '{target_word}' not in vocabulary."

    target_idx = word2idx[target_word]
    matrix = np.asarray(embeddings)
    target_vec = matrix[target_idx]

    # Same formula as cosine_similarity (with its default epsilon), evaluated
    # for every row at once.
    row_norms = np.linalg.norm(matrix, axis=1)
    similarities = (matrix @ target_vec) / (row_norms * np.linalg.norm(target_vec) + 1e-10)

    # Stable sort: rows with equal similarity stay in index order.
    order = np.argsort(-similarities, kind="stable")

    # Exclude the target by index rather than by position: when another row
    # ties with it (duplicate or all-zero vectors) the target is not
    # guaranteed to be ranked first.
    neighbours = [int(idx) for idx in order if idx != target_idx][:top_n]
    return [(idx2word[idx], similarities[idx]) for idx in neighbours]

# Using words known to be in the Gutenberg vocabulary:
test_words = ["day", "could", "said", "for"]

print("\n==== RNN Model Nearest Words ====")
for word in test_words:
    rnn_neighbours = find_nearest_words(word, rnn_embeddings, word2idx, idx2word)
    print(f"Nearest words to '{word}' (RNN):", rnn_neighbours)

print("\n==== Transformer Model Nearest Words ====")
for word in test_words:
    transformer_neighbours = find_nearest_words(word, transformer_embeddings, word2idx, idx2word)
    print(f"Nearest words to '{word}' (Transformer):", transformer_neighbours)

def cosine_distance(word1, word2, embeddings, word2idx):
    """Return 1 minus the cosine similarity of the embeddings of two words.

    When either word is missing from `word2idx`, a message string is
    returned instead of a number.
    """
    both_known = word1 in word2idx and word2 in word2idx
    if not both_known:
        return f"One or both words not in vocabulary."
    first, second = (embeddings[word2idx[word]] for word in (word1, word2))
    similarity = cosine_similarity(first, second)
    return 1 - similarity

# Example: how far apart 'said' and 'it' lie in each model's embedding space
distance_rnn, distance_trans = (
    cosine_distance("said", "it", table, word2idx)
    for table in (rnn_embeddings, transformer_embeddings)
)
print(f"\nCosine distance between 'said' and 'it' (RNN): {distance_rnn}")
print(f"Cosine distance between 'said' and 'it' (Transformer): {distance_trans}")



==== RNN Model Nearest Words ====
Nearest words to 'day' (RNN): [('days', np.float32(0.6602367)), ('night', np.float32(0.61013925)), ('side', np.float32(0.5977876)), ('thereof', np.float32(0.510229)), ('city', np.float32(0.48644426))]
Nearest words to 'could' (RNN): [('can', np.float32(0.7042334)), ('should', np.float32(0.6752854)), ('must', np.float32(0.5555664)), ('would', np.float32(0.5505268)), ('shall', np.float32(0.5119374))]
Nearest words to 'said' (RNN): [('saying', np.float32(0.75261194)), ('say', np.float32(0.70133376)), ('saith', np.float32(0.6533459)), ('cried', np.float32(0.5755853)), ('answered', np.float32(0.57381123))]
Nearest words to 'for' (RNN): [('but', np.float32(0.47366503)), ('after', np.float32(0.41693103)), ('upon', np.float32(0.4085696)), ('behold', np.float32(0.39826405)), ('with', np.float32(0.3837102))]

==== Transformer Model Nearest Words ====
Nearest words to 'day' (Transformer): [('night', np.float32(0.65256286)), ('time', np.float32(0.49492806)), ('da