Loading Data

In [1]:
data_path = "Datasets/data.txt"

texts = []

with(open(data_path, "r", encoding="utf-8", errors="ignore") as f):
    for line in f:
        line = line.strip()
        if line:
            texts.append(line)

In [2]:
texts[:5], len(texts)  

(['I never thought I’d see you again after all these years.',
  'Life has strange ways of bringing people back together when least expected.',
  'The evidence doesn’t add up. The fingerprints on the weapon belong to someone who wasn’t even at the crime scene that night.',
  'We’ve tried every possible treatment, but his condition remains stable. The next few hours will be critical for his full recovery.',
  'Your mission is simple: retrieve the stolen data, avoid enemy surveillance, and ensure nobody knows you were ever there.'],
 552)

In [3]:
import random

random.seed(42)
random.shuffle(texts)

We can See a lot of noise of blank strings...So preprocessing Texts

In [4]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9?.!,']+", " ", text)
    text = re.sub(r"[()]", "", text)
    text = re.sub(r"\.{2,}", ".", text)
    text = re.sub(r"\,{2,}", ",", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [5]:
import random

texts = [clean_text(text) for text in texts]
random.seed(42)
random.shuffle(texts)

In [6]:
texts[:5]

['i really appreciate your help with this.',
 'i heard there s a new caf opening downtown next weekend.',
 'do you want to split the tasks to get them done faster?',
 'it s just a flesh wound.',
 'could you help me carry these boxes upstairs?']

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = 5000, oov_token="<oov>")
tokenizer.fit_on_texts(texts)

vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")

2025-07-03 14:36:35.916936: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-03 14:36:35.941583: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-03 14:36:35.941613: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-03 14:36:35.941634: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-03 14:36:35.946581: I tensorflow/core/platform/cpu_feature_g

Vocabulary size: 1502


In [8]:
train_sequences = tokenizer.texts_to_sequences(texts)

In [9]:
train_sequences[:10]

[[5, 62, 110, 12, 111, 21, 10],
 [5, 168, 37, 8, 7, 66, 633, 634, 386, 63, 76],
 [34, 3, 77, 4, 267, 2, 268, 4, 49, 143, 269, 270],
 [9, 8, 59, 7, 635, 636],
 [64, 3, 111, 17, 387, 144, 637, 638],
 [34, 3, 23, 42, 271, 13, 2, 76],
 [2, 388, 639, 100, 35, 10, 640, 6, 44, 641, 642, 643, 60],
 [5, 27, 389, 3, 24, 37, 26, 42, 272, 201],
 [6, 390, 38, 391, 644],
 [2, 202, 15, 645, 646, 3]]

In [10]:
def get_ngrams(sequences):
    ngrams = []
    for seq in sequences:
        for i in range(1, len(seq)):
            n_gram_seq = seq[:i + 1]
            ngrams.append(n_gram_seq)
    
    return ngrams

In [11]:
train_tokens = get_ngrams(train_sequences)

In [12]:
train_tokens[:10]

[[5, 62],
 [5, 62, 110],
 [5, 62, 110, 12],
 [5, 62, 110, 12, 111],
 [5, 62, 110, 12, 111, 21],
 [5, 62, 110, 12, 111, 21, 10],
 [5, 168],
 [5, 168, 37],
 [5, 168, 37, 8],
 [5, 168, 37, 8, 7]]

In [13]:
max_length = max(len(seq) for seq in train_tokens)
max_length

24

In [14]:
# # 160 is a big number so we will keep it to 40
# max_length = 40

# train_tokens = [token for token in train_tokens if len(token) <= max_length]
# val_tokens = [token for token in val_tokens if len(token) <= max_length]
# test_tokens = [token for token in test_tokens if len(token) <= max_length]

# print(f"Final train size: {len(train_tokens)}")
# print(f"Final validation size: {len(val_tokens)}")
# print(f"Final test size: {len(test_tokens)}")

In [15]:
train_tokens = pad_sequences(train_tokens, maxlen=max_length, padding='pre')
print(f"Train tokens shape: {train_tokens.shape}")

Train tokens shape: (5433, 24)


In [16]:
train_tokens[:10]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   5,  62],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   5,  62, 110],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   5,  62, 110,  12],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   5,  62, 110,  12, 111],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   5,  62, 110,  12, 111,  21],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   5,  62, 110,  12, 111,  21,  10],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   5, 168],
       [  0,   0,   0,   0,   0,   0,   0

In [17]:
import tensorflow as tf

X_train = train_tokens[:, :-1]
y_train = train_tokens[:, -1]
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

X_train shape: (5433, 23), y_train shape: (5433,)


In [18]:
X_train[:5], y_train[:5]

(array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   5],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   5,  62],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   5,  62, 110],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   5,  62, 110,  12],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   5,  62, 110,  12, 111]], dtype=int32),
 array([ 62, 110,  12, 111,  21], dtype=int32))

Using GLOVE 6B 100D pretrained model as embedding layer

In [19]:
import numpy as np

# Set your file path
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coeffs

print("Found %s word vectors." % len(embedding_index))

Found 400000 word vectors.


In [20]:
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # Random initialization for OOV words (optional)
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim, ))


In [21]:
embedding_matrix.shape, embedding_matrix[:5]

((1502, 100),
 array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00, 

In [22]:
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization
from keras.models import Sequential
from functools import partial

embedding_dim = 100
Embedding = partial(Embedding, weights=[embedding_matrix], trainable=False)

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length-1, name='embedding_layer'),
    Bidirectional(LSTM(64, name='bidirectional_lstm')),
    Dense(128, activation='relu', name='dense_layer'),
    BatchNormalization(),
    Dropout(0.3, name='dropout_layer'),
    Dense(vocab_size, activation='softmax', name='output_layer')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

2025-07-03 14:36:42.862069: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-07-03 14:36:42.866488: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-07-03 14:36:42.867534: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_layer (Embedding  (None, 23, 100)           150200    
 )                                                               
                                                                 
 bidirectional (Bidirection  (None, 128)               84480     
 al)                                                             
                                                                 
 dense_layer (Dense)         (None, 128)               16512     
                                                                 
 batch_normalization (Batch  (None, 128)               512       
 Normalization)                                                  
                                                                 
 dropout_layer (Dropout)     (None, 128)               0         
                                                        

In [23]:
from keras.callbacks import ReduceLROnPlateau
from helper_functions import create_tensorboard_callback

lr_reduce = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=1, verbose=1)

In [24]:
history = model.fit(X_train,
                    y_train,
                    epochs=50,
                    batch_size=16,
                    callbacks=[create_tensorboard_callback("tensorboard_logs", "glove6B100D_lstm"), lr_reduce])

Saving TensorBoard log files to: tensorboard_logs/glove6B100D_lstm/20250703-143643
Epoch 1/50


2025-07-03 14:36:45.319115: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8600
2025-07-03 14:36:45.987632: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x70f78cb6b100 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-07-03 14:36:45.987657: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-07-03 14:36:45.990532: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-07-03 14:36:46.063261: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 33: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 36: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 37/50
Epoch 38/50
Epoch 38: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 41: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 42/50
Epoch 43/50
Epoch 43: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 46: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 

In [25]:
import tensorflow as tf
from keras.layers import Lambda

preprocessing_steps = Sequential([
    Lambda(lambda x: clean_text(x)),  # Remove URLs
    Lambda(lambda x: tokenizer.texts_to_sequences([x])[0]),  # Tokenize the text
    Lambda(lambda x: pad_sequences([x], maxlen=max_length-1, padding='pre')[0]),  # Pad the sequences
    Lambda(lambda x: tf.expand_dims(x, axis=0))  # Add batch dimension
])


In [56]:
def generate_output(result):
    return tokenizer.index_word.get(result, "")


In [55]:
from keras import layers

final_model = Sequential([
    preprocessing_steps,
    model,
    layers.Lambda(lambda x: tf.squeeze(x), name='squeeze_output_layer'),
    layers.Lambda(lambda x: tf.argmax(x).numpy(), name='argmax_output_layer'),
    layers.Lambda(lambda x: generate_output(x), name='generate_output_layer')
])

In [58]:
texts = ["First rule of the fight club", "Good morning", "Please", "I will make"]

for text in texts:
    text_copy = text
    for _ in range(8):
        result = final_model(text_copy)
        text_copy += " " + result
    print(f"Input: {text}\nGenerated: {text_copy}\n")
    print("-" * 50)
    

Input: First rule of the fight club
Generated: First rule of the fight club is you do not talk about fight club

--------------------------------------------------
Input: Good morning
Generated: Good morning i hope you re doing well today today

--------------------------------------------------
Input: Please
Generated: Please prioritize this task to meet the upcoming deadline

--------------------------------------------------
Input: I will make
Generated: I will make sure to get over the rest of the

--------------------------------------------------


In [59]:
import pickle
final_model.save("Model/glove6B100D_lstm.h5")
pickle.dump(tokenizer, open("Model/tokenizer.pkl", "wb"))

  saving_api.save_model(
