Loading Data

In [87]:
data_path = "Datasets/data.txt"

texts = []

with(open(data_path, "r", encoding="utf-8", errors="ignore") as f):
    for line in f:
        line = line.strip()
        if line:
            texts.append(line)

In [88]:
texts[:5], len(texts)  

(['I never thought I’d see you again after all these years.',
  'Life has strange ways of bringing people back together when least expected.',
  'The evidence doesn’t add up. The fingerprints on the weapon belong to someone who wasn’t even at the crime scene that night.',
  'We’ve tried every possible treatment, but his condition remains stable. The next few hours will be critical for his full recovery.',
  'Your mission is simple: retrieve the stolen data, avoid enemy surveillance, and ensure nobody knows you were ever there.'],
 432)

We can See a lot of noise of blank strings...So preprocessing Texts

In [89]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9?.!,']+", " ", text)
    text = re.sub(r"[()]", "", text)
    text = re.sub(r"\.{2,}", ".", text)
    text = re.sub(r"\,{2,}", ",", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [90]:
import random

texts = [clean_text(text) for text in texts]
random.seed(42)
random.shuffle(texts)

In [91]:
texts[:5]

['i ll notify you if there are any significant updates.',
 'sometimes to love someone, you got to be a stranger.',
 'i ll make sure to double check the details.',
 'hasta la vista, baby this mission is terminated.',
 'you re gonna need a bigger team if you want to win this battle.']

In [92]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = 5000, oov_token="<oov>")
tokenizer.fit_on_texts(texts)

vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 1324


In [93]:
train_sequences = tokenizer.texts_to_sequences(texts)

In [94]:
train_sequences[:10]

[[5, 34, 305, 3, 22, 32, 26, 39, 211, 212],
 [97, 4, 111, 213, 3, 161, 4, 17, 7, 306],
 [5, 34, 76, 162, 4, 307, 214, 2, 163],
 [530, 531, 532, 308, 9, 215, 13, 533],
 [3, 38, 164, 43, 7, 127, 91, 22, 3, 128, 4, 309, 9, 534],
 [6, 66, 310, 7, 535, 112, 40, 113, 58],
 [2,
  536,
  8,
  216,
  217,
  537,
  67,
  538,
  539,
  22,
  6,
  51,
  11,
  540,
  6,
  19,
  541,
  16,
  542,
  10,
  218,
  311,
  39,
  312],
 [35, 59, 543, 24, 2, 92, 544],
 [41, 545, 11, 546, 4, 20, 10, 5, 165, 23, 21, 114, 77, 11],
 [547,
  19,
  17,
  313,
  98,
  16,
  2,
  314,
  15,
  548,
  22,
  60,
  78,
  549,
  4,
  550,
  36,
  2,
  219]]

In [95]:
def get_ngrams(sequences):
    ngrams = []
    for seq in sequences:
        for i in range(1, len(seq)):
            n_gram_seq = seq[:i + 1]
            ngrams.append(n_gram_seq)
    
    return ngrams

In [96]:
train_tokens = get_ngrams(train_sequences)

In [97]:
train_tokens[:10]

[[5, 34],
 [5, 34, 305],
 [5, 34, 305, 3],
 [5, 34, 305, 3, 22],
 [5, 34, 305, 3, 22, 32],
 [5, 34, 305, 3, 22, 32, 26],
 [5, 34, 305, 3, 22, 32, 26, 39],
 [5, 34, 305, 3, 22, 32, 26, 39, 211],
 [5, 34, 305, 3, 22, 32, 26, 39, 211, 212],
 [97, 4]]

In [98]:
max_length = max(len(seq) for seq in train_tokens)
max_length

24

In [99]:
# # 160 is a big number so we will keep it to 40
# max_length = 40

# train_tokens = [token for token in train_tokens if len(token) <= max_length]
# val_tokens = [token for token in val_tokens if len(token) <= max_length]
# test_tokens = [token for token in test_tokens if len(token) <= max_length]

# print(f"Final train size: {len(train_tokens)}")
# print(f"Final validation size: {len(val_tokens)}")
# print(f"Final test size: {len(test_tokens)}")

In [100]:
train_tokens = pad_sequences(train_tokens, maxlen=max_length, padding='pre')
print(f"Train tokens shape: {train_tokens.shape}")

Train tokens shape: (4237, 24)


In [101]:
train_tokens[:10]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   5,  34],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   5,  34, 305],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   5,  34, 305,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   5,  34, 305,   3,  22],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   5,  34, 305,   3,  22,  32],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   5,  34, 305,   3,  22,  32,  26],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   5,  34, 305,   3,  22,  32,  26,  39],
       [  0,   0,   0,   0,   0,   0,   0

In [102]:
import tensorflow as tf

X_train = train_tokens[:, :-1]
y_train = train_tokens[:, -1]
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

X_train shape: (4237, 23), y_train shape: (4237,)


In [103]:
X_train[:5], y_train[:5]

(array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   5],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   5,  34],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   5,  34, 305],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   5,  34, 305,   3],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   5,  34, 305,   3,  22]], dtype=int32),
 array([ 34, 305,   3,  22,  32], dtype=int32))

Using GLOVE 6B 100D pretrained model as embedding layer

In [104]:
import numpy as np

# Set your file path
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coeffs

print("Found %s word vectors." % len(embedding_index))

Found 400000 word vectors.


In [105]:
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # Random initialization for OOV words (optional)
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim, ))


In [106]:
embedding_matrix.shape, embedding_matrix[:5]

((1324, 100),
 array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00, 

In [107]:
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization
from keras.models import Sequential
from functools import partial

embedding_dim = 100
Embedding = partial(Embedding, weights=[embedding_matrix], trainable=False)

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length-1, name='embedding_layer'),
    Bidirectional(LSTM(32, name='bidirectional_lstm')),
    Dense(64, activation='relu', name='dense_layer'),
    BatchNormalization(),
    Dropout(0.3, name='dropout_layer'),
    Dense(vocab_size, activation='softmax', name='output_layer')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_layer (Embedding  (None, 23, 100)           132400    
 )                                                               
                                                                 
 bidirectional_3 (Bidirecti  (None, 64)                34048     
 onal)                                                           
                                                                 
 dense_layer (Dense)         (None, 64)                4160      
                                                                 
 batch_normalization_3 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dropout_layer (Dropout)     (None, 64)                0         
                                                      

In [108]:
from keras.callbacks import ReduceLROnPlateau
from helper_functions import create_tensorboard_callback

lr_reduce = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=1, verbose=1)

In [109]:
history = model.fit(X_train,
                    y_train,
                    epochs=50,
                    batch_size=16,
                    callbacks=[create_tensorboard_callback("tensorboard_logs", "glove6B100D_lstm"), lr_reduce])

Saving TensorBoard log files to: tensorboard_logs/glove6B100D_lstm/20250616-025808
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 38: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 45: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 50: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.


In [110]:
import tensorflow as tf
from keras.layers import Lambda

preprocessing_steps = Sequential([
    Lambda(lambda x: clean_text(x)),  # Remove URLs
    Lambda(lambda x: tokenizer.texts_to_sequences([x])[0]),  # Tokenize the text
    Lambda(lambda x: pad_sequences([x], maxlen=max_length-1, padding='pre')[0]),  # Pad the sequences
    Lambda(lambda x: tf.expand_dims(x, axis=0))  # Add batch dimension
])


In [111]:
final_model = Sequential([
    preprocessing_steps,
    model
])

In [122]:
texts = ["First rule of the fight club", "Good morning", "Please", "I will make him"]

for text in texts:
    text_copy = text
    for _ in range(8):
        result = final_model(text_copy)
        next_word_index = np.argmax(result[0])
        next_word = tokenizer.index_word.get(next_word_index, "")
        if not next_word:
            break
        text_copy += " " + next_word
    print(f"Input: {text}\nGenerated: {text_copy}\n")
    print("-" * 50)
    

Input: First rule of the fight club
Generated: First rule of the fight club is you do not talk about fight club

--------------------------------------------------
Input: Good morning
Generated: Good morning i hope your lives extraordinary lives extraordinary lives

--------------------------------------------------
Input: Please
Generated: Please let me know if you foresee any potential

--------------------------------------------------
Input: I will make him
Generated: I will make him to offer but blood toil tears and sweat

--------------------------------------------------


In [123]:
model.save("Model/glove6B100D_lstm.h5")

  saving_api.save_model(
