<a href="https://colab.research.google.com/github/Paras-Tiwari-18/NextWordPredector/blob/main/NextWordPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('webtext')
nltk.download('punkt')

[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from nltk.corpus import brown, reuters, webtext
all_words = []
for fileid in webtext.fileids():
    all_words.extend(webtext.words(fileid))

print("Total words before cleaning:", len(all_words))

Total words before cleaning: 396733


In [3]:
import re
unwanted_words = set([
    "19teens", "fuck", "shit", "bitch", "www", "com", "http", "te", "o", "k", "e", "n", "start", "shadowbots"
])
cleaned_words = []
for w in all_words:
    w = w.lower().strip()
    if w in unwanted_words:
        continue
    if re.match(r"^[^a-z]+$", w):
        continue
    if len(w) == 1:
        continue
    cleaned_words.append(w)

print("Total words after cleaning:", len(cleaned_words))

Total words after cleaning: 282146


In [4]:
max_tokens = 40000
token_subset = cleaned_words[:max_tokens]
print(f"Using {len(token_subset)} tokens for training")

Using 40000 tokens for training


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
text = " ".join(token_subset)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1
print("Total unique words:", total_words)

token_list = tokenizer.texts_to_sequences([text])[0]


Total unique words: 3923


In [6]:
input_sequences = []
max_len = 15

for i in range(2, len(token_list) + 1):
    start = max(0, i - max_len)
    n_gram_seq = token_list[start:i]
    input_sequences.append(n_gram_seq)

print("Number of sequences:", len(input_sequences))

max_sequence_len = max(len(seq) for seq in input_sequences)
print("Max sequence length:", max_sequence_len)



Number of sequences: 40047
Max sequence length: 15


In [7]:
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

X = input_sequences[:, :-1]
y = input_sequences[:, -1]


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training samples:", X_train.shape)
print("Validation samples:", X_val.shape)


Training samples: (32037, 14)
Validation samples: (8010, 14)


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

model = Sequential([
    Embedding(input_dim=total_words, output_dim=128, input_length=max_sequence_len - 1),

    Bidirectional(LSTM(256, return_sequences=True)),
    Dropout(0.3),

    Bidirectional(LSTM(192, return_sequences=True)),
    Dropout(0.3),

    Bidirectional(LSTM(128)),
    Dropout(0.3),

    Dense(256, activation='relu'),
    Dropout(0.3),

    Dense(total_words, activation='softmax')
])
model.build(input_shape=(None, max_sequence_len - 1))
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()






In [10]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint(
    'nextword_best_model.keras',
    monitor='accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)

history = model.fit(
    X_train, y_train,
    epochs=350,
    batch_size=128,
    validation_data=(X_val, y_val),
    callbacks=[checkpoint]
)


Epoch 1/350
[1m250/251[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.0224 - loss: 7.0463
Epoch 1: accuracy improved from -inf to 0.02485, saving model to nextword_best_model.keras
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 27ms/step - accuracy: 0.0225 - loss: 7.0442 - val_accuracy: 0.0266 - val_loss: 6.6686
Epoch 2/350
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.0266 - loss: 6.5641
Epoch 2: accuracy improved from 0.02485 to 0.02978, saving model to nextword_best_model.keras
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.0266 - loss: 6.5641 - val_accuracy: 0.0363 - val_loss: 6.6329
Epoch 3/350
[1m250/251[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.0355 - loss: 6.4508
Epoch 3: accuracy improved from 0.02978 to 0.03699, saving model to nextword_best_model.keras
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [13]:
model.save("nextword_model_v1.h5")

import pickle
with open("tokenizer_v1.pkl", "wb") as f:
    pickle.dump(tokenizer, f)




In [14]:
import tensorflow as tf
from tensorflow.keras.models import load_model

tf.config.run_functions_eagerly(True)

model = load_model("nextword_model_v1.h5")

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

history = model.fit(X_train, y_train, epochs=2,batch_size=256, validation_data=(X_val, y_val))




Epoch 1/2




[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 248ms/step - accuracy: 0.8209 - loss: 0.6376 - val_accuracy: 0.0607 - val_loss: 30.1849
Epoch 2/2
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 255ms/step - accuracy: 0.8270 - loss: 0.6056 - val_accuracy: 0.0603 - val_loss: 30.8366


In [None]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
model = load_model("nextword_model_v1.h5")
with open("tokenizer_v1.pkl", "rb") as f:
    tokenizer = pickle.load(f)




In [17]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
def sample(preds, temperature=0.6):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(preds), p=preds)
def predict_next_word_safe(seed_text, tokenizer, max_sequence_len, model, temperature=0.8):
    banned_words = {
        "19teens", "fuck", "shit", "bitch", "www", "com", "http",
        "te", "o", "k", "e", "n", "start", "shadowbots"
    }
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)[0]
    while True:
        predicted_index = sample(predicted_probs, temperature)
        if predicted_index == 0:
            continue
        word = tokenizer.index_word.get(predicted_index, None)
        if word and word not in banned_words and len(word) > 1:
            return word
test_sentences = [
    "I really like",
    "He is not",
    "They are going",
    "We have to",
    "I was just",
    "You can try",
    "It feels like",
    "Don’t forget to",
    "Can you make",
    "How can I",
    "Why don’t you",
    "I should probably",
    "Everything is so",
    "Do you think",
    "I hope you",
    "I am trying to",
    "You must be",
    "We should go",
    "Please make sure to",
    "It is very",
    "I think you",
    "You will need to"
]
for seed in test_sentences:
    next_word = predict_next_word_safe(seed, tokenizer, max_sequence_len, model, temperature=0.8)
    print(f"Seed: {seed} → Next word: {next_word}")


Seed: I really like → Next word: item
Seed: He is not → Next word: blocked
Seed: They are going → Next word: backward
Seed: We have to → Next word: enter
Seed: I was just → Next word: shouldn
Seed: You can try → Next word: for
Seed: It feels like → Next word: blank
Seed: Don’t forget to → Next word: load
Seed: Can you make → Next word: keyboard
Seed: How can I → Next word: load
Seed: Why don’t you → Next word: disappear
Seed: I should probably → Next word: specific
Seed: Everything is so → Next word: listed
Seed: Do you think → Next word: open
Seed: I hope you → Next word: click
Seed: I am trying to → Next word: detect
Seed: You must be → Next word: default
Seed: We should go → Next word: back
Seed: Please make sure to → Next word: licensing
Seed: It is very → Next word: slow
Seed: I think you → Next word: render
Seed: You will need to → Next word: load


In [None]:
import tensorflow as tf
from tensorflow.keras.models import load_model

tf.config.run_functions_eagerly(True)

model = load_model("nextword_model_v1.h5")

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

history = model.fit(X_train, y_train, epochs=1, validation_data=(X_val, y_val))
