In [None]:
# Mount Google Drive at /content/drive; the corpus, saved model and tokenizer
# paths used further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import os
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model

In [3]:

# Function to read and split file into sentences
def file_to_sentence_list(file_path):
    """Read a UTF-8 text file and return its sentences as a list of strings.

    The text is cut at every run of whitespace that directly follows a
    '.', '!' or '?'. Each piece is stripped of surrounding whitespace and
    pieces that end up empty are discarded.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    sentences = []
    # The look-behind keeps the terminating punctuation attached to its sentence
    for fragment in re.split(r'(?<=[.!?])\s+', raw_text):
        cleaned = fragment.strip()
        if cleaned:
            sentences.append(cleaned)

    return sentences

In [4]:
# Training corpus location on the mounted Drive
file_path= '/content/drive/MyDrive/pizza.txt'
# Load text data as a list of sentence strings (one entry per sentence)
text_data = file_to_sentence_list(file_path)


In [7]:
# Tokenize the text data: build the word -> integer vocabulary from the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
# Vocabulary size plus one: Keras word indices start at 1, so the extra slot
# covers index 0. total_words later sizes the Embedding input, the one-hot
# labels and the softmax output.
total_words = len(tokenizer.word_index) + 1
print(total_words)

687


In [8]:
# Create input sequences: for each encoded sentence, every prefix of two or
# more tokens becomes one training sample.
input_sequences = [
    encoded[:prefix_len]
    for encoded in tokenizer.texts_to_sequences(text_data)
    for prefix_len in range(2, len(encoded) + 1)
]


In [None]:
# Longest n-gram length; used as the common padded length of all samples
max_sequence_len = max(map(len, input_sequences))
print(max_sequence_len)

40


In [None]:

# Pad every n-gram at the front up to max_sequence_len so they form one 2-D array
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# Predictors are all columns but the last; the last column is the label
x, y = input_sequences[:, :-1], input_sequences[:, -1]
print(x.shape)
print(y.shape)

# One-hot encode the labels across the whole vocabulary
y = to_categorical(y, num_classes=total_words)
print(y.shape)
print(y)


(1628, 39)
(1628,)
(1628, 687)
[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [None]:

# Define the model: embedding -> two stacked LSTMs -> softmax over the vocabulary
model = Sequential([
    # 10-dimensional embedding per token id; inputs are max_sequence_len-1 tokens long
    Embedding(total_words, 10, input_length=max_sequence_len-1),
    # First LSTM returns its full output sequence so the next LSTM can consume it
    LSTM(130, return_sequences=True),
    # 20% Dropout between the recurrent layers to limit overfitting
    Dropout(0.2),
    # Second LSTM keeps only its final output
    LSTM(100),
    # Output layer: one probability per vocabulary index
    Dense(total_words, activation='softmax'),
])

# Compile for multi-class classification against the one-hot labels
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# EarlyStopping callback: monitors val_loss, tolerates 15 epochs without
# improvement, then restores the best weights seen
early_stopping = EarlyStopping(restore_best_weights=True, patience=15, monitor='val_loss')





In [10]:
# Train the model.
# The EarlyStopping callback created with the model monitors `val_loss`, so it
# needs validation data and must be handed to fit(); otherwise it is never
# consulted and all 500 epochs always run. validation_split=0.1 holds out the
# last 10% of the samples (taken before shuffling) for that purpose.
history = model.fit(
    x,
    y,
    epochs=500,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1,
)


In [None]:

# Rebind `model` to the copy stored on Drive, replacing whatever model object
# the name held before.
# NOTE(review): no cell visible here writes next_word_model.keras (there is no
# model.save call in view) — confirm the file exists before running this cell.
model = load_model("/content/drive/MyDrive/Colab Notebooks/next_word_model.keras")  # Load the saved model


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

# Corpus the tokenizer vocabulary is rebuilt from
file_path = "/content/drive/MyDrive/pizza.txt"

# Read the file line by line, trimming surrounding whitespace from each line
with open(file_path, "r", encoding="utf-8") as file:
    training_texts = [line.strip() for line in file.readlines()]

# Fit a fresh tokenizer on those lines; this rebinds the global `tokenizer`.
# NOTE(review): the vocabulary is fit on lines here rather than on the
# sentence list used earlier — confirm both produce the same word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_texts)

# Persist the tokenizer to Google Drive
with open('/content/drive/MyDrive/Colab Notebooks/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

print("Tokenizer saved successfully!")


Tokenizer saved successfully!


In [None]:

def predict_next_words(model, tokenizer, input_text, num_words=5, max_sequence_length=max_sequence_len):
    """Greedily extend ``input_text`` by ``num_words`` predicted words.

    Each round encodes the text generated so far, pads it at the front to
    ``max_sequence_length - 1`` tokens, asks the model for next-word
    probabilities and appends the highest-probability word.

    Args:
        model: Object whose ``predict(padded, verbose=0)`` returns the
            per-index probabilities for the single padded input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        input_text: Seed text to continue.
        num_words: How many words to append.
        max_sequence_length: Padded length of the training sequences
            (predictors plus label); the model input is one token shorter.
            The default is whatever the global ``max_sequence_len`` held when
            this function was defined, not when it is called.

    Returns:
        ``input_text`` followed by the predicted words, separated by single
        spaces. A predicted index that has no entry in
        ``tokenizer.word_index`` is rendered as ``"<UNK>"``.
    """
    # The index -> word lookup is the same for every prediction round, so
    # build it once instead of once per generated word.
    word_map = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_words):
        # Convert the text generated so far to a sequence of token ids
        sequence = tokenizer.texts_to_sequences([input_text])

        # Pad the sequence to match the model's expected input size
        padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length-1, padding='pre')

        # Predict the next-word distribution
        predicted_probs = model.predict(padded_sequence, verbose=0)

        # Index of the most probable word for the single input row
        predicted_index = np.argmax(predicted_probs, axis=-1)[0]

        # Convert index to word, falling back to a placeholder for unmapped indices
        predicted_word = word_map.get(predicted_index, "<UNK>")

        # Append the predicted word so it becomes context for the next round
        input_text += " " + predicted_word

    return input_text


In [None]:
# Demo: continue the seed word "Pizza" with 15 predicted words and print the result
input_text = "Pizza"
predicted_sentence = predict_next_words(model, tokenizer, input_text, num_words=15, max_sequence_length=max_sequence_len)
print("Generated Sentence:", predicted_sentence)


Generated Sentence: Pizza has become a symbol of comfort happiness and celebration and its iconic triangular slices have


In [None]:
# Demo: continue the seed word "Technology" with 15 predicted words and print the result
input_text = "Technology"
predicted_sentence = predict_next_words(model, tokenizer, input_text, num_words=15, max_sequence_length=max_sequence_len)
print("Generated Sentence:", predicted_sentence)


Generated Sentence: Technology will play a significant role in shaping the future of pizza making lies and delivery


In [None]:
# Demo: continue the seed word "India" with 30 predicted words and print the result.
# NOTE(review): if "india" is not in the tokenizer vocabulary, the seed would
# presumably encode to an empty sequence and the first prediction would be made
# from padding alone — verify against the corpus.
input_text = "India"
predicted_sentence = predict_next_words(model, tokenizer, input_text, num_words=30, max_sequence_length=max_sequence_len)
print("Generated Sentence:", predicted_sentence)


Generated Sentence: India is much more than a delicious dish—it is a culinary phenomenon that has captured the hearts and palates of people around the world of process of the beloved fluffy crust
