In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/english-sentences-30000/english_sentences_AIproject.xlsx


In [2]:
# Import the necessary libraries and modules
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense



In [3]:
# Load the sentence dataset from the Excel workbook into a DataFrame
df = pd.read_excel(
    "/kaggle/input/english-sentences-30000/english_sentences_AIproject.xlsx"
)

In [4]:
# Preview the first five rows (head() defaults to n=5)
print(df.head())

   Sr No                                          Sentences
0      1                             I have to go to sleep.
1      2  Today is June 18th and it is my sister's birth...
2      3                        I am going to turn 20 soon.
3      4              You should never share your password.
4      5                               I will be back soon.


In [5]:
# Collect the non-missing entries of the 'Sentences' column as a plain Python list
sentences = df.loc[:, 'Sentences'].dropna().to_list()

In [6]:
# Build the tokenizer and learn the vocabulary from the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Vocabulary size: one more than the number of indexed words, which leaves
# room for index 0 (the value that shows up as padding in the padded sequences)
total_words = 1 + len(tokenizer.word_index)

# Accumulator for the n-gram sequences built in the next step
input_sequences = list()

In [7]:
# Start from an empty list so that re-running this cell does not append
# duplicate sequences (or fail once input_sequences has become an array)
input_sequences = []

for line in sentences:
    # Skip missing (NaN) entries
    if pd.notna(line):

        # Tokenize the current line into a list of integer word indices
        token_list = tokenizer.texts_to_sequences([line])[0]

        # Every prefix of at least two tokens becomes one training sample:
        # all tokens but the last are the input, the last token is the target
        input_sequences.extend(
            token_list[:end] for end in range(2, len(token_list) + 1)
        )

In [8]:
# Left-pad every sequence up to the length of the longest one so that all
# rows of the resulting array have the same width
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)

In [9]:
# Inspect the padded sequences
print(input_sequences)

[[   0    0    0 ...    0    5   13]
 [   0    0    0 ...    5   13    2]
 [   0    0    0 ...   13    2   43]
 ...
 [   0    0    0 ...   10 1785   11]
 [   0    0    0 ... 1785   11    1]
 [   0    0    0 ...   11    1   87]]


In [10]:
# Split each padded sequence into model input and target:
#   X -> every column except the last one
#   y -> the last column (the word to predict)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [11]:
# One-hot encode the target word indices.
# The labels are read from input_sequences (not from y) so that re-running this
# cell does not one-hot encode an already encoded array. to_categorical already
# returns a NumPy array, so no extra np.array() copy of this large matrix is made.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [12]:
# Build the next-word model as a plain stack of layers
model = Sequential()

# Embedding: maps each word index to a 100-dimensional vector;
# each input holds max_sequence_len-1 tokens (the last token is the target)
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))

# LSTM layer with 150 units
model.add(LSTM(150))

# Softmax output with one unit per vocabulary entry
model.add(Dense(total_words, activation='softmax'))

# summary() prints the architecture table itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 68, 100)           1167700   
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 11677)             1763227   
                                                                 
Total params: 3,081,527
Trainable params: 3,081,527
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Configure training: categorical cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on all (X, y) pairs for 100 epochs with progress output enabled
model.fit(X, y, verbose=1, epochs=100)

In [None]:
# Create a function to predict the next words
def predict(seed_text,next_words):
    """Extend ``seed_text`` with ``next_words`` predicted words and print the result.

    Each iteration tokenizes the current text, left-pads it to the model's
    input length, takes the most probable next word index from the model and
    appends the matching word, so every prediction sees the words added before it.

    Args:
        seed_text: Text to start from.
        next_words: Number of words to append.

    Returns:
        None. The extended text is printed.

    Relies on the module-level ``tokenizer``, ``model`` and ``max_sequence_len``.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]

        # Pad the sequence to match the input length of the model
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # verbose=0 keeps the per-call progress bar out of the output
        probabilities = model.predict(token_list, verbose=0)

        # The batch holds a single sequence, so take its arg-max as a plain int
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Direct index -> word lookup instead of scanning word_index;
        # an index with no word falls back to an empty string
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word

    print(seed_text)

In [None]:
# Number of words to append to the seed text
next_words = 2

# Ask the user for the seed text, then print it extended by the predicted words
seed_text = input("Enter your statement: \n")
predict(seed_text, next_words)

In [None]:
# Save the model to 'predictormodel_kaggle.h5' (a relative path, so the file
# is written to the current working directory)
model.save('predictormodel_kaggle.h5')