**Importing modules**

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer #type:ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences #type: ignore 
from tensorflow.keras.callbacks import EarlyStopping # type: ignore
import tensorflow.keras.utils as ku 
import pandas as pd 
import numpy as np
import string
# from model import create_model #type: ignore
import warnings
import pickle
warnings.filterwarnings('ignore')


**Loading dataset**

In [2]:
# Load the news-headlines CSV and keep only the headline text column.
# The path uses a forward slash: the previous "Data\india-news-headlines.csv"
# depended on "\i" being an unrecognised escape sequence (newer Python versions
# warn about it) and on the Windows path separator. "/" works on Windows,
# Linux and macOS.
data=pd.read_csv("Data/india-news-headlines.csv")
data=data['headline_text']

# Plain Python list of headlines, in file order
headlines=list(data)

In [3]:
def clean_data(text):
    """Normalise one headline.

    The string is lowercased, every character listed in ``string.punctuation``
    is removed, and any non-ASCII character is dropped.

    Args:
        text: the raw headline (a ``str``).

    Returns:
        The cleaned, ASCII-only, lowercase string.
    """
    lowered=text.lower()
    # A translation table with only a "delete" set strips the punctuation characters
    without_punctuation=lowered.translate(str.maketrans('','',string.punctuation))
    # Encode to UTF-8 bytes, then decode as ASCII while ignoring undecodable bytes:
    # every non-ASCII character disappears from the result
    ascii_only=without_punctuation.encode('utf8').decode('ascii','ignore')
    return ascii_only

In [4]:
# Apply clean_data to every headline; the last expression displays how many were processed
text=list(map(clean_data,headlines))
len(text)

3876557

**Tokenizing and creating a sequence of tokens**

In [5]:
# Shared tokenizer: fitted inside generate_sequences() and reused by the later cells
tokenizer=Tokenizer() 
def generate_sequences(text):
    """Turn cleaned headlines into padded n-gram training data.

    Fits the module-level ``tokenizer`` on ``text`` (this mutates the shared
    tokenizer), expands every headline into its prefixes of length >= 2
    (``[w1, w2]``, ``[w1, w2, w3]``, ...), left-pads them with zeros to a common
    length and splits each padded row into inputs (all but the last token) and
    target (the last token, one-hot encoded).

    Args:
        text: a re-iterable collection (e.g. a list) of cleaned headline strings.

    Returns:
        A tuple ``(predictors, label, max_sequence_len, total_words)`` where
        ``predictors`` has ``max_sequence_len - 1`` columns, ``label`` is the
        one-hot target with ``total_words`` columns, ``max_sequence_len`` is the
        length of the longest n-gram and ``total_words`` is the vocabulary size
        plus one.

    Raises:
        ValueError: if no headline contains at least two known tokens, so no
            n-gram sequence can be built.
    """
    # Tokenizing the text
    tokenizer.fit_on_texts(text)
    # +1 so that every index in word_index is a valid class id for to_categorical
    total_words=len(tokenizer.word_index)+1

    # Generating sequences: one batch call converts all headlines to token ids,
    # then every prefix with at least two tokens becomes a training sequence
    sequences=[]
    for token_list in tokenizer.texts_to_sequences(text):
        for i in range(1,len(token_list)):
            sequences.append(token_list[:i+1])

    if not sequences:
        raise ValueError("generate_sequences needs at least one text with two or more tokens")

    max_sequence_len=max(len(seq) for seq in sequences)
    # pad_sequences already returns a 2-D numpy array, zero-padded on the left
    sequences=pad_sequences(sequences,maxlen=max_sequence_len,padding='pre')

    # Last column is the word to predict, everything before it is the model input
    predictors,label=sequences[:,:-1],sequences[:,-1]
    # One-hot targets: one column per class, so this array grows with the vocabulary size
    label=ku.to_categorical(label,num_classes=total_words)

    return predictors,label,max_sequence_len,total_words
    

In [6]:
# Persist the tokenizer so it can be reloaded for inference.
# NOTE: fit_on_texts() is only called inside generate_sequences(). If this cell
# runs before generate_sequences() has been called, the pickled tokenizer has an
# empty vocabulary. warnings are silenced in the import cell, so print instead.
if not tokenizer.word_index:
    print("Warning: tokenizer has not been fitted yet; tokenizer.pkl will contain an "
          "empty vocabulary. Re-run this cell after generate_sequences().")
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [7]:
# Only a subset of the cleaned headlines is used to build the training arrays
N_TRAIN_HEADLINES=21000
predictors,label,max_sequence_len,total_words=generate_sequences(text[:N_TRAIN_HEADLINES])

# generate_sequences() has just fitted the shared tokenizer, so save it now:
# this guarantees tokenizer.pkl holds the fitted vocabulary after a top-to-bottom run
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [8]:
# Longest n-gram length, then vocabulary size (+1), one value per line
print(max_sequence_len,total_words,sep="\n")

17
19177


**Building the model**

In [9]:
from tensorflow.keras.models import Sequential #type: ignore
from tensorflow.keras.layers import Input,Embedding,SimpleRNN,Dense,Dropout #type: ignore 


# Each padded n-gram gives max_sequence_len-1 input tokens; its last token is the label
input_len=max_sequence_len-1

# Setting up the model

model=Sequential()

# Input layer: declares the sequence length explicitly. This replaces the
# `input_length` argument of Embedding, which is deprecated in recent Keras
# versions, and lets Keras build the model without a manual model.build() call.
model.add(Input(shape=(input_len,)))

# Embedding layer: maps each token id (< total_words) to a 32-dimensional vector
model.add(Embedding(total_words,32))

# RNN Layer
model.add(SimpleRNN(256))

# Dropout Layer
model.add(Dropout(0.2))

# Output Layer: one probability per class, matching the one-hot labels
model.add(Dense(total_words,activation='softmax'))

model.compile(loss='categorical_crossentropy',optimizer='adam')
model.summary()


**Setting up the callbacks**

In [10]:
from tensorflow.keras.callbacks import EarlyStopping #type: ignore 
# Stop training once the training loss has not improved for 5 consecutive epochs
# and roll back to the best weights seen.
# Keras logs the training loss under the name 'loss'; the previous value
# 'training_loss' is not a logged metric, so the callback could never trigger.
early_stopping=EarlyStopping(
        monitor='loss',
        patience=5,
        restore_best_weights=True
    )

**Training the model**


In [11]:
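# NOTE: training is currently disabled. While the line below stays commented out,
# the model keeps its randomly initialised weights, so any text generated further
# down is not meaningful. Uncomment it to train the model.
# hist=model.fit(predictors,label,epochs=200,batch_size=64,callbacks=[early_stopping],verbose=1)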

In [16]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the model's most likely next word.

    Uses the module-level ``tokenizer`` to convert text to token ids and back.

    Args:
        seed_text: the starting text.
        next_words: how many words to append.
        model: an object whose ``predict`` returns one score per class for a
            padded batch of token ids.
        max_sequence_len: the sequence length used when the training data was
            built; inputs are padded/truncated to ``max_sequence_len - 1`` tokens.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    for _ in range(next_words):
        # Encode the text generated so far and bring it to the model's input length
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])

        # index_word is the tokenizer's id -> word mapping, so no scan over
        # word_index is needed. An id without a word yields an empty string,
        # exactly as the previous linear search did.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [18]:
# Generate 15 words after the seed "maybe" and show the resulting headline
generated_headline=generate_text("maybe",15,model,max_sequence_len)
print(generated_headline)

Maybe Stocks Conman Safaris Ticketholders Moser Sister Greets Safaris Sonia Philosopher Demystifying Raichur Canon Chattopadhyay Hudson
