In [None]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
import tensorflow as tf

from numpy.random import seed
tf.random.set_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
all_headlines = []
# Directory that holds the article CSV files. Despite the variable name this is a
# hard-coded path (presumably a Google Drive mount in Colab), not the process's
# current working directory.
cwd = '/content/drive/My Drive/Colab Notebooks/input/'

# os.listdir returns entries in arbitrary order; sorting makes the file picked
# below the same on every run.
for filename in sorted(os.listdir(cwd)):
    if 'Articles' in filename:
        article_df = pd.read_csv(os.path.join(cwd, filename))
        all_headlines.extend(list(article_df.headline.values))
        # Only the first matching file is loaded.
        break

# Preview the first ten headlines.
all_headlines[:10]

['N.F.L. vs. Politics Has Been Battle All Season Long',
 'Voice. Vice. Veracity.',
 'A Stand-Up’s Downward Slide',
 'New York Today: A Groundhog Has Her Day',
 'A Swimmer’s Communion With the Ocean',
 'Trail Activity',
 'Super Bowl',
 'Trump’s Mexican Shakedown',
 'Pence’s Presidential Pet',
 'Fruit of a Poison Tree']

In [None]:
# Dataset preparation
# Dataset cleaning
# In the dataset preparation step, we first clean the text: remove punctuation, lower-case every word, and drop non-ASCII characters.

def clean_text(txt):
    """Return ``txt`` lower-cased with ASCII punctuation and non-ASCII characters removed."""
    punctuation = string.punctuation
    kept_chars = [ch for ch in txt if ch not in punctuation]
    lowered = "".join(kept_chars).lower()
    # Round-trip through bytes so that anything outside ASCII (e.g. curly
    # apostrophes, which string.punctuation does not contain) is discarded.
    ascii_only = lowered.encode("utf8").decode("ascii", 'ignore')
    return ascii_only

# Clean every headline, then preview the first ten results.
corpus = list(map(clean_text, all_headlines))
corpus[:10]

['nfl vs politics has been battle all season long',
 'voice vice veracity',
 'a standups downward slide',
 'new york today a groundhog has her day',
 'a swimmers communion with the ocean',
 'trail activity',
 'super bowl',
 'trumps mexican shakedown',
 'pences presidential pet',
 'fruit of a poison tree']

In [None]:
# Generating sequences of n-gram tokens
# Module-level tokenizer: fitted inside get_sequence_of_tokens and reused by
# generate_text to encode seed text and look words up by index.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the module-level tokenizer on ``corpus`` and build n-gram prefixes.

    Every line is converted to its list of token ids, and each prefix of that
    list with at least two tokens becomes one training sequence.

    Returns:
        (input_sequences, total_words): the list of prefix sequences and the
        size of the tokenizer's word index plus one.
    """
    # Learn the vocabulary (this mutates the shared tokenizer).
    tokenizer.fit_on_texts(corpus)
    total_words = 1 + len(tokenizer.word_index)

    input_sequences = []
    for line in corpus:
        encoded = tokenizer.texts_to_sequences([line])[0]
        # Prefixes of length 2 .. len(encoded); a one-token line yields nothing.
        input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return input_sequences, total_words
# Build the n-gram training sequences and the vocabulary size, then preview
# the first ten sequences.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[661, 118],
 [661, 118, 73],
 [661, 118, 73, 74],
 [661, 118, 73, 74, 662],
 [661, 118, 73, 74, 662, 663],
 [661, 118, 73, 74, 662, 663, 64],
 [661, 118, 73, 74, 662, 663, 64, 30],
 [661, 118, 73, 74, 662, 663, 64, 30, 211],
 [212, 664],
 [212, 664, 665]]

In [None]:
#Padding the Sequences and obtain Variables : Predictors and Target
def generate_padded_sequences(input_sequences):
    """Left-pad the sequences to a common length and split inputs from targets.

    The last token of each padded sequence is the target; everything before
    it is the predictor. Targets are one-hot encoded using the module-level
    ``total_words`` as the number of classes.

    Returns:
        (predictors, label, max_sequence_len)
    """
    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    padded = np.array(padded)

    # All columns but the last feed the model; the last column is what it must predict.
    predictors = padded[:, :-1]
    targets = padded[:, -1]
    label = ku.to_categorical(targets, num_classes=total_words)
    return predictors, label, max_sequence_len

# Turn the n-gram sequences into model inputs (predictors), one-hot targets
# (label) and the common padded length.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [None]:
#LSTMs for Text Generation
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Args:
        max_sequence_len: length of the padded sequences; the model input is
            one token shorter.
        total_words: vocabulary size, used for both the embedding input
            dimension and the softmax output width.

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy loss, Adam optimizer).
    """
    input_len = max_sequence_len - 1

    layers = [
        # Input embedding: token id -> 10-dimensional vector.
        Embedding(total_words, 10, input_length=input_len),
        # Single hidden LSTM layer with 100 units.
        LSTM(100),
        # Output distribution over the vocabulary.
        Dense(total_words, activation='softmax'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Build the network sized to the padded sequence length and vocabulary, then
# print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 16, 10)            22890     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_2 (Dense)              (None, 2289)              231189    
Total params: 298,479
Trainable params: 298,479
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Let's train our model now: 100 epochs, with verbose=2 logging one line per epoch.
model.fit(predictors, label, epochs=100, verbose=2)

Epoch 1/100
 - 3s - loss: 7.3612
Epoch 2/100
 - 3s - loss: 6.8417
Epoch 3/100
 - 3s - loss: 6.7173
Epoch 4/100
 - 3s - loss: 6.6330
Epoch 5/100
 - 3s - loss: 6.5459
Epoch 6/100
 - 3s - loss: 6.4481
Epoch 7/100
 - 3s - loss: 6.3421
Epoch 8/100
 - 3s - loss: 6.2283
Epoch 9/100
 - 3s - loss: 6.1202
Epoch 10/100
 - 3s - loss: 6.0134
Epoch 11/100
 - 3s - loss: 5.9201
Epoch 12/100
 - 3s - loss: 5.8292
Epoch 13/100
 - 3s - loss: 5.7401
Epoch 14/100
 - 3s - loss: 5.6484
Epoch 15/100
 - 3s - loss: 5.5573
Epoch 16/100
 - 3s - loss: 5.4686
Epoch 17/100
 - 3s - loss: 5.3758
Epoch 18/100
 - 3s - loss: 5.2838
Epoch 19/100
 - 3s - loss: 5.1927
Epoch 20/100
 - 3s - loss: 5.1008
Epoch 21/100
 - 3s - loss: 5.0120
Epoch 22/100
 - 3s - loss: 4.9253
Epoch 23/100
 - 3s - loss: 4.8371
Epoch 24/100
 - 3s - loss: 4.7550
Epoch 25/100
 - 3s - loss: 4.6685
Epoch 26/100
 - 3s - loss: 4.5841
Epoch 27/100
 - 3s - loss: 4.5037
Epoch 28/100
 - 3s - loss: 4.4201
Epoch 29/100
 - 3s - loss: 4.3401
Epoch 30/100
 - 3s - lo

<keras.callbacks.callbacks.History at 0x7f4b780534e0>

In [None]:
#Generating the text
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    Args:
        seed_text: starting text; it is encoded with the module-level tokenizer.
        next_words: number of words to append.
        model: trained model whose ``predict`` returns one score per vocabulary index.
        max_sequence_len: padded sequence length used in training; the model
            input is one token shorter.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    # Build the index -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Sequential.predict_classes was removed in TensorFlow 2.6; taking the
        # argmax of the predicted scores is the documented replacement.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # An index with no word (e.g. 0) appends an empty string, as before.
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
# Generate a 20-word continuation for each seed phrase.
for seed_phrase in ("super bowl", "trail", "trumps "):
    print(generate_text(seed_phrase, 20, model, max_sequence_len))

Super Bowl Is Very Concerned Out A New Tax Leaves To Investors Odd Man Of Freely First First Help From The Common
Trail Disclosure Wall Street Zen And Investors Tower Bridge Rick Scorn Off Side Of Shrugs Kind College How Much Does It
Trumps  Acrostic Rivals Wear Robes Flexibility Its I Times Are The Hell Half Premiere Door Areas Off Liberal Of Kennedy Them
