In [1]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

Using TensorFlow backend.


In [2]:
curr_dir = '/mnt/data/shreya/textgeneration/data_new_york_times/'
all_headlines = []
for filename in os.listdir(curr_dir):
    if 'Articles' in filename:
        article_df = pd.read_csv(curr_dir + filename)
        all_headlines.extend(list(article_df.headline.values))
        break

all_headlines[:10]

['Rhythm of the Streets: ‘We’re Warrior Women, and Yes, We Can Play’',
 'As Deficit Grows, Congress Keeps Spending',
 'Lesson in Select Bus Service',
 'Here’s the Real State of the Union',
 'Good Riddance to Chief Wahoo',
 'In South Africa, Facing ‘Day Zero’ With No Water',
 'How Trump’s Critics Should Respond',
 'Unknown',
 'A Republican Stalwart Sets Out on a Quest to Unseat Cuomo as Governor',
 'Unknown']

In [3]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [clean_text(x) for x in all_headlines]
corpus[:10]

['rhythm of the streets were warrior women and yes we can play',
 'as deficit grows congress keeps spending',
 'lesson in select bus service',
 'heres the real state of the union',
 'good riddance to chief wahoo',
 'in south africa facing day zero with no water',
 'how trumps critics should respond',
 'unknown',
 'a republican stalwart sets out on a quest to unseat cuomo as governor',
 'unknown']

In [4]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[754, 4],
 [754, 4, 1],
 [754, 4, 1, 229],
 [754, 4, 1, 229, 162],
 [754, 4, 1, 229, 162, 755],
 [754, 4, 1, 229, 162, 755, 85],
 [754, 4, 1, 229, 162, 755, 85, 7],
 [754, 4, 1, 229, 162, 755, 85, 7, 163],
 [754, 4, 1, 229, 162, 755, 85, 7, 163, 39],
 [754, 4, 1, 229, 162, 755, 85, 7, 163, 39, 49]]

In [12]:
total_words
len (corpus)

905

In [5]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)


In [6]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 16, 10)            25130     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_1 (Dense)              (None, 2513)              253813    
Total params: 323,343
Trainable params: 323,343
Non-trainable params: 0
_________________________________________________________________


In [7]:
model.fit(predictors, label, epochs=100, verbose=2)


Epoch 1/100
 - 7s - loss: 7.4745
Epoch 2/100
 - 5s - loss: 6.9596
Epoch 3/100
 - 5s - loss: 6.8357
Epoch 4/100
 - 5s - loss: 6.7396
Epoch 5/100
 - 5s - loss: 6.6327
Epoch 6/100
 - 5s - loss: 6.5019
Epoch 7/100
 - 5s - loss: 6.3774
Epoch 8/100
 - 5s - loss: 6.2492
Epoch 9/100
 - 5s - loss: 6.1180
Epoch 10/100
 - 5s - loss: 5.9959
Epoch 11/100
 - 5s - loss: 5.8791
Epoch 12/100
 - 5s - loss: 5.7673
Epoch 13/100
 - 5s - loss: 5.6643
Epoch 14/100
 - 5s - loss: 5.5663
Epoch 15/100
 - 5s - loss: 5.4747
Epoch 16/100
 - 5s - loss: 5.3862
Epoch 17/100
 - 5s - loss: 5.3005
Epoch 18/100
 - 5s - loss: 5.2173
Epoch 19/100
 - 5s - loss: 5.1355
Epoch 20/100
 - 5s - loss: 5.0546
Epoch 21/100
 - 5s - loss: 4.9747
Epoch 22/100
 - 5s - loss: 4.8968
Epoch 23/100
 - 5s - loss: 4.8175
Epoch 24/100
 - 5s - loss: 4.7423
Epoch 25/100
 - 5s - loss: 4.6694
Epoch 26/100
 - 5s - loss: 4.5917
Epoch 27/100
 - 5s - loss: 4.5168
Epoch 28/100
 - 5s - loss: 4.4438
Epoch 29/100
 - 5s - loss: 4.3740
Epoch 30/100
 - 5s - lo

<keras.callbacks.History at 0x7feba2eb3fd0>

In [13]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()


In [16]:
print (generate_text("united states", 5, model, max_sequence_len))
print (generate_text("preident trump", 4, model, max_sequence_len))
print (generate_text("donald trump", 4, model, max_sequence_len))
print (generate_text("india and china", 4, model, max_sequence_len))
print (generate_text("new york", 4, model, max_sequence_len))
print (generate_text("science and technology", 5, model, max_sequence_len))
print (generate_text("Corruption ", 3, model, max_sequence_len))

United States Awards Stages Into Soapboxes For
Preident Trump You The Shutdown And
Donald Trump Scolded But Police Levels
India And China Washington You Is The
New York Is Next On The
Science And Technology Hope To Cash Interview On
Corruption  The New Teaching
