In [1]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# Set seeds for reproducibility (TensorFlow graph-level seed and NumPy global seed).
# NOTE(review): `set_random_seed` is the TensorFlow 1.x top-level name; TF 2.x
# exposes the equivalent as `tf.random.set_seed` — confirm the target TF version.
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

Using TensorFlow backend.


# Functions

In [2]:
def clean_text(txt):
    """Normalize a headline for tokenization.

    Drops every character found in ``string.punctuation``, lower-cases the
    result, and then discards any remaining non-ASCII characters (for
    example curly apostrophes).

    Args:
        txt: The raw text to clean.

    Returns:
        The cleaned, lower-case, ASCII-only string.
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    # Round-trip through bytes so that non-ASCII characters are silently dropped.
    ascii_only = lowered.encode("utf8").decode("ascii", 'ignore')
    return ascii_only

In [3]:
def get_sequence_of_tokens(corpus):
    """Fit the notebook-level ``tokenizer`` on ``corpus`` and build n-gram prefixes.

    Each line is converted to its token-id sequence, and every prefix of that
    sequence with at least two tokens is emitted as a separate training
    sequence.

    Args:
        corpus: Iterable of cleaned text lines.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences``
        is a list of token-id lists and ``total_words`` is one more than the
        number of entries in ``tokenizer.word_index``.

    Note:
        Relies on (and mutates) the global ``tokenizer`` object, which must
        be created before this function is called.
    """
    # Learn the vocabulary from the whole corpus first.
    tokenizer.fit_on_texts(corpus)
    total_words = 1 + len(tokenizer.word_index)

    input_sequences = []
    for line in corpus:
        encoded = tokenizer.texts_to_sequences([line])[0]
        # Prefixes of length 2, 3, ..., len(encoded).
        input_sequences.extend(
            encoded[:end] for end in range(2, len(encoded) + 1)
        )
    return input_sequences, total_words

In [4]:
def generate_padded_sequences(input_sequences):
    """Left-pad the n-gram sequences and split them into predictors and labels.

    All sequences are padded at the front to the length of the longest one.
    The last token of each padded row becomes the (one-hot encoded) label and
    the preceding tokens become the predictors.

    Args:
        input_sequences: List of token-id lists.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``.

    Note:
        Reads the global ``total_words`` to size the one-hot label encoding.
    """
    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    # Final column is the word to predict; everything before it is the context.
    predictors = padded[:, :-1]
    label = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return predictors, label, max_sequence_len

In [5]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` with ``next_words`` predicted words.

    On every step the current text is tokenized with the global ``tokenizer``,
    left-padded/truncated to ``max_sequence_len - 1`` tokens, and fed to
    ``model``; the highest-probability class is mapped back to a word and
    appended to the text.

    Args:
        seed_text: Starting text for generation.
        next_words: Number of words to append.
        model: Trained model whose ``predict`` returns one probability per
            vocabulary index for each input row.
        max_sequence_len: Sequence length used when building the training
            data; the model input length is this value minus one.

    Returns:
        The extended text, title-cased.

    Note:
        Relies on the global ``tokenizer`` fitted on the training corpus.
    """
    # Invert the vocabulary once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # `Sequential.predict_classes` is deprecated/removed in newer Keras;
        # argmax over the predicted probabilities is its documented equivalent
        # for a multi-class softmax output.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # An index with no vocabulary entry yields an empty word, matching the
        # behaviour of the original lookup loop.
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [6]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    The network is an Embedding layer (10-dimensional), followed by a single
    LSTM layer with 100 units and a softmax Dense layer with one output per
    vocabulary index.

    Args:
        max_sequence_len: Length of the padded training sequences; the model
            input length is this value minus one (the last token is the label).
        total_words: Size of the vocabulary / number of output classes.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer).
    """
    input_len = max_sequence_len - 1

    layers = (
        # Input embedding layer
        Embedding(total_words, 10, input_length=input_len),
        # Hidden layer: LSTM
        LSTM(100),
        # Output layer: distribution over the vocabulary
        Dense(total_words, activation='softmax'),
    )

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Data

In [7]:
curr_dir = '../input/'
all_headlines = []
# Load headlines from the first listed file whose name contains 'Articles';
# the loop stops after that single file.
for filename in os.listdir(curr_dir):
    if 'Articles' not in filename:
        continue
    article_df = pd.read_csv(curr_dir + filename)
    all_headlines.extend(list(article_df.headline.values))
    break

all_headlines[:10]

['The Opioid Crisis Foretold',
 'The Business Deals That Could Imperil Trump',
 'Adapting to American Decline',
 'The Republicans’ Big Senate Mess',
 'States Are Doing What Scott Pruitt Won’t',
 'Fake Pearls, Real Heart',
 'Fear Beyond Starbucks',
 'Variety: Puns and Anagrams',
 'E.P.A. Chief’s Ethics Woes Have Echoes in His Past',
 'Where Facebook Rumors Fuel Thirst for Revenge']


In the dataset preparation step, we first clean the text: punctuation is removed and all words are converted to lower case. 

In [8]:


# Apply the cleaning step to every headline.
corpus = list(map(clean_text, all_headlines))
corpus[:10]

['the opioid crisis foretold',
 'the business deals that could imperil trump',
 'adapting to american decline',
 'the republicans big senate mess',
 'states are doing what scott pruitt wont',
 'fake pearls real heart',
 'fear beyond starbucks',
 'variety puns and anagrams',
 'epa chiefs ethics woes have echoes in his past',
 'where facebook rumors fuel thirst for revenge']

### Generating Sequence of N-gram Tokens

Language modelling requires sequential input data: given a sequence (of words/tokens), the aim is to predict the next word/token.  

The next step is tokenization. Tokenization is the process of extracting tokens (terms / words) from a corpus. The Python library Keras has a built-in tokenizer which can be used to obtain the tokens and their indices in the corpus. After this step, every text document in the dataset is converted into a sequence of tokens. 


In [9]:
# Notebook-level tokenizer: get_sequence_of_tokens() and generate_text() read
# this global by name, so it must be created before either one is called.
tokenizer = Tokenizer()


# `total_words` is also read as a global by generate_padded_sequences().
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[1, 708],
 [1, 708, 251],
 [1, 708, 251, 369],
 [1, 370],
 [1, 370, 709],
 [1, 370, 709, 29],
 [1, 370, 709, 29, 136],
 [1, 370, 709, 29, 136, 710],
 [1, 370, 709, 29, 136, 710, 10],
 [711, 5]]

In the above output, [1, 708], [1, 708, 251], [1, 708, 251, 369] and so on represent the n-gram phrases generated from the input data, where every integer corresponds to the index of a particular word in the complete vocabulary of words present in the text. As another example, a headline is expanded as follows:

**Headline:** i stand  with the shedevils  
**Ngrams:** | **Sequence of Tokens**

<table>
<tr><td>Ngram </td><td> Sequence of Tokens</td></tr>
<tr> <td>i stand </td><td> [30, 507] </td></tr>
<tr> <td>i stand with </td><td> [30, 507, 11] </td></tr>
<tr> <td>i stand with the </td><td> [30, 507, 11, 1] </td></tr>
<tr> <td>i stand with the shedevils </td><td> [30, 507, 11, 1, 975] </td></tr>
</table>



### Padding the Sequences and obtain Variables : Predictors and Target

Now that we have generated a dataset containing sequences of tokens, note that different sequences may have different lengths. Before training the model, we need to pad the sequences so that their lengths are equal. We can use the `pad_sequences` function of Keras for this purpose. To feed this data into a learning model, we need to create predictors and a label. We use each n-gram prefix as the predictor and the word that follows it as the label. For example:


Headline:  they are learning data science

<table>
<tr><td>PREDICTORS </td> <td>           LABEL </td></tr>
<tr><td>they                   </td> <td>  are</td></tr>
<tr><td>they are               </td> <td>  learning</td></tr>
<tr><td>they are learning      </td> <td>  data</td></tr>
<tr><td>they are learning data </td> <td>  science</td></tr>
</table>

In [10]:


# Pad the n-gram sequences and split them into model inputs and one-hot labels.
# generate_padded_sequences() reads the global `total_words`, so the
# tokenization cell must have run first.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)



# The model

In [11]:


# Build the compiled network sized to the padded sequence length and vocabulary,
# then print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 16, 10)            24450     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_1 (Dense)              (None, 2445)              246945    
Total params: 315,795
Trainable params: 315,795
Non-trainable params: 0
_________________________________________________________________


Let's train our model now.

In [12]:
# Train for a fixed 100 epochs on the full data set. No validation data or
# callbacks are passed (EarlyStopping is imported above but not used here).
model.fit(predictors, label, epochs=100, verbose=2)

Epoch 1/100
 - 8s - loss: 7.4037
Epoch 2/100
 - 5s - loss: 6.8892
Epoch 3/100
 - 4s - loss: 6.7654
Epoch 4/100
 - 4s - loss: 6.6761
Epoch 5/100
 - 4s - loss: 6.5786
Epoch 6/100
 - 4s - loss: 6.4560
Epoch 7/100
 - 4s - loss: 6.3166
Epoch 8/100
 - 4s - loss: 6.1771
Epoch 9/100
 - 4s - loss: 6.0437
Epoch 10/100
 - 5s - loss: 5.9210
Epoch 11/100
 - 4s - loss: 5.7991
Epoch 12/100
 - 4s - loss: 5.6767
Epoch 13/100
 - 5s - loss: 5.5574
Epoch 14/100
 - 4s - loss: 5.4394
Epoch 15/100
 - 4s - loss: 5.3243
Epoch 16/100
 - 4s - loss: 5.2129
Epoch 17/100
 - 4s - loss: 5.1028
Epoch 18/100
 - 5s - loss: 4.9948
Epoch 19/100
 - 4s - loss: 4.8900
Epoch 20/100
 - 4s - loss: 4.7864
Epoch 21/100
 - 4s - loss: 4.6860
Epoch 22/100
 - 4s - loss: 4.5860
Epoch 23/100
 - 4s - loss: 4.4915
Epoch 24/100
 - 4s - loss: 4.3967
Epoch 25/100
 - 4s - loss: 4.3040
Epoch 26/100
 - 5s - loss: 4.2127
Epoch 27/100
 - 4s - loss: 4.1254
Epoch 28/100
 - 5s - loss: 4.0391
Epoch 29/100
 - 4s - loss: 3.9515
Epoch 30/100
 - 4s - lo

<keras.callbacks.History at 0x7f020471e390>



Great, our model is now trained. Next, let's use the `generate_text` function defined earlier to predict the next words based on the input words (the seed text). It first tokenizes the seed text, pads the sequence, and passes it to the trained model to get the predicted word. The predicted words are appended one after another to form the generated sequence.


In [13]:
# Generate progressively longer continuations of the same seed text.
for n_words in (5, 10, 15):
    print (generate_text("united states", n_words, model, max_sequence_len))

United States Shouldnt Sit Still An Atlantic
United States Shouldnt Sit Still An Atlantic Imperative Agencies Out To 2426
United States Shouldnt Sit Still An Atlantic Imperative Agencies Out To 2426 And 9 Flights Of Stairs


In [14]:
# Generate progressively longer continuations of the same seed text.
for n_words in (3, 4, 5, 8):
    print (generate_text("president trump", n_words, model, max_sequence_len))


President Trump Vs Congress Bird
President Trump Vs Congress Bird Moving
President Trump Vs Congress Bird Moving One
President Trump Vs Congress Bird Moving One About A Fairy


In [15]:
# Generate progressively longer continuations of the same seed text.
for n_words in (3, 4, 5, 8):
    print (generate_text("joe biden", n_words, model, max_sequence_len))

Joe Biden Infuses The Constitution
Joe Biden Infuses The Constitution Invaded
Joe Biden Infuses The Constitution Invaded Canada
Joe Biden Infuses The Constitution Invaded Canada Unique Memorial Award


In [16]:
# Generate progressively longer continuations of the same seed text.
for n_words in (3, 4, 5, 8):
    print (generate_text("india and china", n_words, model, max_sequence_len))

India And China Deal And The
India And China Deal And The Young
India And China Deal And The Young Think
India And China Deal And The Young Think Again To It


In [17]:
# Generate progressively longer continuations of the same seed text.
for n_words in (3, 4, 5, 8):
    print (generate_text("european union", n_words, model, max_sequence_len))

European Union Infuses The Constitution
European Union Infuses The Constitution Invaded
European Union Infuses The Constitution Invaded Canada
European Union Infuses The Constitution Invaded Canada Unique Memorial Award
