In [1]:
import tensorflow as tf
from tensorflow import keras 

# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

import numpy as np
# Seed NumPy, TensorFlow and Python's `random` module so that repeated runs
# start from the same random-number-generator state.
np.random.seed(7)
tf.random.set_seed(7) # analogue of set_random_seed(seed_value) in TF 1.x
import random
random.seed(7)
# NOTE(review): the tensor returned here is discarded and nothing below reads
# it; this looks like a leftover experiment -- confirm before removing.
tf.random.uniform([1], seed=1)

import pandas as pd
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# Functions

In [2]:
def clean_text(txt):
    """Normalize a headline before tokenization.

    Every character listed in ``string.punctuation`` is dropped, the rest is
    lower-cased, and any remaining non-ASCII character (for example a curly
    apostrophe) is discarded.

    Args:
        txt: The raw text to clean.

    Returns:
        The cleaned, lower-case, ASCII-only string.
    """
    punctuation = string.punctuation
    kept_chars = [ch for ch in txt if ch not in punctuation]
    lowered = "".join(kept_chars).lower()
    # Encoding to UTF-8 and decoding as ASCII with 'ignore' silently removes
    # every character that has no ASCII representation.
    utf8_bytes = lowered.encode("utf8")
    return utf8_bytes.decode("ascii", 'ignore')

In [3]:
def get_sequence_of_tokens(corpus):
    """Fit the notebook-level ``tokenizer`` and expand each line into n-gram prefixes.

    For a line encoded as ``[t1, t2, ..., tk]`` the prefixes ``[t1, t2]``,
    ``[t1, t2, t3]``, ..., ``[t1, ..., tk]`` are produced; lines with fewer
    than two tokens contribute nothing.

    Args:
        corpus: Iterable of cleaned text lines.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences``
        is the list of prefix token lists and ``total_words`` is
        ``len(tokenizer.word_index) + 1``.
    """
    # `tokenizer` is a module-level object that must exist before this
    # function is called; fitting mutates it in place.
    tokenizer.fit_on_texts(corpus)
    # +1 because Keras word indices start at 1, leaving index 0 for padding.
    vocab_size = len(tokenizer.word_index) + 1

    ngram_sequences = []
    for text in corpus:
        encoded = tokenizer.texts_to_sequences([text])[0]
        # Every prefix of length >= 2 becomes one training example.
        ngram_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return ngram_sequences, vocab_size

In [4]:
def generate_padded_sequences(input_sequences):
    """Pad the n-gram sequences and split them into predictors and one-hot labels.

    Args:
        input_sequences: List of token-id lists of varying length.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: every padded
        sequence without its last token, the one-hot encoding of that last
        token, and the length of the longest input sequence.

    Note:
        Reads the module-level ``total_words`` for the one-hot width, so that
        global must be set before this function is called.
    """
    longest = max(len(seq) for seq in input_sequences)
    # 'pre' padding left-fills shorter sequences so the final column is
    # always the real last token.
    padded = np.array(pad_sequences(input_sequences, maxlen=longest, padding='pre'))

    # The last column is the word to predict; everything before it is context.
    context = padded[:, :-1]
    next_token = padded[:, -1]
    one_hot_next = ku.to_categorical(next_token, num_classes=total_words)
    return context, one_hot_next, longest

In [5]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    On each step the current text is tokenized with the module-level
    ``tokenizer``, left-padded to ``max_sequence_len - 1`` tokens, fed to
    ``model``, and the highest-probability word is appended.

    Args:
        seed_text: Starting text.
        next_words: Number of words to append.
        model: Trained model whose ``predict`` returns one row of class
            probabilities per input sequence.
        max_sequence_len: Padded sequence length used during training
            (context plus label), so the model input is one token shorter.

    Returns:
        The extended text, title-cased.
    """
    # Build the index -> word lookup once instead of scanning word_index
    # linearly for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # `Sequential.predict_classes` was removed in TensorFlow 2.6; taking
        # the argmax of `predict` is the documented replacement and also
        # works on older versions.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Index 0 (padding) has no word; fall back to an empty string as the
        # original lookup loop did.
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [6]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    The network is an embedding layer (10 dimensions), a single LSTM layer
    (100 units) and a softmax layer over the vocabulary.

    Args:
        max_sequence_len: Length of the padded training sequences, including
            the label token.
        total_words: Vocabulary size; used as the embedding input dimension
            and as the number of output classes.

    Returns:
        The compiled ``Sequential`` model.
    """
    # The last token of each padded sequence is the label, so the model
    # only ever sees max_sequence_len - 1 tokens of context.
    context_len = max_sequence_len - 1

    network = Sequential([
        # Input embedding layer
        Embedding(total_words, 10, input_length=context_len),
        # Hidden layer: LSTM
        LSTM(100),
        # Output layer: probability distribution over the vocabulary
        Dense(total_words, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

# Data

In [7]:
curr_dir = '../input/nyt-comments/'
all_headlines = []
# Only the first "Articles" file yielded by os.listdir is loaded (note the
# break). os.listdir returns entries in arbitrary order, so which file that
# is can differ between environments.
for filename in os.listdir(curr_dir):
    if 'Articles' not in filename:
        continue
    article_df = pd.read_csv(curr_dir + filename)
    all_headlines += list(article_df.headline.values)
    break

all_headlines[:10]

['N.F.L. vs. Politics Has Been Battle All Season Long',
 'Voice. Vice. Veracity.',
 'A Stand-Up’s Downward Slide',
 'New York Today: A Groundhog Has Her Day',
 'A Swimmer’s Communion With the Ocean',
 'Trail Activity',
 'Super Bowl',
 'Trump’s Mexican Shakedown',
 'Pence’s Presidential Pet',
 'Fruit of a Poison Tree']

In [8]:
# Show every file in the dataset directory, in the order os.listdir returns them.
for filename in os.listdir(curr_dir):
    print(filename)

CommentsFeb2018.csv
ArticlesFeb2017.csv
CommentsApril2018.csv
ArticlesJan2017.csv
ArticlesMay2017.csv
CommentsJan2017.csv
CommentsMarch2017.csv
CommentsMay2017.csv
CommentsMarch2018.csv
CommentsApril2017.csv
ArticlesMarch2017.csv
ArticlesApril2017.csv
CommentsFeb2017.csv
ArticlesJan2018.csv
ArticlesFeb2018.csv
ArticlesMarch2018.csv
CommentsJan2018.csv
ArticlesApril2018.csv



In the dataset preparation step, we first clean the text: we remove punctuation and convert all words to lower case.

In [9]:


# Apply the cleaning step to every headline, then preview the first ten.
corpus = list(map(clean_text, all_headlines))
corpus[:10]

['nfl vs politics has been battle all season long',
 'voice vice veracity',
 'a standups downward slide',
 'new york today a groundhog has her day',
 'a swimmers communion with the ocean',
 'trail activity',
 'super bowl',
 'trumps mexican shakedown',
 'pences presidential pet',
 'fruit of a poison tree']

### Generating Sequence of N-gram Tokens

Language modelling requires sequential input data: given a sequence of words/tokens, the aim is to predict the next word/token.  

The next step is tokenization. Tokenization is the process of extracting tokens (terms / words) from a corpus. The Python library Keras has a built-in tokenizer which can be used to obtain the tokens and their indices in the corpus. After this step, every text document in the dataset is converted into a sequence of tokens. 


In [10]:
# Module-level tokenizer: get_sequence_of_tokens() and generate_text() both
# read this global, so it must be created before either is called.
tokenizer = Tokenizer()


# Fit the tokenizer on the corpus and expand every headline into its n-gram
# prefixes; total_words is also read as a global by generate_padded_sequences().
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[661, 118],
 [661, 118, 73],
 [661, 118, 73, 74],
 [661, 118, 73, 74, 662],
 [661, 118, 73, 74, 662, 663],
 [661, 118, 73, 74, 662, 663, 64],
 [661, 118, 73, 74, 662, 663, 64, 30],
 [661, 118, 73, 74, 662, 663, 64, 30, 211],
 [212, 664],
 [212, 664, 665]]

In the above output, [661, 118], [661, 118, 73], [661, 118, 73, 74] and so on represent the n-gram phrases generated from the input data, where every integer corresponds to the index of a particular word in the complete vocabulary of words present in the text. For example (the indices below, such as [30, 507], are illustrative and come from a different headline):

**Headline:** i stand  with the shedevils  
**Ngrams:** | **Sequence of Tokens**

<table>
<tr><td>Ngram </td><td> Sequence of Tokens</td></tr>
<tr> <td>i stand </td><td> [30, 507] </td></tr>
<tr> <td>i stand with </td><td> [30, 507, 11] </td></tr>
<tr> <td>i stand with the </td><td> [30, 507, 11, 1] </td></tr>
<tr> <td>i stand with the shedevils </td><td> [30, 507, 11, 1, 975] </td></tr>
</table>



### Padding the Sequences and obtain Variables : Predictors and Target

Now that we have generated a dataset containing sequences of tokens, note that different sequences may have different lengths. Before we start training the model, we need to pad the sequences so that their lengths are equal. We can use the `pad_sequences` function of Keras for this purpose. To feed this data into a learning model, we need to create predictors and labels. We will use the N-gram sequences as predictors and the next word of each N-gram as the label. For example:


Headline:  they are learning data science

<table>
<tr><td>PREDICTORS </td> <td>           LABEL </td></tr>
<tr><td>they                   </td> <td>  are</td></tr>
<tr><td>they are               </td> <td>  learning</td></tr>
<tr><td>they are learning      </td> <td>  data</td></tr>
<tr><td>they are learning data </td> <td>  science</td></tr>
</table>

In [11]:


# Pad the n-gram sequences and split them into predictors and one-hot labels.
# generate_padded_sequences() reads the global total_words, so the
# tokenization cell must have been run first.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)



# The model

In [12]:


# Build the network sized to the padded sequence length and the vocabulary,
# then print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 16, 10)            22890     
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dense (Dense)                (None, 2289)              231189    
Total params: 298,479
Trainable params: 298,479
Non-trainable params: 0
_________________________________________________________________


Let's train our model now.

In [13]:
# Train on all n-gram examples for 100 epochs. No validation data or callbacks
# are passed, so only the training loss is reported (one line per epoch).
model.fit(predictors, label, epochs=100, verbose=2)

Epoch 1/100
142/142 - 4s - loss: 7.3548
Epoch 2/100
142/142 - 0s - loss: 6.8464
Epoch 3/100
142/142 - 0s - loss: 6.7234
Epoch 4/100
142/142 - 1s - loss: 6.6350
Epoch 5/100
142/142 - 0s - loss: 6.5433
Epoch 6/100
142/142 - 1s - loss: 6.4345
Epoch 7/100
142/142 - 1s - loss: 6.3215
Epoch 8/100
142/142 - 1s - loss: 6.2053
Epoch 9/100
142/142 - 0s - loss: 6.0881
Epoch 10/100
142/142 - 0s - loss: 5.9710
Epoch 11/100
142/142 - 0s - loss: 5.8578
Epoch 12/100
142/142 - 1s - loss: 5.7509
Epoch 13/100
142/142 - 1s - loss: 5.6478
Epoch 14/100
142/142 - 1s - loss: 5.5485
Epoch 15/100
142/142 - 0s - loss: 5.4517
Epoch 16/100
142/142 - 0s - loss: 5.3560
Epoch 17/100
142/142 - 0s - loss: 5.2652
Epoch 18/100
142/142 - 0s - loss: 5.1751
Epoch 19/100
142/142 - 0s - loss: 5.0884
Epoch 20/100
142/142 - 1s - loss: 5.0050
Epoch 21/100
142/142 - 0s - loss: 4.9200
Epoch 22/100
142/142 - 0s - loss: 4.8389
Epoch 23/100
142/142 - 0s - loss: 4.7589
Epoch 24/100
142/142 - 0s - loss: 4.6804
Epoch 25/100
142/142 - 0s

<tensorflow.python.keras.callbacks.History at 0x7ff3b27fc690>



Great, our model is now trained on our data. Next, let's use the `generate_text` function defined earlier to predict the next words based on the input words (or seed text). It first tokenizes the seed text, pads the sequence and passes it into the trained model to get the predicted word. Multiple predicted words can be appended together to get the predicted sequence.


In [14]:
# Generate 5, 10 and 15 words from the same seed; each call starts again from
# the seed text.
print (generate_text("united states", 5, model, max_sequence_len))
print (generate_text("united states", 10, model, max_sequence_len))
print (generate_text("united states", 15, model, max_sequence_len))

United States Test For Sports Champs Would
United States Test For Sports Champs Would You Meet With Climate Change
United States Test For Sports Champs Would You Meet With Climate Change Bold Legislative Todo List On


In [15]:
# Same seed, increasing number of generated words (3, 4, 5, 8).
print (generate_text("president trump", 3, model, max_sequence_len))
print (generate_text("president trump", 4, model, max_sequence_len))
print (generate_text("president trump", 5, model, max_sequence_len))
print (generate_text("president trump", 8, model, max_sequence_len))


President Trump Sales Fall Again
President Trump Sales Fall Again In
President Trump Sales Fall Again In Mexicos
President Trump Sales Fall Again In Mexicos Second Year Of


In [16]:
# Same seed, increasing number of generated words (3, 4, 5, 8).
print (generate_text("joe biden", 3, model, max_sequence_len))
print (generate_text("joe biden", 4, model, max_sequence_len))
print (generate_text("joe biden", 5, model, max_sequence_len))
print (generate_text("joe biden", 8, model, max_sequence_len))

Joe Biden I Too Took
Joe Biden I Too Took An
Joe Biden I Too Took An Oath
Joe Biden I Too Took An Oath And Anthony Care


In [17]:
# Same seed, increasing number of generated words (3, 4, 5, 8).
print (generate_text("india and china", 3, model, max_sequence_len))
print (generate_text("india and china", 4, model, max_sequence_len))
print (generate_text("india and china", 5, model, max_sequence_len))
print (generate_text("india and china", 8, model, max_sequence_len))

India And China A Appetite For
India And China A Appetite For The
India And China A Appetite For The Ages
India And China A Appetite For The Ages To Win A


In [18]:
# Same seed, increasing number of generated words (3, 4, 5, 8).
print (generate_text("european union", 3, model, max_sequence_len))
print (generate_text("european union", 4, model, max_sequence_len))
print (generate_text("european union", 5, model, max_sequence_len))
print (generate_text("european union", 8, model, max_sequence_len))

European Union I Too Took
European Union I Too Took An
European Union I Too Took An Oath
European Union I Too Took An Oath And Anthony Care
