In [1]:
import pandas as pd
import numpy as np
import string, os
import random

In [2]:
# Read the headlines CSV (path is relative to the notebook's working directory).
df = pd.read_csv("../input/million-headlines/abcnews-date-text.csv")
# Show the first rows as the cell output to eyeball the columns.
df.head()

In [3]:
# Pull the headline column into a plain Python list for the cleaning step.
headlines = df['headline_text'].tolist()
# Nothing here de-duplicates the column, so report the raw headline count
# instead of describing it as a count of unique headlines.
print("We have", len(headlines), "headlines in the dataset.")

The work is done in the following steps:
- step 01: Create the corpus by removing punctuation from the headlines and lower-casing them.<br>
- step 02: Shuffle the dataset and test on the first 20000 lines (the code keeps `corpus[:20000]`); if all goes as planned, split the dataset into chunks and wait.<br>
- step 03: Tokenization: define a function that takes the corpus and returns the token list and the total number of words processed (for generalization).<br>
- step 04: Padding (pre): define a function that takes the token list from step 03 and returns the predictors, the labels and the maximum sentence length (for generalization).<br>
- step 05: Model creation: we will make a simple 3-layer model. The first layer is the embedding layer, the second is the (hidden) LSTM layer, and the last is a softmax activation layer. The loss will be "crossentropy" (categorical, since there are multiple, countable classes) and the optimizer will be "adam" (most recommended).<br>
- step 06: Fit the model and wait; we will use a batch size of 32 and train for 120 epochs.<br>
- step 07: Create a function that takes a seed and returns a sequence of a fixed length using the model.

In [4]:
#Step 01.

def clean_text(txt):
    """Normalise one headline before tokenization.

    Drops every character listed in ``string.punctuation``, lower-cases what
    is left, and discards anything that is not plain ASCII.
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    # Round-trip through UTF-8 bytes: decoding as ASCII with 'ignore'
    # silently throws away every non-ASCII byte.
    utf8_bytes = lowered.encode("utf8")
    return utf8_bytes.decode("ascii", 'ignore')

# Run every raw headline through clean_text, keeping the original order.
corpus = list(map(clean_text, headlines))

In [5]:
#Step 02.

import random
# Take a reproducible random subset of the cleaned headlines. A dedicated,
# seeded generator makes Restart & Run All select the same headlines every
# time, and sample() returns a new list instead of shuffling `corpus` in place.
SHUFFLE_SEED = 42
CORPUS_SAMPLE_SIZE = 20000
corpus = random.Random(SHUFFLE_SEED).sample(
    corpus, min(CORPUS_SAMPLE_SIZE, len(corpus))
)

In [6]:
"""from typing import List

def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
        
corpusx= list(chunks(corpus, 20000))

after defining functions use a for loop.

In tokenizazion for example
inpx_sequences = []
totalx_words = []
    
for x in range(0, len(corpusx)):
    inp_sequences, total_words = get_sequence_of_tokens(corpusx[x])
    inpx_sequences.append(inp_sequences)
    totalx_words.append(total_words)"""

In [7]:
#Step 03.

from keras.preprocessing.text import Tokenizer
# Module-level tokenizer shared by get_sequence_of_tokens (which fits it) and
# generate_text (which reuses the fitted vocabulary).
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer on ``corpus`` and build n-gram prefixes.

    Each headline is converted to its list of word ids; every prefix of that
    list holding at least two ids becomes one training sequence.

    Returns:
        (input_sequences, total_words): the list of prefix sequences and the
        size of the tokenizer's word index plus one.
    """
    tokenizer.fit_on_texts(corpus)

    input_sequences = []
    for headline in corpus:
        word_ids = tokenizer.texts_to_sequences([headline])[0]
        # Prefixes of length 2, 3, ..., len(word_ids).
        input_sequences.extend(
            word_ids[:end] for end in range(2, len(word_ids) + 1)
        )

    total_words = len(tokenizer.word_index) + 1
    return input_sequences, total_words

# Fit the tokenizer on the sampled corpus and collect the n-gram training
# sequences together with the vocabulary-derived word count.
inp_sequences, total_words = get_sequence_of_tokens(corpus)

In [8]:
#Step 04.

from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
    
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pre-pad the sequences and split them into predictors and one-hot labels.

    Args:
        input_sequences: list of token-id sequences, as built by
            get_sequence_of_tokens.
        num_classes: width of the one-hot labels. When omitted, the
            module-level ``total_words`` is used, which is what this function
            always did before the parameter existed.

    Returns:
        (predictors, label, max_sequence_len): every padded row without its
        last id, the one-hot encoding of that last id, and the padded row
        length.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; nothing to pad")
    if num_classes is None:
        # Backward-compatible fallback to the notebook-level vocabulary size.
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    # pad_sequences already returns a NumPy array, so no extra np.array copy.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # The last id of each row is the word to predict; everything before it is
    # the model input.
    predictors, label = padded[:, :-1], padded[:, -1]
    label = to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Pad the n-gram sequences and split them into model inputs, one-hot targets
# and the padded sequence length used later to size the model input.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [9]:
#Step 05.

from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.models import Sequential


def create_model(max_sequence_len, total_words,
                 embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    Args:
        max_sequence_len: length of the padded sequences; the model input is
            one shorter because the last id of each row is used as the label.
        total_words: vocabulary size; used both as the embedding input
            dimension and as the number of softmax outputs.
        embedding_dim: size of each embedding vector (default 10).
        lstm_units: number of units in the LSTM layer (default 100).
        dropout_rate: dropout applied after the LSTM (default 0.1).

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy loss,
        adam optimizer).
    """
    input_len = max_sequence_len - 1
    model = Sequential()

    # Input embedding layer: word ids -> dense vectors.
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Hidden LSTM layer followed by dropout.
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Output layer: one softmax probability per vocabulary entry.
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Build the network for the padded sequence length and vocabulary size
# computed in the earlier cells, then print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

In [10]:
# Train on the padded predictors and one-hot labels with a batch size of 32
# for 120 epochs.
# NOTE(review): verbose=5 is not one of the 0/1/2 levels described in the Keras
# docs — confirm this is the intended logging level.
model.fit(predictors, label, batch_size=32, epochs=120, verbose=5)

In [11]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` one predicted word at a time.

    Args:
        seed_text: starting text; it is tokenized with the module-level
            ``tokenizer``.
        next_words: maximum number of words to append.
        model: model whose ``predict`` returns one score per vocabulary id.
        max_sequence_len: padded sequence length used in training; the model
            input is one shorter.

    Returns:
        The seed followed by the generated words, title-cased.
    """
    # Invert the vocabulary once instead of scanning word_index linearly for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len-1,
                                   padding='pre')
        probabilities = model.predict(token_list, verbose=0)
        # A single row goes in, so take element 0 to get a plain int rather
        # than comparing against a one-element array.
        predicted = int(np.argmax(probabilities, axis=1)[0])

        output_word = index_to_word.get(predicted)
        if output_word is None:
            # The predicted id has no word in the vocabulary (e.g. id 0).
            # Appending nothing would leave the seed's tokens unchanged, so
            # stop here instead of padding the result with blanks.
            break
        seed_text += " " + output_word
    return seed_text.title()

In [1]:
# Demo: extend the seed "police" by up to 7 predicted words. Needs the trained
# `model` and `max_sequence_len` from the cells above.
print (generate_text("police", 7, model, max_sequence_len))