In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
import string


from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

2023-05-07 22:58:11.709270: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# df = pd.read_csv('data/ArticlesApril2017.csv')
# len(df.headline.values)


curr_directory = 'data/'
all_headlines = []
# os.listdir returns entries in arbitrary order; sorting makes the file that
# is picked below the same on every run and on every machine.
for filename in sorted(os.listdir(curr_directory)):
    if 'Article' in filename:
        df = pd.read_csv(os.path.join(curr_directory, filename))
        all_headlines.extend(df.headline.values)
        # Stop after the first matching file: only one file's headlines are used.
        break

# Keep real headlines only: drop the 'Unknown' placeholder and any non-string
# entries (e.g. NaN from empty CSV cells), which would break clean_text later.
all_headlines = [
    headline
    for headline in all_headlines
    if isinstance(headline, str) and headline != 'Unknown'
]
len(all_headlines)

829

In [3]:
# Preview the first ten headlines that survived the 'Unknown' filter.
all_headlines[:10]

['N.F.L. vs. Politics Has Been Battle All Season Long',
 'Voice. Vice. Veracity.',
 'A Stand-Up’s Downward Slide',
 'New York Today: A Groundhog Has Her Day',
 'A Swimmer’s Communion With the Ocean',
 'Trail Activity',
 'Super Bowl',
 'Trump’s Mexican Shakedown',
 'Pence’s Presidential Pet',
 'Fruit of a Poison Tree']

# Preprocessing

In [4]:
def clean_text(txt):
    """Normalise a headline for tokenisation.

    Steps, in order:
      1. delete every character listed in ``string.punctuation``;
      2. lower-case the remainder;
      3. drop every non-ASCII character (e.g. curly apostrophes).

    Returns the cleaned string.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    lowered = txt.translate(punctuation_table).lower()
    # Decoding the UTF-8 bytes as ASCII with errors ignored discards all
    # bytes of any multi-byte (non-ASCII) character.
    ascii_only = lowered.encode('utf-8').decode('ascii', 'ignore')
    return ascii_only

# Run every headline through clean_text, then preview the first ten results.
corpus = list(map(clean_text, all_headlines))
corpus[:10]

['nfl vs politics has been battle all season long',
 'voice vice veracity',
 'a standups downward slide',
 'new york today a groundhog has her day',
 'a swimmers communion with the ocean',
 'trail activity',
 'super bowl',
 'trumps mexican shakedown',
 'pences presidential pet',
 'fruit of a poison tree']

# N-Gram Token Sequences, Padding, and Target Variables

In [5]:
# Module-level tokenizer: it is fitted inside token_generator and read again
# by text_generation, so both must see this same instance.
tk = Tokenizer()

def token_generator(corpus):
    """Fit the module-level tokenizer ``tk`` and build n-gram training sequences.

    Every text in ``corpus`` is converted to its list of integer tokens, and
    each prefix of that list containing at least two tokens becomes one
    training sequence (the last token of a prefix is later used as the label).

    Returns:
        (input_sequence, vocab_size): the list of prefix sequences and
        ``len(tk.word_index) + 1``.
        NOTE(review): the +1 presumably leaves room for index 0, which the
        tokenizer does not assign to any word - confirm against Keras docs.
    """
    tk.fit_on_texts(corpus)

    encoded_texts = tk.texts_to_sequences(corpus)
    input_sequence = [
        tokens[:end]
        for tokens in encoded_texts
        for end in range(2, len(tokens) + 1)
    ]

    vocab_size = len(tk.word_index) + 1
    return input_sequence, vocab_size
    

# Build the n-gram sequences from the cleaned corpus, report the vocabulary
# size, and preview the first ten sequences.
input_sequence,vocab_size = token_generator(corpus)
print(vocab_size)
input_sequence[0:10]

2288


[[660, 117],
 [660, 117, 72],
 [660, 117, 72, 73],
 [660, 117, 72, 73, 661],
 [660, 117, 72, 73, 661, 662],
 [660, 117, 72, 73, 661, 662, 63],
 [660, 117, 72, 73, 661, 662, 63, 29],
 [660, 117, 72, 73, 661, 662, 63, 29, 210],
 [211, 663],
 [211, 663, 664]]

In [8]:
def padded_data_generator(input_sequence):
    """Pad the n-gram sequences to one length and split inputs from labels.

    All sequences are padded with ``pad_sequences`` to the length of the
    longest one. In the padded array, every column except the last forms the
    model input and the last column is the integer label (kept as class ids,
    not one-hot encoded).

    Returns:
        (data, target, max_len): the input columns, the label column, and the
        padded sequence length (inputs therefore have ``max_len - 1`` columns).
    """
    max_len = max(map(len, input_sequence))
    padded = pad_sequences(input_sequence, maxlen=max_len)

    features = padded[:, :-1]
    labels = padded[:, -1]
    return features, labels, max_len

# seq_length is the padded length including the label token, so the model
# inputs in `data` have seq_length - 1 columns.
data,target,seq_length = padded_data_generator(input_sequence)

In [9]:
# Sanity check: target is one-dimensional, one integer label per n-gram sequence.
target.shape

(4544,)

# LSTM Network

In [10]:
# Next-word classifier: embedding -> single LSTM -> softmax over the vocabulary.
model = Sequential()
# 10-dimensional embeddings; inputs are the padded sequences without the label.
model.add(Embedding(vocab_size, 10, input_length=seq_length - 1))
model.add(LSTM(units=100, dropout=0.1))
# One output unit per vocabulary index.
model.add(Dense(vocab_size, activation='softmax'))

# Sparse loss because the targets are integer class ids, not one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 10)            22880     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dense (Dense)               (None, 2288)              231088    
                                                                 
Total params: 298368 (1.14 MB)
Trainable params: 298368 (1.14 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# Train on all n-gram samples for 100 epochs with batches of 32; no validation
# split is used. The value returned by fit is kept in `hist`.
hist = model.fit(data,target,epochs = 100, batch_size = 32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [32]:
def text_generation(seed_text,number_of_words_req,seq_length,model):
    """Extend ``seed_text`` by repeatedly appending the model's most likely next word.

    Args:
        seed_text: Starting text; tokenised with the module-level tokenizer ``tk``.
        number_of_words_req: Maximum number of words to append.
        seq_length: Padded sequence length used for training; the text is
            padded/truncated to ``seq_length - 1`` tokens before prediction.
        model: Object whose ``predict`` returns one score per vocabulary index
            for a batch containing a single padded sequence.

    Returns:
        The seed text followed by the generated words, title-cased. Fewer than
        ``number_of_words_req`` words are appended if the model predicts an
        index that has no vocabulary entry.
    """
    # Invert the vocabulary once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tk.word_index.items()}

    for _ in range(number_of_words_req):
        token_list = tk.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=seq_length - 1)
        y_pred = int(np.argmax(model.predict(token_list, verbose=0)))

        next_word = index_to_word.get(y_pred)
        if next_word is None:
            # The predicted index (e.g. the padding index 0) maps to no word.
            # Previously the stale loop variable - the last vocabulary word -
            # was appended here. The input would not change, so the greedy
            # prediction would repeat; stop generating instead.
            break

        seed_text += " " + next_word
    return seed_text.title()
    
# Sample generations: each pair is (seed text, number of words to append).
for seed_phrase, word_count in [
    ("united states", 5),
    ("india", 7),
    ("beautiful", 4),
    ("dog", 5),
    ("hello", 5),
]:
    print(text_generation(seed_phrase, word_count, seq_length, model))


United States Regulation Limit The Light Coming
India In The Bookshelf Winemaker Put Make In
Beautiful Mailbag Why Treat A
Dog I Is The Jews Board
Hello Question In A Dusty Bottle
