In [1]:
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing import text, sequence
import json
import pandas as pd
import nltk
import numpy as np

In [None]:
df = pd.read_json("D:/Intern/DataSets/News_Category_Dataset_v3.json", lines=True)

In [3]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
df.shape

(209527, 6)

In [5]:
data = list(df['headline'].values)

In [6]:
len(data)

209527

In [7]:
sample_data = data[:1000]

In [8]:
import re
from nltk.corpus import stopwords
lemmatizer = nltk.stem.WordNetLemmatizer()
corpus = []

for i in range(len(sample_data)):
    review = re.sub('[^a-zA-Z]', ' ', data[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)    

In [9]:
tokenizer = text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(sample_data)

total_words = len(tokenizer.word_index) + 1
print("voacb size", total_words)
print("word index", tokenizer.word_index)

voacb size 4469
word index {'to': 1, 'in': 2, 'the': 3, 'of': 4, 'for': 5, 'on': 6, 'at': 7, 'a': 8, 'after': 9, 'and': 10, 'with': 11, 'says': 12, 'is': 13, 'trump': 14, 'from': 15, 'as': 16, 'new': 17, 'this': 18, 'ukraine': 19, 'by': 20, 'biden': 21, 'russian': 22, 'over': 23, 'about': 24, '6': 25, 'u': 26, 's': 27, 'abortion': 28, 'how': 29, 'out': 30, 'court': 31, 'it': 32, 'ex': 33, 'was': 34, 'he': 35, 'his': 36, 'up': 37, 'shooting': 38, 'will': 39, 'covid': 40, 'her': 41, 'man': 42, 'jan': 43, 'be': 44, '4': 45, 'who': 46, 'police': 47, 'week': 48, 'house': 49, 'school': 50, 'gop': 51, 'has': 52, 'that': 53, 'twitter': 54, 'tweets': 55, 'supreme': 56, 'funniest': 57, '3': 58, 'are': 59, '1': 60, 'rep': 61, 'dead': 62, 'into': 63, 'time': 64, 'against': 65, 'report': 66, 'an': 67, 'fire': 68, 'russia': 69, 'if': 70, 'first': 71, 'more': 72, 'white': 73, 'have': 74, "'": 75, 'killed': 76, 'star': 77, 'parents': 78, 'day': 79, 'still': 80, 'rights': 81, 'georgia': 82, 'no': 83, '

In [10]:
# sequence
input_sequence = []

for line in sample_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequence.append(n_gram_sequence)

In [11]:
input_sequence[:10]

[[23, 45],
 [23, 45, 85],
 [23, 45, 85, 241],
 [23, 45, 85, 241, 382],
 [23, 45, 85, 241, 382, 37],
 [23, 45, 85, 241, 382, 37, 1540],
 [23, 45, 85, 241, 382, 37, 1540, 5],
 [23, 45, 85, 241, 382, 37, 1540, 5, 1541],
 [23, 45, 85, 241, 382, 37, 1540, 5, 1541, 556],
 [23, 45, 85, 241, 382, 37, 1540, 5, 1541, 556, 40]]

In [12]:
max_len = max([len(x) for x in input_sequence])
max_len

19

In [13]:
# pad the sequences
input_sequences = sequence.pad_sequences(input_sequence, maxlen=max_len, padding='pre')

In [14]:
# split into input (X) and output (y)
X = input_sequences[:,:-1]
y = input_sequences[:,-1]

In [15]:
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [16]:
print("input shape", X.shape)
print("output shape", y.shape)

input shape (10241, 18)
output shape (10241, 4469)


In [None]:
model = models.Sequential([
	layers.Embedding(total_words, 100, input_length=max_len-1),
	layers.Bidirectional(layers.SimpleRNN(150, activation='tanh')),
	layers.Dense(total_words, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [25]:
model.summary()

In [19]:
model.fit(X, y, epochs=50, verbose=1, validation_batch_size=0.2)

Epoch 1/50
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.0258 - loss: 8.0645
Epoch 2/50
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.0299 - loss: 7.4134
Epoch 3/50
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.0393 - loss: 6.8828
Epoch 4/50
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.0541 - loss: 6.2938
Epoch 5/50
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.0774 - loss: 5.6884
Epoch 6/50
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.1251 - loss: 5.0759
Epoch 7/50
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.2184 - loss: 4.4547
Epoch 8/50
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.3399 - loss: 3.8377
Epoch 9/50
[1m321/321[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x23c47421300>

In [20]:
loss, acc = model.evaluate(X, y)
print("Loss : ", loss)
print("Accuracy : ", acc)

[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9594 - loss: 0.1036
Loss :  0.10359061509370804
Accuracy :  0.9593789577484131


In [21]:
def generate_text(seed_text, n_words=5):
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = sequence.pad_sequences([token_list], maxlen=max_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=1)
    
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text.title()

In [22]:
print(sample_data[:10])

['Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters', 'American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video', '23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23)', 'The Funniest Tweets From Parents This Week (Sept. 17-23)', 'Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer', 'Cleaner Was Dead In Belk Bathroom For 4 Days Before Body Found: Police', 'Reporter Gets Adorable Surprise From Her Boyfriend While Live On TV', 'Puerto Ricans Desperate For Water After Hurricane Fiona’s Rampage', 'How A New Documentary Captures The Complexity Of Being A Child Of Immigrants', "Biden At UN To Call Russian War An Affront To Body's Charter"]


In [23]:
seed_text = "cleaner dead"
print(generate_text(seed_text, n_words=10))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Cleaner Dead After Russian Bomb When Thinking About Jan 6 Committee Obtains


In [24]:
seed_text = "Trump said"
print(generate_text(seed_text, n_words=10))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Trump Said Wounded Veterans In Military Parades Didn’T 'Look Good' For Him
