In [1]:
import tensorflow as tf
from tensorflow.keras import models, layers, preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
df = pd.read_json("D:/Intern/DataSets/News_Category_Dataset_v3.json", lines=True)
data = df[["headline", "category"]]

In [3]:
data.head()

Unnamed: 0,headline,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS


In [4]:
data.shape

(209527, 2)

In [5]:
sample_data = data.sample(50000, random_state=42).reset_index(drop=True)

In [6]:
sample_data.head()

Unnamed: 0,headline,category
0,What If We Were All Family Generation Changers?,IMPACT
1,Firestorm At AOL Over Employee Benefit Cuts,BUSINESS
2,Dakota Access Protesters Arrested As Deadline ...,POLITICS
3,One Glimpse Of These Baby Kit Foxes And You'll...,GREEN
4,"Mens' Sweat Pheromone, Androstadienone, Influe...",SCIENCE


In [7]:
max_words = 10000

In [8]:
tokenizer = preprocessing.text.Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(sample_data["headline"])

In [9]:
sequence = tokenizer.texts_to_sequences(sample_data["headline"])
max_len = max([len(seq) for seq in sequence])
print(max_len)

28


In [10]:
pad_data = preprocessing.sequence.pad_sequences(sequences=sequence, maxlen=max_len, padding='post')

In [11]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(sample_data["category"])

num_classes = len(label_encoder.classes_)
print(num_classes)
y = tf.keras.utils.to_categorical(y, num_classes=num_classes) # One Hot encoding - [0, 0, ... 1]

42


In [12]:
X_train, X_test, y_train, y_test = train_test_split(pad_data, y, test_size=0.2, random_state=42)

In [13]:
model = models.Sequential([
	layers.Embedding(input_dim = min(max_words, len(tokenizer.word_index) + 1), 
                  		output_dim = 128, 
                    	input_length = max_len),
	layers.LSTM(128, activation='tanh', return_sequences=True, recurrent_dropout=0.3),
	layers.LSTM(64, activation='tanh', recurrent_dropout=0.3),
	layers.Dense(num_classes, activation='softmax')
])



In [14]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [15]:
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='auto', restore_best_weights=True)

model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), callbacks=[early_stop])

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 32ms/step - accuracy: 0.2250 - loss: 3.0717 - val_accuracy: 0.2658 - val_loss: 2.8832
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 35ms/step - accuracy: 0.3214 - loss: 2.6468 - val_accuracy: 0.3395 - val_loss: 2.5261
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 32ms/step - accuracy: 0.4058 - loss: 2.2448 - val_accuracy: 0.4267 - val_loss: 2.2435
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 32ms/step - accuracy: 0.5026 - loss: 1.8791 - val_accuracy: 0.4609 - val_loss: 2.1094
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 33ms/step - accuracy: 0.5644 - loss: 1.6023 - val_accuracy: 0.4749 - val_loss: 2.0702
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 30ms/step - accuracy: 0.6193 - loss: 1.3831 - val_accuracy: 0.4841 - val_loss: 2.0863
Epoc

<keras.src.callbacks.history.History at 0x2384f905270>

In [16]:
loss, acc = model.evaluate(X_test, y_test)
print(f"Loss: {loss}, Accuracy: {acc}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.4749 - loss: 2.0702
Loss: 2.0701723098754883, Accuracy: 0.4749000072479248


In [17]:
def predict_category(model, tokenizer, text, max_len, label_encoder):
	seq = tokenizer.texts_to_sequences([text])
	pad_seq = preprocessing.sequence.pad_sequences(sequences=seq, maxlen=max_len, padding='post')
	pred = model.predict(pad_seq, verbose=0)
	category = label_encoder.inverse_transform([np.argmax(pred)])[0]
	return category

In [18]:
print(data["category"].unique())

['U.S. NEWS' 'COMEDY' 'PARENTING' 'WORLD NEWS' 'CULTURE & ARTS' 'TECH'
 'SPORTS' 'ENTERTAINMENT' 'POLITICS' 'WEIRD NEWS' 'ENVIRONMENT'
 'EDUCATION' 'CRIME' 'SCIENCE' 'WELLNESS' 'BUSINESS' 'STYLE & BEAUTY'
 'FOOD & DRINK' 'MEDIA' 'QUEER VOICES' 'HOME & LIVING' 'WOMEN'
 'BLACK VOICES' 'TRAVEL' 'MONEY' 'RELIGION' 'LATINO VOICES' 'IMPACT'
 'WEDDINGS' 'COLLEGE' 'PARENTS' 'ARTS & CULTURE' 'STYLE' 'GREEN' 'TASTE'
 'HEALTHY LIVING' 'THE WORLDPOST' 'GOOD NEWS' 'WORLDPOST' 'FIFTY' 'ARTS'
 'DIVORCE']


In [19]:
test_headlines = [
    "President signs new bill on climate change",
    "Manchester United wins Champions League",
    "New iPhone 15 Pro Max launched today",
]

In [20]:
for headline in test_headlines:
	category = predict_category(model, tokenizer, headline, max_len, label_encoder)
	print(f"Headline: {headline}, \nPredicted category: {category}")

Headline: President signs new bill on climate change, 
Predicted category: POLITICS
Headline: Manchester United wins Champions League, 
Predicted category: SPORTS
Headline: New iPhone 15 Pro Max launched today, 
Predicted category: TECH
