<a href="https://colab.research.google.com/github/SaatvikP/News_Classification/blob/main/News_Classification_4_Groups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install datasets tensorflow numpy scikit-learn


Collecting datasets
  Using cached datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
from datasets import load_dataset
import gensim.downloader as api

In [14]:
# Load Dataset (AG News)
dataset = load_dataset("ag_news")

# Extract train & test data
X_train, y_train = dataset['train']['text'], dataset['train']['label']
X_test, y_test = dataset['test']['text'], dataset['test']['label']

# Define class labels
category_labels = ["World", "Sports", "Business", "Science/Tech"]

In [15]:
MAX_VOCAB_SIZE = 10000
MAX_LENGTH = 50  # Max words per news article

# Tokenization
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_LENGTH, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_LENGTH, padding='post', truncating='post')

# Convert labels to categorical (One-hot encoding)
num_classes = len(category_labels)
y_train_enc = tf.keras.utils.to_categorical(y_train, num_classes)
y_test_enc = tf.keras.utils.to_categorical(y_test, num_classes)

In [16]:
# Load Pre-trained Word Embeddings
glove_vectors = api.load("glove-wiki-gigaword-100")

def create_embedding_matrix(tokenizer, embedding_dim=100):
    vocab_size = min(MAX_VOCAB_SIZE, len(tokenizer.word_index) + 1)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, i in tokenizer.word_index.items():
        if i >= MAX_VOCAB_SIZE:
            continue
        if word in glove_vectors:
            embedding_matrix[i] = glove_vectors[word]

    return embedding_matrix

embedding_matrix = create_embedding_matrix(tokenizer, embedding_dim=100)




In [20]:
def build_model():
    lstm_units = 128
    dense_units = 64

    inputs = Input(shape=(MAX_LENGTH,))
    x = Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=100,
                  weights=[embedding_matrix], trainable=False)(inputs)

    x = Bidirectional(LSTM(lstm_units, dropout=0.4, return_sequences=False))(x)

    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(0.5)(x)

    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

model = build_model()
model.summary()

In [21]:
# Train Model
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_padded, y_train_enc,
    validation_split=0.1,
    epochs=10, batch_size=32,
    callbacks=[early_stop]
)

Epoch 1/10
[1m3375/3375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 10ms/step - accuracy: 0.8199 - loss: 0.5123 - val_accuracy: 0.8878 - val_loss: 0.3045
Epoch 2/10
[1m3375/3375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 10ms/step - accuracy: 0.8900 - loss: 0.3259 - val_accuracy: 0.8935 - val_loss: 0.2869
Epoch 3/10
[1m3375/3375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - accuracy: 0.9010 - loss: 0.2896 - val_accuracy: 0.8971 - val_loss: 0.2782
Epoch 4/10
[1m3375/3375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - accuracy: 0.9076 - loss: 0.2711 - val_accuracy: 0.9065 - val_loss: 0.2590
Epoch 5/10
[1m3375/3375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - accuracy: 0.9133 - loss: 0.2530 - val_accuracy: 0.9100 - val_loss: 0.2500
Epoch 6/10
[1m3375/3375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - accuracy: 0.9150 - loss: 0.2466 - val_accuracy: 0.9070 - val_loss: 0.2521
Epoc

In [22]:
y_pred_probs = model.predict(X_test_padded)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test_enc, axis=1)

print("\n Classification Report:")
print(classification_report(y_true, y_pred, target_names=category_labels))

print("\n Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step

 Classification Report:
              precision    recall  f1-score   support

       World       0.92      0.91      0.92      1900
      Sports       0.97      0.96      0.97      1900
    Business       0.88      0.87      0.87      1900
Science/Tech       0.87      0.90      0.88      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600


 Confusion Matrix:
[[1726   31   79   64]
 [  32 1833   20   15]
 [  68    7 1644  181]
 [  46   11  134 1709]]


In [23]:
# Making Predictions
def predict_category(text):
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=MAX_LENGTH, padding='post', truncating='post')
    pred_probs = model.predict(padded)[0]
    pred_category_idx = np.argmax(pred_probs)
    return category_labels[pred_category_idx]

sample_news = [
    "NASA launches a satellite into orbit.",
    "Bitcoin prices surge after market speculation.",
    "The Lakers win their final game of the season.",
    "The government announces new trade policies."
]

for text in sample_news:
    print(f"News: {text}\nPredicted Category: {predict_category(text)}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
News: NASA launches a satellite into orbit.
Predicted Category: Science/Tech

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
News: Bitcoin prices surge after market speculation.
Predicted Category: Business

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
News: The Lakers win their final game of the season.
Predicted Category: Sports

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
News: The government announces new trade policies.
Predicted Category: Business

