<a href="https://colab.research.google.com/github/SaatvikP/News_Classification/blob/main/News_Classification_20_Groups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install tensorflow numpy scikit-learn




In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import gensim.downloader as api


In [13]:
# 20 Newsgroups corpus: the newsgroup names requested from scikit-learn.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Fetch both the train and test portions, with headers, footers and quoted
# replies removed from each post.
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
X_texts, y_labels = newsgroups.data, newsgroups.target  # article texts, numerical labels
category_labels = newsgroups.target_names  # label names

# Hold out 20% of the articles for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_texts,
    y_labels,
    test_size=0.2,
    random_state=42,
)


In [14]:
MAX_VOCAB_SIZE = 20000  # cap on the number of distinct words kept by the tokenizer
MAX_LENGTH = 200  # every document is cut or padded to this many tokens

# Fit the vocabulary on the training texts only; unseen words map to "<OOV>".
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> lists of integer word indices (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad short documents and truncate long ones, both at the end of the sequence.
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=MAX_LENGTH, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)

# One-hot encode the integer labels.
num_classes = len(category_labels)
y_train_enc, y_test_enc = (
    tf.keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)

In [15]:
# Load the pre-trained "glove-wiki-gigaword-100" vectors through gensim's
# downloader module (imported as `api`). The result is used below as a
# word -> vector lookup (`word in glove_vectors`, `glove_vectors[word]`).
# NOTE(review): the name suggests 100-dimensional vectors, matching the
# embedding_dim=100 passed to create_embedding_matrix — confirm if the model
# name is ever changed.
print("🔄 Loading GloVe embeddings...")
glove_vectors = api.load("glove-wiki-gigaword-100")

def create_embedding_matrix(tokenizer, embedding_dim=100, vectors=None, max_vocab=None):
    """Build an embedding lookup matrix aligned with the tokenizer's word indices.

    Row ``i`` receives the pre-trained vector of the word whose index in
    ``tokenizer.word_index`` is ``i``. Rows that receive no vector (words that
    are absent from ``vectors``, or indices no word maps to) stay all-zero.

    Args:
        tokenizer: Object exposing ``word_index``, a mapping of word -> integer
            index.
        embedding_dim: Length of each embedding vector (number of columns).
        vectors: Lookup supporting ``word in vectors`` and ``vectors[word]``.
            Defaults to the notebook-level ``glove_vectors``.
        max_vocab: Upper bound on the number of rows. Defaults to the
            notebook-level ``MAX_VOCAB_SIZE``.

    Returns:
        ``numpy.ndarray`` of shape
        ``(min(max_vocab, len(tokenizer.word_index) + 1), embedding_dim)``.
    """
    # Fall back to the notebook globals so existing two-argument calls behave
    # exactly as before; explicit arguments make the function reusable.
    if vectors is None:
        vectors = glove_vectors
    if max_vocab is None:
        max_vocab = MAX_VOCAB_SIZE

    vocab_size = min(max_vocab, len(tokenizer.word_index) + 1)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, i in tokenizer.word_index.items():
        # Skip indices that fall outside the matrix (beyond the vocabulary cap).
        if i >= vocab_size:
            continue
        if word in vectors:
            embedding_matrix[i] = vectors[word]

    return embedding_matrix

# Build the lookup matrix for the fitted tokenizer. Each GloVe vector is
# assigned into a row of width embedding_dim, so 100 needs to agree with the
# dimensionality of the loaded vectors.
embedding_matrix = create_embedding_matrix(tokenizer, embedding_dim=100)


🔄 Loading GloVe embeddings...


In [16]:
# Bidirectional LSTM classifier on top of frozen pre-trained embeddings
def build_model():
    """Build and compile the BiLSTM news classifier.

    Architecture: frozen embedding lookup initialised from ``embedding_matrix``
    -> bidirectional LSTM -> ReLU dense layer with dropout -> softmax over
    ``num_classes``.

    Reads the notebook globals ``MAX_LENGTH``, ``embedding_matrix`` and
    ``num_classes``.

    Returns:
        A compiled ``Model`` using categorical cross-entropy loss, the Adam
        optimizer and an accuracy metric.
    """
    lstm_units = 128
    dense_units = 64

    # Size the embedding layer from the matrix itself. The matrix can have
    # fewer than MAX_VOCAB_SIZE rows when the fitted vocabulary is small, in
    # which case hard-coded dimensions would not match the supplied weights.
    vocab_size, embedding_dim = embedding_matrix.shape

    inputs = Input(shape=(MAX_LENGTH,))
    # NOTE(review): no mask_zero is set, so zero-padded positions are
    # presumably processed by the LSTM like ordinary timesteps — consider
    # masking if accuracy on short documents matters.
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                  weights=[embedding_matrix], trainable=False)(inputs)

    # Only the final state of each direction is kept (return_sequences=False).
    x = Bidirectional(LSTM(lstm_units, dropout=0.4, return_sequences=False))(x)

    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(0.5)(x)

    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

# Instantiate the network once and print its layer summary; `model` is the
# instance used by the training, evaluation and prediction cells below.
print("🚀 Building Model...")
model = build_model()
model.summary()


🚀 Building Model...


In [17]:
# Stop once validation loss has not improved for 3 consecutive epochs and roll
# the model back to the weights of its best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

print("🔄 Training Model...")
# 10% of the training data is set aside by Keras for validation.
fit_options = dict(
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    callbacks=[early_stop],
)
history = model.fit(X_train_padded, y_train_enc, **fit_options)

🔄 Training Model...
Epoch 1/10
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.0755 - loss: 2.9442 - val_accuracy: 0.2042 - val_loss: 2.4156
Epoch 2/10
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.2038 - loss: 2.3753 - val_accuracy: 0.3190 - val_loss: 2.0209
Epoch 3/10
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.3095 - loss: 2.0261 - val_accuracy: 0.4224 - val_loss: 1.7407
Epoch 4/10
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.4001 - loss: 1.7973 - val_accuracy: 0.4741 - val_loss: 1.5708
Epoch 5/10
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.4675 - loss: 1.6078 - val_accuracy: 0.4973 - val_loss: 1.5067
Epoch 6/10
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - accuracy: 0.5096 - loss: 1.5374 - val_accuracy: 0.5398 - val_loss: 1.4484
E

In [18]:
# Score the trained model on the held-out test set

print("🔍 Evaluating Model...")
y_pred_probs = model.predict(X_test_padded)
# Reduce per-class probabilities and one-hot label rows to integer class ids.
y_pred = y_pred_probs.argmax(axis=1)
y_true = y_test_enc.argmax(axis=1)

print("\n📊 Classification Report:")
report = classification_report(y_true, y_pred, target_names=category_labels)
print(report)

print("\n📊 Confusion Matrix:")
conf_matrix = confusion_matrix(y_true, y_pred)
print(conf_matrix)

🔍 Evaluating Model...
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step

📊 Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.19      0.16      0.17       151
           comp.graphics       0.50      0.50      0.50       202
 comp.os.ms-windows.misc       0.46      0.41      0.43       195
comp.sys.ibm.pc.hardware       0.33      0.65      0.44       183
   comp.sys.mac.hardware       0.34      0.12      0.18       205
          comp.windows.x       0.56      0.45      0.50       215
            misc.forsale       0.65      0.58      0.62       193
               rec.autos       0.60      0.65      0.62       196
         rec.motorcycles       0.37      0.58      0.45       168
      rec.sport.baseball       0.76      0.73      0.74       211
        rec.sport.hockey       0.75      0.85      0.80       198
               sci.crypt       0.69      0.56      0.62       201
         sci.elect

In [19]:
# Making Predictions
# ============================
def predict_category(text):
    """Predict the newsgroup name for a single raw text string.

    The text goes through the same steps as the training data: conversion to
    word indices with the fitted ``tokenizer``, then post-padding/truncation to
    ``MAX_LENGTH`` tokens.

    Args:
        text: The document to classify, as a string.

    Returns:
        The entry of ``category_labels`` with the highest predicted probability.
    """
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=MAX_LENGTH, padding='post', truncating='post')
    # verbose=0: a one-sample batch does not need a progress bar, and printing
    # one per call clutters the output of any loop that calls this function.
    pred_probs = model.predict(padded, verbose=0)[0]
    pred_category_idx = np.argmax(pred_probs)
    return category_labels[pred_category_idx]

# A few hand-written headlines to sanity-check the classifier
sample_news = [
    "Apple unveils the new MacBook Pro with M3 chip.",
    "The Lakers won a thrilling game in overtime last night.",
    "NASA launches a new Mars rover to explore the red planet.",
    "The US government announces new policies on climate change.",
    "Bitcoin prices are soaring after a new regulation change.",
]

print("\n🔍 Testing Predictions...\n")
for headline in sample_news:
    predicted = predict_category(headline)
    print(f"News: {headline}\nPredicted Category: {predicted}\n")


🔍 Testing Predictions...

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
News: Apple unveils the new MacBook Pro with M3 chip.
Predicted Category: sci.electronics

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
News: The Lakers won a thrilling game in overtime last night.
Predicted Category: rec.sport.hockey

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
News: NASA launches a new Mars rover to explore the red planet.
Predicted Category: sci.space

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
News: The US government announces new policies on climate change.
Predicted Category: talk.politics.misc

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
News: Bitcoin prices are soaring after a new regulation change.
Predicted Category: rec.autos

