In [85]:
print(df.columns)

Index(['Heading', 'Body', 'Category', 'URL'], dtype='object')


In [19]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping

# -------------------
# Step 1: Load & Clean Data
# -------------------

# Load dataset
df = pd.read_csv("datasets/labelled.csv")

# Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@w+|\#','', text)
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    return text

df['Body'] = df['Body'].astype(str).apply(clean_text)
texts = df['Body'].tolist()
labels = df['Category'].tolist()

# -------------------
# Step 2: Tokenization
# -------------------

max_words = 8000
max_len = 150

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=max_len)

# Label encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------
# Step 3: Load GloVe Embeddings
# -------------------

embedding_dim = 100
embedding_index = {}

with open("glove.6B.100d.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# -------------------
# Step 4: Build the Model
# -------------------

model = Sequential()
model.add(Embedding(
    input_dim=max_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=False
))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(np.unique(y)), activation='softmax'))

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# -------------------
# Step 5: Class Weights & Training
# -------------------

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=32,
    class_weight=class_weights_dict,
    callbacks=[early_stop]
)

# -------------------
# Step 6: Evaluation
# -------------------

loss, acc = model.evaluate(X_test, y_test)
print(f"\n✅ Final Test Accuracy: {acc:.2%}")

# -------------------
# Step 7: Save Model (optional)
# -------------------

model.save("text_classification.keras")




Epoch 1/15
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 143ms/step - accuracy: 0.3477 - loss: 1.8712 - val_accuracy: 0.6271 - val_loss: 1.1554
Epoch 2/15
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 134ms/step - accuracy: 0.6238 - loss: 1.1493 - val_accuracy: 0.6478 - val_loss: 1.1152
Epoch 3/15
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 135ms/step - accuracy: 0.6665 - loss: 1.0356 - val_accuracy: 0.6811 - val_loss: 0.9945
Epoch 4/15
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 134ms/step - accuracy: 0.6818 - loss: 0.9950 - val_accuracy: 0.5904 - val_loss: 1.2229
Epoch 5/15
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 135ms/step - accuracy: 0.6750 - loss: 1.0031 - val_accuracy: 0.7121 - val_loss: 0.9496
Epoch 6/15
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 134ms/step - accuracy: 0.7060 - loss: 0.9102 - val_accuracy: 0.7000 - val_loss: 0.9577
Epoch 7/15

In [9]:
model.save("text_classification_model.keras")



In [11]:
model.save("text_classification_model.h5")




In [69]:
def map_to_encoding(text):
    return categories.get(text, -1)  # Return -1 if not found

# Create the 'category_encoding' column
df['category_encoding'] = df['Category'].apply(map_to_encoding)

In [71]:
len(df)

11583

In [73]:
df.head()

Unnamed: 0,Heading,Body,Category,URL,category_encoding
0,free speech not hate speech madras high court ...,madras high court issue significant remark ami...,Judiciary,https://www.indiatoday.in/law/high-courts/stor...,3
1,comment take context say us cop mock indian st...,seattle police officer guild friday come defen...,Crime,https://www.indiatoday.in/world/story/indian-s...,4
2,first meeting one nation one election committe...,first official meeting one nation one election...,Politics,https://www.indiatoday.in/india/story/one-nati...,2
3,us airlines flight depressurize midair plummet...,united airlines jet head rome turn around less...,Crime,https://www.indiatoday.in/world/story/us-fligh...,4
4,terrorist kill security force foil infiltratio...,three terrorist kill infiltration bid foil sec...,Crime,https://www.indiatoday.in/india/story/one-terr...,4


In [75]:
newsArticlesBody = df['Body'].to_list()

categoryLabels = df['category_encoding'].to_list()

In [77]:
# Split the data into training, validation, and test sets (80% train, 10% val, 10% test)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    newsArticlesBody, categoryLabels, test_size=0.2, random_state=42, stratify = categoryLabels)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    test_texts, test_labels, test_size=0.5, random_state=42,  stratify = test_labels)

# Display the size of each dataset
print("Train set size:", len(train_texts))
print("Validation set size:", len(val_texts))
print("Test set size:", len(test_texts))

Train set size: 9266
Validation set size: 1158
Test set size: 1159


In [79]:
clf = ak.TextClassifier(max_trials=2, metrics=['accuracy'])  # Adjust max_trials as needed
# Convert text data to numpy arrays
X_train = np.array(train_texts)
y_train = np.array(train_labels)
X_val = np.array(val_texts)
y_val = np.array(val_labels)

# Train the model
clf.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

TypeError: <class 'keras_hub.src.models.bert.bert_tokenizer.BertTokenizer'> could not be deserialized properly. Please ensure that components that are Python object instances (layers, models, etc.) returned by `get_config()` are explicitly deserialized in the model's `from_config()` method.

config={'module': 'keras_hub.src.models.bert.bert_tokenizer', 'class_name': 'BertTokenizer', 'config': {'name': 'bert_tokenizer', 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'int32'}, 'registered_name': None}, 'config_file': 'tokenizer.json', 'vocabulary': None, 'sequence_length': None, 'lowercase': True, 'strip_accents': False, 'split': True, 'suffix_indicator': '##', 'oov_token': '[UNK]', 'special_tokens': None, 'special_tokens_in_strings': False}, 'registered_name': 'keras_hub>BertTokenizer'}.

Exception encountered: Error when deserializing class 'BertTokenizer' using config={'name': 'bert_tokenizer', 'trainable': True, 'dtype': 'int32', 'config_file': 'tokenizer.json', 'vocabulary': None, 'sequence_length': None, 'lowercase': True, 'strip_accents': False, 'split': True, 'suffix_indicator': '##', 'oov_token': '[UNK]', 'special_tokens': None, 'special_tokens_in_strings': False}.

Exception encountered: BertTokenizer requires `tensorflow` and `tensorflow-text` for text processing. Run `pip install tensorflow-text` to install both packages or visit https://www.tensorflow.org/install

If `tensorflow-text` is already installed, try importing it in a clean python session. Your installation may have errors.

KerasHub uses `tf.data` and `tensorflow-text` to preprocess text on all Keras backends. If you are running on Jax or Torch, this installation does not need GPU support.