In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder
import re

In [2]:
# Load the labelled headlines. Only the 'text' and 'label' columns are read by
# the later cells, so a row is discarded only when one of those two is missing;
# a bare dropna() would also throw away rows whose NaN sits in an unrelated column.
# NOTE(review): the training data is read from a file named 'test.csv' — confirm
# this is the intended file.
df = pd.read_csv('test.csv').dropna(subset=['text', 'label'])

In [3]:
def clean(text):
  """Normalise a headline for tokenisation.

  The value is converted to ``str`` and lower-cased, every character that is
  not ``a-z``, ``0-9`` or whitespace is replaced by a space, and the result is
  re-joined with single spaces (which also trims both ends).
  """
  lowered = str(text).lower()
  alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
  tokens = alnum_only.split()
  return " ".join(tokens)
# Store the normalised headline next to the raw one; the raw 'text' column is left untouched.
df['clean_text'] = df['text'].map(clean)

In [4]:
# Learn the set of category values, then replace them with integer ids.
# NOTE: df['label'] is overwritten in place, so re-running this cell on its own
# refits the encoder on the already-encoded ids rather than the original values.
label_encoder = LabelEncoder().fit(df['label'])
df['label'] = label_encoder.transform(df['label'])
# One output unit per distinct category seen during fit.
num_classes = len(label_encoder.classes_)

In [5]:
# Vocabulary cap, fixed token length per headline (padded/truncated), batch size.
MAX_VOCAB = 10000
SEQ_LEN = 40
BATCH = 32
SEED = 42

# Seed the Python, NumPy and TensorFlow RNGs so that shuffling order and weight
# initialisation repeat across Restart & Run All.
tf.keras.utils.set_random_seed(SEED)

# Build the token vocabulary from the cleaned training text.
vectorizer = layers.TextVectorization(max_tokens = MAX_VOCAB, output_sequence_length = SEQ_LEN)
vectorizer.adapt(df['clean_text'])

train_texts = df['clean_text'].values
train_labels = df['label'].values

# The shuffle buffer covers the whole dataset: a buffer smaller than the data
# only mixes neighbouring rows, so a file ordered by label would produce batches
# dominated by a single class. Vectorisation runs per batch, after batching.
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
    .shuffle(len(train_texts), seed = SEED)
    .batch(BATCH)
    .map(lambda x, y: (vectorizer(x), y))
    .prefetch(tf.data.AUTOTUNE)
)

In [6]:
# Bag-of-embeddings classifier: embed each token id, average the embeddings over
# the sequence axis, then classify with a small dense head.
model = models.Sequential()
model.add(layers.Embedding(input_dim = MAX_VOCAB, output_dim = 64))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(units = 64, activation = "relu"))
model.add(layers.Dense(units = num_classes, activation = "softmax"))

# Labels are integer ids, hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)
model.summary()


In [7]:
# Number of full passes over train_ds.
EPOCHS = 10
# NOTE(review): no validation data is passed, so the accuracy reported per epoch
# is measured on the training batches only.
model.fit(x = train_ds, epochs = EPOCHS)

Epoch 1/10
238/238 ━━━━━━━━━━━━━━━━━━━━ 5s 15ms/step - accuracy: 0.5371 - loss: 1.2154
Epoch 2/10
238/238 ━━━━━━━━━━━━━━━━━━━━ 2s 9ms/step - accuracy: 0.8887 - loss: 0.3673
Epoch 3/10
238/238 ━━━━━━━━━━━━━━━━━━━━ 2s 9ms/step - accuracy: 0.9346 - loss: 0.2080
Epoch 4/10
238/238 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - accuracy: 0.9680 - loss: 0.1161
Epoch 5/10
238/238 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - accuracy: 0.9841 - loss: 0.0671
Epoch 6/10
238/238 ━━━━━━━━━━━━━━━━━━━━ 2s 10ms/step - accuracy: 0.9936 - loss: 0.0322
Epoch 7/10
238/238 ━━━━━━━━━━━━━━━━━━━━ 3s 12ms/step - accuracy: 0.9974 - loss: 0.0192
Epoch 8/10
238/238 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - accuracy: 0.9998 - loss: 0.0091
Epoch 9/10
238/238 ━━━━━

<keras.src.callbacks.history.History at 0x7f6faf435520>

In [8]:
# Hand-written headlines used as a quick qualitative check of the trained model.
test_headlines = [
    "India wins world cup",
    "NASA discovers new planet",
    "Stock markets fall due to inflation",
    "New AI model beats all benchmarks",
    "Cristiano Ronaldo becomes the topscorer in football history",
    "Shamikh scores a hattrick in his debut football match",
    "Planck epoch Institute of Professional studies offers internship to graduated students",
    "Car crashes due to speeding and 2 people were injured",
    "A local store victim to a robbery attempt",
    "Two civilians brutally killed by an armed robber",
    "Value of Indian ruppee falls to 90 against a US Dollar",
]

def clean(s):
  """Normalise a headline exactly as the training text was normalised.

  The value is converted to ``str`` and lower-cased, every character that is
  not ``a-z``, ``0-9`` or whitespace is replaced by a space, and the result is
  re-joined with single spaces.

  This redefinition shadows the ``clean(text)`` from the preprocessing cell, so
  it must produce the same output; the ``str()`` coercion keeps non-string
  input (e.g. a number) from raising, matching that earlier definition.
  Uses the notebook-level ``re`` import rather than re-importing per call.
  """
  s = str(s).lower()
  s = re.sub(r"[^a-z0-9\s]", " ", s)
  return " ".join(s.split())

# Apply the same cleaning used for training, then convert to token ids.
cleaned_test = list(map(clean, test_headlines))

X = vectorizer(cleaned_test)

# Display names keyed by the label values that inverse_transform returns.
# NOTE(review): this table is hand-written; confirm it matches the label
# values in the dataset, since nothing here derives it from the data.
label_map = {
    0: "Sports",
    1: "Business",
    2: "Economy",
    3: "Science/Technology",
    4: "Crime",
}

pred = model.predict(X)
# Most probable class index per headline.
pred_classes = np.argmax(pred, axis = 1)
# Decode every index back to its original label value in a single call.
predicted_labels = list(label_encoder.inverse_transform(pred_classes))

# Print results
for text, cls in zip(test_headlines, predicted_labels):
    print(f"{text}  --->  {label_map[cls]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
India wins world cup  --->  Sports
NASA discovers new planet  --->  Science/Technology
Stock markets fall due to inflation  --->  Economy
New AI model beats all benchmarks  --->  Science/Technology
Cristiano Ronaldo becomes the topscorer in football history  --->  Science/Technology
Shamikh scores a hattrick in his debut football match  --->  Sports
Planck epoch Institute of Professional studies offers internship to graduated students  --->  Science/Technology
Car crashes due to speeding and 2 people were injured  --->  Sports
A local store victim to a robbery attempt  --->  Sports
Two civilians brutally killed by an armed robber  --->  Sports
Value of Indian ruppee falls to 90 against a US Dollar  --->  Economy
