# 1 Data Preparation

In [None]:
# Load the News Category dataset (downloaded from Kaggle) from a local JSON Lines file

import pandas as pd
import json

data = []
data_path = "/content/News_Category_Dataset_v3.json"
skipped_lines = []  # 1-based line numbers of records that could not be parsed

# The file is read as JSON Lines: one JSON object per line.
with open(data_path, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        if not line.strip():
            # A blank line holds no record; do not report it as corrupt.
            continue
        try:
            data.append(json.loads(line))
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; keep the broad
            # catch so loading stays best-effort, but remember what was lost.
            skipped_lines.append(line_number)

# Report dropped records once, with enough detail to find them in the file,
# instead of printing an identical message per bad line.
if skipped_lines:
    print(
        f"Skipped {len(skipped_lines)} malformed line(s); "
        f"first line numbers: {skipped_lines[:5]}"
    )

df = pd.DataFrame(data)
print(f"Loaded {df.shape[0]} rows.")
df.head()


Loaded 209527 rows.


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [None]:
# Model input = headline and short description joined by a single space.
# Missing values become empty strings so the concatenation never yields NaN.
headline_part = df["headline"].fillna("")
description_part = df["short_description"].fillna("")
df["text"] = headline_part + " " + description_part

In [None]:
# Encode category names as integer class ids

from sklearn.preprocessing import LabelEncoder

# Map each category name to an integer class id stored in df["label"].
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["category"])

# LabelEncoder assigns each class its position in `classes_`, so enumerating
# `classes_` gives the same name -> id mapping as transforming it again, with
# plain Python ints (clean repr, JSON-serializable) instead of np.int64.
# NOTE(review): the recorded output lists near-duplicate categories such as
# 'ARTS & CULTURE' / 'CULTURE & ARTS' and 'PARENTING' / 'PARENTS' - consider
# whether they should be merged before encoding.
label_mapping = {name: class_id for class_id, name in enumerate(label_encoder.classes_)}
label_mapping


{'ARTS': np.int64(0),
 'ARTS & CULTURE': np.int64(1),
 'BLACK VOICES': np.int64(2),
 'BUSINESS': np.int64(3),
 'COLLEGE': np.int64(4),
 'COMEDY': np.int64(5),
 'CRIME': np.int64(6),
 'CULTURE & ARTS': np.int64(7),
 'EDUCATION': np.int64(8),
 'ENTERTAINMENT': np.int64(9),
 'ENVIRONMENT': np.int64(10),
 'FIFTY': np.int64(11),
 'FOOD & DRINK': np.int64(12),
 'GOOD NEWS': np.int64(13),
 'GREEN': np.int64(14),
 'HEALTHY LIVING': np.int64(15),
 'HOME & LIVING': np.int64(16),
 'IMPACT': np.int64(17),
 'LATINO VOICES': np.int64(18),
 'MEDIA': np.int64(19),
 'MONEY': np.int64(20),
 'PARENTING': np.int64(21),
 'PARENTS': np.int64(22),
 'POLITICS': np.int64(23),
 'QUEER VOICES': np.int64(24),
 'RELIGION': np.int64(25),
 'SCIENCE': np.int64(26),
 'SPORTS': np.int64(27),
 'STYLE': np.int64(28),
 'STYLE & BEAUTY': np.int64(29),
 'TASTE': np.int64(30),
 'TECH': np.int64(31),
 'THE WORLDPOST': np.int64(32),
 'TRAVEL': np.int64(33),
 'U.S. NEWS': np.int64(34),
 'WEDDINGS': np.int64(35),
 'WEIRD NEWS': 

In [None]:
# Split data into training and validation sets

from sklearn.model_selection import train_test_split

# Features are the combined text; targets are the encoded category ids.
# NOTE(review): the recorded outputs show 209527 rows loaded but only
# 94829 + 23708 = 118537 rows split - presumably `df` was filtered by a cell
# that is no longer in the notebook; confirm with Restart & Run All.
X, y = df["text"].values, df["label"].values

# 80/20 split, stratified on the label and seeded.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

for caption, split_texts in (("Train size:", X_train), ("Validation size:", X_val)):
    print(caption, len(split_texts))

Train size: 94829
Validation size: 23708


# 2 Model Selection

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import os
import warnings
from transformers import logging

os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
# NOTE(review): this silences every Python warning for the rest of the
# session, not only the transformers ones - narrow the filter if real
# warnings (deprecations, dtype issues) should stay visible.
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
num_labels = len(label_encoder.classes_)

# Store the category names in the model config. Without these the config only
# carries placeholder names ("LABEL_0", "LABEL_1", ...), so a saved model cannot
# report human-readable categories on its own. The ids follow the order of
# `label_encoder.classes_`, which is the encoding used for df["label"].
id2label = {class_id: str(name) for class_id, name in enumerate(label_encoder.classes_)}
label2id = {name: class_id for class_id, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

print(model.config)

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": null,
  "dim": 768,
  "dropout": 0.1,
  "dtype": "float32",
  "eos_token_id": null,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30",
    "31": "LABEL_31",
    "32": "LABEL_32",
    "33": "LABEL_33",
    "34

# 3 Tokenization

In [None]:
def tokenize_texts(texts, max_length=128):
    """Tokenize a collection of texts with the notebook-level ``tokenizer``.

    Args:
        texts: Iterable of strings; it is materialized with ``list`` before
            being handed to the tokenizer.
        max_length: Length passed to the tokenizer as ``max_length``. With
            ``padding="max_length"`` and ``truncation=True`` it is used both
            as the padding target and the truncation limit. Defaults to 128,
            the value that was previously hard-coded.

    Returns:
        Whatever ``tokenizer(...)`` returns for the given texts.
    """
    return tokenizer(
        list(texts),
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Tokenize the training texts first, then the validation texts.
train_encodings, val_encodings = (
    tokenize_texts(split_texts) for split_texts in (X_train, X_val)
)

In [None]:
import torch

class NewsDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping whose ``.items()`` yields ``(field, values)`` pairs,
            where ``values[idx]`` is the entry for sample ``idx``.
        labels: Indexable collection of class ids; its length defines the
            dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is taken from the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the target under "labels".
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Wrap each split's encodings together with its labels.
train_dataset, val_dataset = (
    NewsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
    )
)

# 4 Model Training (Fine-Tuning)

In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration, collected in one dict so each setting is easy to
# scan and tweak before it is handed to TrainingArguments.
training_config = {
    "output_dir": "./results",
    # Evaluate and checkpoint once per epoch; both strategies are set to the
    # same value alongside load_best_model_at_end.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 1,
    "weight_decay": 0.01,
    "save_total_limit": 2,
    "report_to": "tensorboard",
    "logging_steps": 10,
    # Reload the best checkpoint when training ends, ranked by the "f1" value
    # returned from compute_metrics.
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
}

training_args = TrainingArguments(**training_config)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``. The predicted class
            of each row is the argmax of ``logits`` along axis 1.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"`` (F1 averaged with
        ``average="weighted"``).
    """
    raw_scores, gold_labels = eval_pred
    predicted_labels = np.argmax(raw_scores, axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold_labels, predicted_labels)
    metrics["f1"] = f1_score(gold_labels, predicted_labels, average="weighted")
    return metrics

In [None]:
from transformers import Trainer

# Wire the model, training configuration, both datasets and the metric
# function into a Trainer.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# NOTE(review): the recorded log starts at loss 0.1671 after 10 steps, which is
# far below what an untrained 42-class head would give (about ln 42 = 3.7).
# Presumably this cell was re-run on an already fine-tuned `model`; confirm
# with Restart & Run All before trusting the recorded metrics.
trainer.train()

{'loss': '0.1671', 'grad_norm': '5.406', 'learning_rate': '1.997e-05', 'epoch': '0.001687'}
{'loss': '0.3262', 'grad_norm': '5.989', 'learning_rate': '1.994e-05', 'epoch': '0.003374'}
{'loss': '0.2966', 'grad_norm': '5.135', 'learning_rate': '1.99e-05', 'epoch': '0.005062'}
{'loss': '0.2622', 'grad_norm': '5.171', 'learning_rate': '1.987e-05', 'epoch': '0.006749'}
{'loss': '0.3381', 'grad_norm': '6.299', 'learning_rate': '1.983e-05', 'epoch': '0.008436'}
{'loss': '0.3155', 'grad_norm': '3.355', 'learning_rate': '1.98e-05', 'epoch': '0.01012'}
{'loss': '0.273', 'grad_norm': '7.516', 'learning_rate': '1.977e-05', 'epoch': '0.01181'}
{'loss': '0.4178', 'grad_norm': '11.28', 'learning_rate': '1.973e-05', 'epoch': '0.0135'}
{'loss': '0.3192', 'grad_norm': '6.79', 'learning_rate': '1.97e-05', 'epoch': '0.01518'}
{'loss': '0.3286', 'grad_norm': '8.56', 'learning_rate': '1.967e-05', 'epoch': '0.01687'}
{'loss': '0.3214', 'grad_norm': '7.52', 'learning_rate': '1.963e-05', 'epoch': '0.01856'}
{'

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

{'train_runtime': '1115', 'train_samples_per_second': '85.02', 'train_steps_per_second': '5.314', 'train_loss': '0.7154', 'epoch': '1'}


TrainOutput(global_step=5927, training_loss=0.7154368071848131, metrics={'train_runtime': 1115.3377, 'train_samples_per_second': 85.023, 'train_steps_per_second': 5.314, 'train_loss': 0.7154368071848131, 'epoch': 1.0})

In [None]:
# Persist the fine-tuned classifier and its tokenizer, each to its own
# sub-directory under model/.
for artifact, target_dir in ((model, "model/classifier"), (tokenizer, "model/tokenizer")):
    artifact.save_pretrained(target_dir)

print("Model and tokenizer saved successfully.")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model and tokenizer saved successfully.


In [None]:
import json
import os

# Invert name -> id into id -> name so predicted class ids can be translated
# back into category names.
label_map = {int(v): k for k, v in label_mapping.items()}

# Create the target directory here so this cell does not depend on the
# model-saving cell having run first.
os.makedirs("model", exist_ok=True)

# JSON object keys are always strings, so the ids are written as "0", "1", ...
# and must be converted back with int() when the file is loaded.
# The encoding is explicit to match how the dataset file is opened.
with open("model/label_map.json", "w", encoding="utf-8") as f:
    json.dump(label_map, f)

# 5 Evaluation

In [None]:
# Run the trainer's evaluation loop. It is called with no arguments, so it
# presumably scores the eval_dataset the Trainer was constructed with
# (val_dataset) - verify if a different split is intended.
eval_results = trainer.evaluate()
# Bare expression: the notebook shows the metrics dict as the cell output.
eval_results

{'eval_loss': '1.067', 'eval_accuracy': '0.6928', 'eval_f1': '0.685', 'eval_runtime': '87.21', 'eval_samples_per_second': '271.8', 'eval_steps_per_second': '16.99', 'epoch': '1'}


{'eval_loss': 1.0674558877944946,
 'eval_accuracy': 0.6928462966087396,
 'eval_f1': 0.6850348824069141,
 'eval_runtime': 87.2109,
 'eval_samples_per_second': 271.847,
 'eval_steps_per_second': 16.993,
 'epoch': 1.0}

In [None]:
import numpy as np

# Get raw model outputs for the validation set, then take the highest-scoring
# class per row (argmax along axis 1) as the predicted label.
predictions = trainer.predict(val_dataset)
val_logits = predictions.predictions
y_pred = np.argmax(val_logits, axis=1)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Recompute the headline metrics directly from the predicted labels.
accuracy = accuracy_score(y_val, y_pred)
f1_weighted = f1_score(y_val, y_pred, average="weighted")

for caption, score in (
    ("Validation Accuracy:", accuracy),
    ("Validation F1 (Weighted):", f1_weighted),
):
    print(caption, score)

Validation Accuracy: 0.6928462966087396
Validation F1 (Weighted): 0.6850348824069141


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix over the validation split; per scikit-learn's convention
# rows are true classes and columns are predicted classes.
cm = confusion_matrix(y_true=y_val, y_pred=y_pred)
cm

array([[100,   4,   5, ...,   3,   0,   5],
       [  4, 140,  13, ...,  11,   2,   0],
       [  3,  10, 362, ...,   9,   2,   1],
       ...,
       [  1,   8,   9, ..., 265,   7,   0],
       [  0,   4,   1, ...,   0, 355,   4],
       [  5,   1,   1, ...,   2,   8, 223]])