# Load Dataset

The 20 Newsgroups dataset is a classic benchmark for text classification and is well suited to multi-class problems. It contains roughly 20,000 newsgroup posts spread across 20 distinct categories.

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sentence_transformers import SentenceTransformer
import xgboost as xgb
import shap

In [None]:
# The 20 Newsgroups corpus ships with scikit-learn's dataset loaders.
from sklearn.datasets import fetch_20newsgroups

# Headers, footers and quoted replies are stripped so that the model has to
# focus on the language of each post rather than on its metadata.
newsgroups_data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=50,
    remove=('headers', 'footers', 'quotes'),
)

target_names = newsgroups_data.target_names  # Class names
labels = newsgroups_data.target              # Integer labels (0–19)
texts = newsgroups_data.data                 # List of news articles

# Data Preprocessing

In [None]:
import spacy
# spaCy is used for the text preprocessing in this notebook.
# Compared to NLTK it is faster, more robust for NLP tasks and easier to integrate with modern ML pipelines.
nlp = spacy.load('en_core_web_sm')

def cleantext(text, pipeline=None):
  """Lower-case, lemmatize and filter one document into a cleaned string.

  Tokens are kept only when they are purely alphabetic (`token.is_alpha`)
  and are not stopwords (`token.is_stop`); each kept token is replaced by
  its lemma (`token.lemma_`). Word order and repeated words are preserved,
  so the result is still a readable token sequence for a downstream model.

  Args:
    text: Raw document text.
    pipeline: Optional callable that turns a string into an iterable of
      tokens exposing `lemma_`, `is_alpha` and `is_stop`. Defaults to the
      module-level spaCy pipeline `nlp`.

  Returns:
    The kept lemmas joined by single spaces ("" if nothing survives).
  """
  if pipeline is None:
    pipeline = nlp
  doc = pipeline(text.lower())
  # A list (not a set) keeps the original order and duplicates, and makes the
  # output deterministic from run to run.
  tokens = [
      token.lemma_ for token in doc
      if token.is_alpha and not token.is_stop
  ]
  return " ".join(tokens)

In [None]:
# Clean every post in the corpus.
# NOTE(review): this rebinds the name `cleantext` from the function to the list
# of cleaned strings, so this cell cannot be re-run without first re-running the
# cell that defines the function.
cleantext = list(map(cleantext, texts))

# Text Vectorization

I am using BERT (Bidirectional Encoder Representations from Transformers).
It is a transformer-based, bidirectional NLP model.

In [None]:
from transformers import BertTokenizer, BertModel, BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import numpy as np
from sklearn.model_selection import train_test_split

### Tokenize text with BERT Tokenizer

In [None]:
# NOTE(review): the encodings are produced with the `bert-base-uncased` tokenizer,
# while the checkpoint fine-tuned later in this notebook is `distilbert-base-uncased`;
# confirm the two tokenizers are interchangeable for this model.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Split into train and test (80/20). Stratifying on the label keeps the
# proportions of the 20 classes the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    cleantext, labels, test_size=0.2, random_state=50, stratify=labels
)

# Tokenize the inputs, truncating to the 512-token model limit.
# Padding is intentionally NOT applied here: padding the whole corpus to the
# longest sequence up front wastes memory and compute. The Trainer is given a
# tokenizer, so each batch is padded to its own longest sequence instead.
train_encodings = tokenizer(X_train, truncation=True, max_length=512)
test_encodings = tokenizer(X_test, truncation=True, max_length=512)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Structuring the tokenized data for batching by wrapping it in a PyTorch Dataset

In [None]:
from torch.utils.data import Dataset
import torch

In [None]:
class NewsGroupDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their class labels.

    Indexing returns a dict with one tensor per key of `encodings`, plus a
    `labels` tensor holding the label at the same position.
    """

    def __init__(self, encodings, labels):
        # `encodings` must expose `.items()` yielding (field name, per-example values).
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset has exactly one example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so they can be fed to the Trainer.
train_dataset = NewsGroupDataset(encodings=train_encodings, labels=y_train)
test_dataset = NewsGroupDataset(encodings=test_encodings, labels=y_test)

# Load BERT for Sequence Classification

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert-base-uncased"

# Load the pretrained checkpoint with a classification head that outputs one
# logit per class. The class count is taken from the dataset's `target_names`
# rather than hard-coded, so it stays in sync with the loaded labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(target_names)
)
# Tokenizer matching the checkpoint; it replaces the earlier `tokenizer` binding.
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



*   BERT model preconfigured for classification tasks
*   Adds a fully connected layer on top of the final hidden state of the [CLS] token (classification token)



# Model Training

The goal of this step is to set the training arguments
and initialize the Trainer API.

In [None]:
from transformers import TrainingArguments

# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",          # Where to save
    save_strategy="no",              # Don't save intermediate checkpoints
    num_train_epochs=5,              # Number of epochs to train
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # NOTE(review): small batch, presumably to fit GPU memory; it does not by itself make training faster
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_steps=10,                # Log every 10 steps
    logging_dir="./logs",
    disable_tqdm=False,              # Keep progress bars visible
    report_to="none"                 # Prevents reporting to WandB
)

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Return the accuracy of the predictions as ``{"accuracy": value}``.

    `eval_pred` unpacks into (logits, labels); the predicted class of each
    row is the index of its largest logit along axis 1.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=1)
    return {"accuracy": accuracy_score(gold, predicted)}

### Create Trainer

In [None]:
from transformers import Trainer

# Bundle the model, training configuration, datasets, tokenizer and metric function.
# NOTE(review): eval_dataset is the held-out test split, so evaluation metrics are
# computed on the same data that serves as the final test set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune the model using the settings in training_args.
trainer.train()

In [None]:
# Evaluate on the eval_dataset passed to the Trainer (the test split) and show the resulting metrics.
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 2.236729860305786, 'eval_accuracy': 0.3376657824933687, 'eval_runtime': 57.2868, 'eval_samples_per_second': 65.809, 'eval_steps_per_second': 8.239, 'epoch': 5.0}


In [None]:
from sklearn.metrics import classification_report

# Get predictions for the test split; the predicted class is the index of the
# largest logit in each row.
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Print the per-class report with newsgroup names instead of bare indices 0-19.
# `labels` is given explicitly so the rows always line up with `target_names`.
print(classification_report(
    predictions.label_ids,
    preds,
    labels=list(range(len(target_names))),
    target_names=target_names,
))


              precision    recall  f1-score   support

           0       0.25      0.34      0.29       154
           1       0.34      0.36      0.35       206
           2       0.55      0.42      0.47       197
           3       0.50      0.37      0.43       214
           4       0.41      0.33      0.36       190
           5       0.68      0.50      0.58       222
           6       0.55      0.54      0.54       188
           7       0.28      0.45      0.34       186
           8       0.26      0.30      0.28       198
           9       0.20      0.28      0.24       216
          10       0.27      0.35      0.30       196
          11       0.26      0.28      0.27       180
          12       0.39      0.33      0.36       186
          13       0.19      0.33      0.24       195
          14       0.31      0.22      0.26       202
          15       0.40      0.35      0.37       196
          16       0.32      0.23      0.26       190
          17       0.44    