In [1]:
#Created by Ricardo Manjarrez
import pandas as pd
# Load the BBC news dataset; it exposes a text column ('data') and a class column ('labels').
df = pd.read_csv('bbc_data.csv')
print(df.columns)
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

# Count each class once and look the counts up by label name. Indexing
# value_counts() by position only matches the printed class names while the
# frequency ranking happens to stay in one exact order.
label_counts = df['labels'].value_counts()
for display_name, label in [
    ("Sports", "sport"),
    ("Business", "business"),
    ("Politics", "politics"),
    ("Tech", "tech"),
    ("entertainment", "entertainment"),
]:
    # .get(..., 0) reports a missing class as zero instead of raising KeyError.
    print(f"{display_name} have {label_counts.get(label, 0)} instances")




Index(['data', 'labels'], dtype='object')
Number of rows: 2225
Number of columns: 2
Sports have 511 instances
Business have 510 instances
Politics have 417 instances
Tech have 401 instances
entertainment have 386 instances


In [2]:
############################################
#Split dataset into training and testing
from sklearn.model_selection import train_test_split
# Hold out 20% of the articles for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['data'],
    df['labels'],
    test_size=0.2,
    random_state=42,
)
print(f"Number of training instances: {len(X_train)}")
print(f"Number of testing instances: {len(X_test)}")


Number of training instances: 1780
Number of testing instances: 445


In [3]:
#############################################
#Load the BERT Model and Tokenizer
from transformers import BertTokenizer, BertForSequenceClassification
# Keep the checkpoint name in one place so the tokenizer and the model cannot drift apart.
MODEL_NAME = 'bert-base-uncased'
# One output logit per distinct class in the dataset (5 for this CSV) instead of a hard-coded count.
NUM_LABELS = df['labels'].nunique()

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# The classification head is newly initialized here and must be fine-tuned before use.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
###############################################################
#Tokenize data
import torch

def encode_data(texts, tokenizer, max_length=512):
    """Tokenize a collection of texts into fixed-length PyTorch tensors.

    Args:
        texts: Iterable of strings; it is materialized into a list before
            being handed to the tokenizer.
        tokenizer: Callable tokenizer invoked with the list of texts and the
            padding/truncation options below.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever the tokenizer returns for these options.
    """
    batch = list(texts)
    tokenizer_options = dict(
        max_length=max_length,
        padding='max_length',   # pad every sequence out to max_length
        truncation=True,        # cut off anything longer than max_length
        return_tensors="pt",    # ask for PyTorch tensors
    )
    return tokenizer(batch, **tokenizer_options)

# Tokenize both splits up front so batches can later be sliced by index.
train_encodings = encode_data(texts=X_train, tokenizer=tokenizer)
test_encodings = encode_data(texts=X_test, tokenizer=tokenizer)

# Give every class name an integer id, numbered in the order unique() returns them.
label_map = {name: class_id for class_id, name in enumerate(df['labels'].unique())}
# Translate the string labels of each split into id tensors.
y_train_enc = torch.tensor(list(map(label_map.__getitem__, y_train)))
y_test_enc = torch.tensor(list(map(label_map.__getitem__, y_test)))


In [5]:
###################################################################
#Use PyTorch's DataLoader for batching
from torch.utils.data import DataLoader, Dataset

class BBCDataset(Dataset):
    """Dataset pairing tokenizer encodings with their label ids.

    Each item is a ``(features, label)`` tuple, where ``features`` maps every
    key of ``encodings`` to the entry at the requested index.
    """

    def __init__(self, encodings, labels):
        # Mapping of field name -> indexable batch of values.
        self.encodings = encodings
        # Indexable collection of labels; its length defines the dataset size.
        self.labels = labels

    def __len__(self):
        # The dataset has exactly one sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice the idx-th row out of every encoding field.
        features = {}
        for field_name, field_values in self.encodings.items():
            features[field_name] = field_values[idx]
        return features, self.labels[idx]

# Wrap each split's encodings and label ids in an indexable dataset.
train_dataset = BBCDataset(encodings=train_encodings, labels=y_train_enc)
test_dataset = BBCDataset(encodings=test_encodings, labels=y_test_enc)

# Batches of 16 for both splits; only the training loader shuffles its samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)


In [6]:
import time

from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from transformers import get_scheduler

start_time = time.time()

############################################
#Train model

# Single source of truth for the epoch count: the scheduler's horizon and the
# training loop must agree, otherwise the linear decay reaches zero too early
# or never finishes.
NUM_EPOCHS = 3

optimizer = AdamW(model.parameters(), lr=5e-5)

# Define scheduler: linear decay with no warmup over every optimizer step of the run
num_training_steps = len(train_loader) * NUM_EPOCHS
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Move model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

loss_fn = CrossEntropyLoss()

model.train()
for epoch in range(NUM_EPOCHS):
    for batch in train_loader:
        # Each batch is a (features dict, labels) pair; move both to the model's device.
        batch_inputs = {key: val.to(device) for key, val in batch[0].items()}
        labels = batch[1].to(device)

        # Forward pass
        outputs = model(**batch_inputs)
        loss = loss_fn(outputs.logits, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The schedule is stepped per batch because num_training_steps counts batches.
        lr_scheduler.step()


end_time = time.time()
training_time = end_time - start_time
print(f"Training completed in: {training_time:.2f} seconds")


Training completed in: 503.94 seconds


In [8]:
######################################################
#Evaluate the model
from sklearn.metrics import classification_report

model.eval()
all_preds, all_labels = [], []

# Inference only: no gradients are needed while scoring the test set.
with torch.no_grad():
    for batch in test_loader:
        batch_inputs = {key: val.to(device) for key, val in batch[0].items()}
        labels = batch[1]

        outputs = model(**batch_inputs)
        # The predicted class is the index of the largest logit in each row.
        preds = torch.argmax(outputs.logits, dim=1)

        # Collect plain Python ints; the labels are only compared on the host,
        # so they do not need a round trip through the device.
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

# Pass labels explicitly so each name in target_names stays tied to its class
# id; without it the report fails when a class is missing from the test split.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(label_map.values()),
    target_names=list(label_map.keys()),
))


# Calculate accuracy
correct_predictions = sum(int(pred == true) for pred, true in zip(all_preds, all_labels))
accuracy = correct_predictions / len(all_labels)
print(f"Model Accuracy: {accuracy:.2%}")



               precision    recall  f1-score   support

entertainment       1.00      0.98      0.99        84
     business       0.96      0.96      0.96       103
        sport       1.00      0.99      0.99        98
     politics       0.96      0.97      0.97        80
         tech       0.96      0.99      0.98        80

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445

Model Accuracy: 97.75%
