In [None]:
import random
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from torch import nn
from tqdm import tqdm  # For the progress bar
from transformers import Trainer, TrainingArguments

In [None]:
from transformers import pipeline
import pandas as pd
import re
from tqdm import tqdm
import plotly.express as px
import warnings

# Silence only the warnings whose message starts with "Length of IterableDataset";
# every other warning is still displayed.
warnings.filterwarnings("ignore", message="Length of IterableDataset")

In [None]:
# Colab-only step: prompt for a file upload so the dataset CSV is available in the
# runtime's working directory. `uploaded` keeps whatever files.upload() returns.
from google.colab import files
uploaded = files.upload()


Saving articles_with_political_leaning.csv to articles_with_political_leaning.csv


In [None]:
# Load the uploaded dataset. Later cells rely on the 'Preprocessed_Text' and
# 'Political_Leaning' columns being present.
df = pd.read_csv('articles_with_political_leaning.csv')

In [None]:
# Keep only rows that have both the article text and a leaning label.
# .copy() makes the result an independent frame, so the column assignments below
# never write into a slice of the loaded data (avoids SettingWithCopyWarning).
df = df.dropna(subset=['Preprocessed_Text', 'Political_Leaning']).copy()

# Normalize label text so case and surrounding whitespace do not create
# spurious categories.
df['Political_Leaning'] = df['Political_Leaning'].str.lower().str.strip()

# Label text (lowercase) -> integer class id used by the classifier.
label_map = {
    'left-leaning': 0,
    'centrist/neutral': 1,
    'right-leaning': 2
}

# Discard rows whose label is not one of the three known classes, then attach
# the integer id. Copy again because boolean filtering also returns a new frame
# that is assigned to right after.
df = df[df['Political_Leaning'].isin(list(label_map))].copy()
df['label'] = df['Political_Leaning'].map(label_map)

# Check result
print(df.shape)
print(df['Political_Leaning'].value_counts())


(4338, 12)
Political_Leaning
right-leaning       3107
left-leaning         665
centrist/neutral     566
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

# Plain Python lists are what the tokenizer and the dataset wrapper consume later.
all_texts = df['Preprocessed_Text'].tolist()
all_labels = df['label'].tolist()

# 80/20 split with a fixed seed so it is repeatable; stratifying on the label
# column keeps the class proportions the same in both partitions.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)


In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with exactly the same options: truncation enabled,
# padding enabled, and a 512-token maximum length.
encode_options = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import torch

class NewsBiasDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per example.
        labels: Sequence of class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        example = {}
        for field, rows in self.encodings.items():
            example[field] = torch.tensor(rows[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each tokenized split together with its labels so that indexing the dataset
# yields one example as a dict of tensors (including a 'labels' entry).
train_dataset = NewsBiasDataset(train_encodings, train_labels)
val_dataset = NewsBiasDataset(val_encodings, val_labels)


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np

# Distinct class ids present in the training labels; the resulting weights are
# ordered to match this array.
label_ids = np.unique(train_labels)

# "balanced" weighting gives rarer classes larger weights. The result is stored
# as a float32 tensor so it can be handed directly to a torch loss function.
class_weights = torch.tensor(
    compute_class_weight(class_weight='balanced', classes=label_ids, y=train_labels),
    dtype=torch.float,
)


In [None]:
from transformers import BertForSequenceClassification
from torch.nn import CrossEntropyLoss

# Load pre-trained BERT with a 3-way sequence-classification head (one output per
# entry in label_map). The load message reports the classifier weights as newly
# initialized, so the model must be fine-tuned before its predictions mean anything.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Define the custom loss function using class weights
def compute_loss_with_weights(model, inputs, return_outputs=False):
    """Compute class-weighted cross-entropy for one batch.

    Args:
        model: Callable accepting the batch fields as keyword arguments and
            returning an object with a ``logits`` attribute.
        inputs: Mapping of batch fields; must contain ``"labels"``. The mapping
            is left unmodified.
        return_outputs: When true, return ``(loss, outputs)`` instead of the loss.

    Returns:
        The scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is set.

    Raises:
        KeyError: If ``inputs`` has no ``"labels"`` entry.
    """
    labels = inputs["labels"]
    # Forward a label-free copy instead of popping from the caller's dict, so the
    # batch can still be reused (e.g. for metrics) after this call.
    model_inputs = {name: value for name, value in inputs.items() if name != "labels"}
    outputs = model(**model_inputs)
    logits = outputs.logits

    # Use the logits' device: unlike ``model.device`` it also exists for plain
    # nn.Module instances and for wrapped (e.g. DataParallel) models.
    loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
    loss = loss_fct(logits, labels)
    return (loss, outputs) if return_outputs else loss


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    Args:
        *args: Forwarded unchanged to ``Trainer``.
        class_weights: Optional 1-D float tensor holding one weight per class,
            indexed by label id. ``None`` gives unweighted cross-entropy.
        **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained; called with the batch as keyword arguments.
            inputs: Batch mapping that includes a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; not used here.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs["labels"]

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits

        # Place the weights on the logits' device. This avoids relying on
        # ``model.device``, which a wrapped model (e.g. DataParallel) does not expose.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


# Training configuration. No evaluation strategy is set here, so validation only
# happens when trainer.evaluate() / trainer.predict() is called explicitly.
training_args = TrainingArguments(
    output_dir='./results',               # Output directory
    num_train_epochs=3,                   # Number of epochs
    per_device_train_batch_size=8,        # Batch size per device during training
    per_device_eval_batch_size=8,         # Batch size per device during evaluation
    save_strategy="epoch",                # Save the model every epoch
    logging_dir='./logs',                 # Directory for logs
    logging_steps=10,                     # Log every 10 steps
    weight_decay=0.01,                    # Weight decay for regularization
    logging_first_step=True,              # Log the first step
)

# Initialize the custom trainer. The weights come from the class frequencies of
# the training labels (computed earlier) instead of hand-picked placeholder
# values, so both minority classes are up-weighted relative to the majority class.
trainer = CustomTrainer(
    model=model,                          # The model to train
    args=training_args,                   # Training arguments
    train_dataset=train_dataset,          # Training dataset
    eval_dataset=val_dataset,             # Validation dataset
    class_weights=class_weights,          # Per-class loss weights
)

# Start training
trainer.train()


In [None]:
# NOTE: the class is defined in this cell so the cell can be run without any
# earlier training cell having been executed.
class CustomTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    Args:
        *args: Forwarded unchanged to ``Trainer``.
        class_weights: Optional 1-D float tensor holding one weight per class,
            indexed by label id. ``None`` gives unweighted cross-entropy.
        **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained; called with the batch as keyword arguments.
            inputs: Batch mapping that includes a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; not used here.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs["labels"]

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits

        # Place the weights on the logits' device. This avoids relying on
        # ``model.device``, which a wrapped model (e.g. DataParallel) does not expose.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


# Training configuration for the 10-epoch run. No evaluation strategy is set, so
# validation only happens when trainer.evaluate() / trainer.predict() is called.
training_args = TrainingArguments(
    output_dir='./results',               # Output directory
    num_train_epochs=10,                  # Number of epochs
    per_device_train_batch_size=8,        # Batch size per device during training
    per_device_eval_batch_size=8,         # Batch size per device during evaluation
    save_strategy="epoch",                # Save the model every epoch
    logging_dir='./logs',                 # Directory for logs
    logging_steps=10,                     # Log every 10 steps
    weight_decay=0.01,                    # Weight decay for regularization
    logging_first_step=True,              # Log the first step
)

# Initialize the custom trainer. `model` is the same object loaded earlier, so if
# a previous training cell already ran, this run continues from those weights
# rather than from the pre-trained checkpoint. The loss weights come from the
# class frequencies of the training labels instead of hand-picked placeholders.
trainer = CustomTrainer(
    model=model,                          # The model to train
    args=training_args,                   # Training arguments
    train_dataset=train_dataset,          # Training dataset
    eval_dataset=val_dataset,             # Validation dataset
    class_weights=class_weights,          # Per-class loss weights
)

# Start training
trainer.train()


Step,Training Loss
1,0.947
10,0.995
20,1.0257
30,0.8832
40,1.0342
50,0.9938
60,0.9168
70,0.9487
80,0.9075
90,0.7276


Step,Training Loss
1,0.947
10,0.995
20,1.0257
30,0.8832
40,1.0342
50,0.9938
60,0.9168
70,0.9487
80,0.9075
90,0.7276


TrainOutput(global_step=4340, training_loss=0.8654082565263669, metrics={'train_runtime': 3511.136, 'train_samples_per_second': 9.883, 'train_steps_per_second': 1.236, 'total_flos': 9130035595161600.0, 'train_loss': 0.8654082565263669, 'epoch': 10.0})

In [None]:
# Evaluate on the validation set attached to the trainer. The trainer was built
# without a compute_metrics function, so the result contains only the loss and
# runtime/throughput figures.
trainer.evaluate()


{'eval_loss': 0.7059544324874878,
 'eval_runtime': 25.2338,
 'eval_samples_per_second': 34.398,
 'eval_steps_per_second': 4.32,
 'epoch': 10.0}

In [None]:
# Write the fine-tuned model and the tokenizer files into the same ./model
# directory so they can be shipped and loaded together.
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [None]:
# Zip the saved ./model directory into model.zip and hand it to the browser for
# download (Colab-only helper).
import shutil
from google.colab import files
shutil.make_archive("model", 'zip', "./model")
files.download("model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.metrics import accuracy_score

# Run the validation set through the model once. The prediction output carries
# both the logits and the matching gold labels, so the labels are read from it
# instead of re-iterating the dataset and tensorizing every example again.
prediction_output = trainer.predict(val_dataset)
preds = prediction_output.predictions.argmax(axis=-1)
labels = prediction_output.label_ids.tolist()

# NOTE(review): the label counts printed after cleaning show a heavily imbalanced
# dataset, so accuracy alone can look good for a model that mostly predicts the
# majority class. Consider per-class precision/recall as well.
accuracy = accuracy_score(labels, preds)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.7765
