In [25]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

In [29]:
# Load dataset
print("Loading dataset...")
df = pd.read_csv("annotators.csv", encoding="latin1", on_bad_lines="skip")
df = df.dropna()
print(f"Dataset loaded successfully with {len(df)} records.")

Loading dataset...
Dataset loaded successfully with 429 records.


In [30]:
# Display column names to check for correctness
print("Columns in dataset:", df.columns)

Columns in dataset: Index(['id;age;gender;education;native_english_speaker;political_ideology;followed_news_outlets;news_check_frequency;survey_completed'], dtype='object')


In [31]:
# Attempt to rename columns if necessary
expected_columns = {"text": "text", "label": "label"}
for col in df.columns:
    if "text" in col.lower():
        expected_columns["text"] = col
    if "label" in col.lower() or "bias" in col.lower():
        expected_columns["label"] = col
        
df.rename(columns=expected_columns, inplace=True)

In [24]:
# Verify required columns exist
if "text" not in df.columns or "label" not in df.columns:
    raise ValueError("Dataset must contain 'text' and 'label' columns")

print("Dataset preview:")
print(df.head())

ValueError: Dataset must contain 'text' and 'label' columns

In [32]:
# Define Dataset class
class NewsBiasDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [33]:
# Initialize tokenizer and model
print("Initializing tokenizer and model...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
print("Model and tokenizer initialized.")

Initializing tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer initialized.


In [34]:
# Initialize tokenizer and model
print("Initializing tokenizer and model...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
print("Model and tokenizer initialized.")

Initializing tokenizer and model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer initialized.


In [37]:
# Prepare dataset and dataloader
dataset = NewsBiasDataset(df["text"].tolist(), df["label"].tolist(), tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

KeyError: 'text'

In [38]:
# Define training function
def train(model, dataloader, epochs=3, learning_rate=2e-5):
    print("Starting training...")
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.CrossEntropyLoss()
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            inputs = {key: batch[key] for key in ["input_ids", "attention_mask"]}
            labels = batch["labels"]
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
    print("Training completed.")


In [39]:
# Define evaluation function
def evaluate(model, dataloader):
    print("Evaluating model...")
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: batch[key] for key in ["input_ids", "attention_mask"]}
            labels = batch["labels"]
            outputs = model(**inputs)
            predictions = torch.argmax(F.softmax(outputs.logits, dim=-1), dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f"Model Accuracy: {accuracy:.2f}")

In [40]:
# Train and evaluate model
train(model, dataloader)
evaluate(model, dataloader)

NameError: name 'dataloader' is not defined