**In this notebook, we use the article headlines as the input and the main frame of each article as the output (label) to train a BERT model.**

In [None]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-1.1.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.1.0-py3-none-any.whl (333 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m333.2/333.2 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.34.2
    Uninstalling accelerate-0.34.2:
      Successfully uninstalled accelerate-0.34.2
Successfully installed accelerate-1.1.0


In [None]:
import pandas as pd
from google.colab import files
# Open Colab's file-upload prompt. Training_Data.csv, which a later cell reads
# from the working directory, is expected to be uploaded here.
upload = files.upload()


Saving Training_Data.csv to Training_Data.csv


In [None]:
!pip install transformers torch




In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import numpy as np


In [None]:
# The first CSV column has no header and otherwise loads as a stray
# "Unnamed: 0" column (it appears to be a saved row index), so read it as the
# DataFrame index instead.
df = pd.read_csv('Training_Data.csv', index_col=0)
df.head()

Unnamed: 0,title,first_tone_annotation,first_framing_annotation
0,A Set of Borders to Cross; For Children Seekin...,17.35,10.2
1,LAY OFF THE GUEST WORKER WE WANT,17.35,1.2
2,Would ban on renting to illegals make sense here?,19.35,5.2
3,"Immigrants Pull Weight In Economy, Study Finds",17.35,1.2
4,The Citizenship Surge,17.35,2.2


In [None]:
# Keep only the rows that actually carry a framing annotation (the label).
df = df[df['first_framing_annotation'].notna()]

In [None]:
# Discard every row whose headline text is missing.
df = df.dropna(axis='index', how='any', subset=['title'])

In [None]:
# Number of distinct label values (missing values, if any, count as one value).
df['first_framing_annotation'].nunique(dropna=False)

15

In [None]:
# Make the labels zero-based by subtracting 1 from every annotation.
# NOTE: this overwrites the column, so running the cell again shifts it again.
df['first_framing_annotation'] = df['first_framing_annotation'].sub(1)


In [None]:
# Distinct label values, in order of first appearance.
pd.unique(df['first_framing_annotation'])

array([ 9,  0,  4,  1, 10, 12, 11,  6,  3,  5,  7,  8, 13,  2, 14])

In [None]:
import pandas as pd
# Cast the label column to integers (any fractional part is truncated)
df['first_framing_annotation'] = df['first_framing_annotation'].astype(int)

# Parallel Python lists: headline texts (inputs) and frame labels (targets)
headlines, frames = df['title'].to_list(), df['first_framing_annotation'].to_list()


In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that pairs each headline with its integer frame label.

    Each item is tokenized on access and returned as a dict with the keys
    ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        headlines: Indexable sequence of headline texts.
        frames: Indexable sequence of integer class labels, aligned with
            ``headlines``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors='pt')`` whose result
            exposes ``'input_ids'`` and ``'attention_mask'`` tensors with a
            leading batch axis of size 1.
        max_length: Length every example is padded / truncated to.

    Raises:
        ValueError: If ``headlines`` and ``frames`` differ in length.
    """

    def __init__(self, headlines, frames, tokenizer, max_length):
        # Fail fast on misaligned inputs instead of raising IndexError (or
        # silently ignoring surplus labels) somewhere in the middle of training.
        if len(headlines) != len(frames):
            raise ValueError(
                f"headlines and frames must have the same length "
                f"(got {len(headlines)} and {len(frames)})"
            )
        self.headlines = headlines
        self.frames = frames
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (headline, label) examples."""
        return len(self.headlines)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors and label."""
        text = self.headlines[idx]
        label = self.frames[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # squeeze(0) removes only the batch axis added by return_tensors='pt';
            # a bare squeeze() would also drop the sequence axis when it has
            # length 1 and yield a 0-d tensor.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
# Tokenizer for the uncased base BERT checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)
# Same checkpoint with a 15-way sequence-classification head on top
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=15,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class FocalLoss(nn.Module):
    """Multi-class focal loss.

    For every sample the cross-entropy ``ce`` is down-weighted by
    ``(1 - p_t) ** gamma``, where ``p_t = exp(-ce)`` is the probability the
    model assigns to the true class. The weighted per-sample losses are then
    averaged over the batch.

    Args:
        alpha: Optional weighting factor. ``None`` applies no weighting, a
            scalar scales every sample equally, and a 1-D tensor of length
            ``num_classes`` weights each sample by the entry of its true class.
        gamma: Focusing exponent; ``0`` reduces the loss to (alpha-weighted)
            cross-entropy.
    """

    def __init__(self, alpha=None, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, labels):
        """Return the scalar focal loss.

        Args:
            logits: Float tensor of shape ``(batch, num_classes)``.
            labels: Long tensor of shape ``(batch,)`` holding class indices.
        """
        # reduction='none' keeps one loss per sample. With the default mean
        # reduction the focal factor would be computed from the batch average,
        # which no longer down-weights easy examples individually.
        ce_loss = nn.functional.cross_entropy(logits, labels, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        # alpha is optional; the default (None) must not be multiplied in.
        if self.alpha is not None:
            alpha = torch.as_tensor(self.alpha, dtype=ce_loss.dtype, device=ce_loss.device)
            if alpha.dim() > 0:
                # Per-class weights: pick the weight of each sample's true class.
                alpha = alpha[labels]
            focal_loss = alpha * focal_loss

        return focal_loss.mean()


In [None]:
# Hold out 20% of the examples for validation; the fixed seed keeps the split repeatable
train_texts, val_texts, train_labels, val_labels = train_test_split(
    headlines,
    frames,
    random_state=42,
    test_size=0.2,
)

# Datasets tokenize on access and pad / truncate every headline to 128 tokens
train_dataset = NewsDataset(
    headlines=train_texts, frames=train_labels, tokenizer=tokenizer, max_length=128
)
val_dataset = NewsDataset(
    headlines=val_texts, frames=val_labels, tokenizer=tokenizer, max_length=128
)

# Mini-batches of 4; only the training loader shuffles its examples
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=4)
val_loader = DataLoader(dataset=val_dataset, batch_size=4)


In [None]:
# Set up training arguments
# `eval_strategy` is the current name of this option; the older spelling
# `evaluation_strategy` is deprecated in recent transformers releases, and this
# notebook installs transformers without pinning a version.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",          # run evaluation at the end of every epoch
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Custom model with Focal Loss
class CustomBERTModel(nn.Module):
    """Wrap a sequence-classification model so its loss comes from ``focal_loss``.

    Args:
        model: Wrapped model; called with ``input_ids`` and ``attention_mask``
            and expected to return an object with a ``logits`` attribute.
        num_labels: Number of target classes (stored for reference only).
        focal_loss: Callable ``(logits, labels) -> loss`` used as the objective.
    """

    def __init__(self, model, num_labels, focal_loss):
        super(CustomBERTModel, self).__init__()
        self.model = model
        self.num_labels = num_labels
        self.focal_loss = focal_loss

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and attach the focal loss when labels are given.

        Args:
            input_ids: Token ids forwarded to the wrapped model.
            attention_mask: Attention mask forwarded to the wrapped model.
            labels: Optional class indices. When omitted (plain inference) no
                loss is computed and the returned ``loss`` is ``None``.

        Returns:
            SequenceClassifierOutput holding ``loss`` and ``logits``.
        """
        # Labels are deliberately not passed to the wrapped model: its built-in
        # loss is replaced by the focal loss computed below.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = self.focal_loss(logits, labels) if labels is not None else None
        return SequenceClassifierOutput(loss=loss, logits=logits)

# Focal loss with focusing exponent gamma = 2. alpha is a single scalar here,
# so it scales the loss by the same factor for every class.
focal_loss = FocalLoss(gamma=2, alpha=0.25)
custom_model = CustomBERTModel(model=model, num_labels=15, focal_loss=focal_loss)

# The Trainer receives the datasets directly and batches them itself
trainer = Trainer(
    args=training_args,
    model=custom_model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune the wrapped model
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.4285,0.309466
2,0.2211,0.305375
3,0.1175,0.323467
4,0.0514,0.348479
5,0.0235,0.381463
6,0.0114,0.398033
7,0.0072,0.42517
8,0.0039,0.434463
9,0.0025,0.439315
10,0.0025,0.44592


TrainOutput(global_step=9820, training_loss=0.08317298066713173, metrics={'train_runtime': 18686.1284, 'train_samples_per_second': 2.102, 'train_steps_per_second': 0.526, 'total_flos': 0.0, 'train_loss': 0.08317298066713173, 'epoch': 10.0})

In [None]:
# Evaluate the trained model with the Trainer (it was constructed with
# eval_dataset=val_dataset) and print the returned metrics.
results = trainer.evaluate()
print(results)


{'eval_loss': 0.4459196627140045, 'eval_runtime': 123.0177, 'eval_samples_per_second': 7.983, 'eval_steps_per_second': 2.0, 'epoch': 10.0}


In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

# Run the trained model over the validation split
predictions = trainer.predict(val_dataset)
labels = predictions.label_ids                      # ground-truth class ids
preds = predictions.predictions.argmax(axis=1)      # highest-scoring class per example

# Share of validation examples whose predicted class equals the true class
micro_accuracy = accuracy_score(y_true=labels, y_pred=preds)
print(f"Micro Accuracy: {micro_accuracy:.4f}")


Micro Accuracy: 0.4745
