### Install Dependencies

In [2]:
%pip install ftfy huggingface_hub scikit-learn transformers datasets optuna accelerate==0.27.2 --quiet

Note: you may need to restart the kernel to use updated packages.


## Import Libraries

In [11]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
import random
import numpy as np
import torch

## Load the data

In [None]:
url = "https://raw.githubusercontent.com/VridhiJ/CIS519/refs/heads/main/Dataset/news_urls.csv"

# Load the dataset
df = pd.read_csv(url)

# Display the first few rows to verify the data
df.head()

Unnamed: 0,url
0,https://www.foxnews.com/lifestyle/jack-carrs-e...
1,https://www.foxnews.com/entertainment/bruce-wi...
2,https://www.foxnews.com/politics/blinken-meets...
3,https://www.foxnews.com/entertainment/emily-bl...
4,https://www.foxnews.com/media/the-view-co-host...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805 entries, 0 to 3804
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     3805 non-null   object
dtypes: object(1)
memory usage: 29.9+ KB


## Headline Collection Method

We collected news headlines by scraping multiple news websites using BeautifulSoup libraries. The scraping process involved:

1. Fetching Webpages:

  - Sending HTTP requests to news article URLs.

  - Using appropriate headers to mimic a real browser and avoid blocking.
    - User-Agent: Identifies the client making request. Helps avoid bot detection by mimicking real browser behavior.
    - Accept-Charset:  Specifies the character encodings that the client can process. Helps ensure proper text rendering.
    - Accept: Defines the type of content the client expects from the server.
    - Accept-Language: Specifies the preferred language for the response content. Helps receive content in a readable format when a website supports multiple languages.
    - referer: Indicates the URL of the page that made the request.
    

2. Extracting Headlines:

  - Parsing the webpage content with BeautifulSoup.

  - Identifying and extracting headlines using H1 tags and class attributes related to headlines.

  - Handling variations in website structures dynamically.

3. Error Handling & Optimization:

  - Implementing error handling to skip unavailable pages.

4. Storing Data:

  - Storing extracted headlines in a structured pandas DataFrame.

 - Saving the data in CSV format for further processing.

This method ensures efficient and scalable data collection while minimizing disruptions caused by website restrictions.

## Data scraping (don't rerun)

In [None]:
%pip install ftfy --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import ftfy
# Helper function to get headline from a single URL
def get_article_headline(url):
  try:
    user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    ]

    session = requests.Session()

    headers = {
    'user-agent': random.choice(user_agents),
    "Accept-Charset": "utf-8",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9",
    "referer": "https://www.google.com/",
    }
    time.sleep(2)

    response = requests.get(url, headers = headers)

    if response.status_code != 200:
      print(f"Warning: Failed to load page {url} (Status Code: {response.status_code})")
      return None  # Don't stop execution, just return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # To find headline of various types of classes
    headline = soup.find("h1", class_=lambda c: c and "headline" in c)

    if headline:
      headline = ftfy.fix_text(headline.get_text())  # Fix any encoding issues
      return headline.strip()  # Return the cleaned headline
    else:
      return None  # Return None if no headline is found
  except Exception as e:
    print(f"Error processing {url}: {e}")
    return None  # Return None in case of an error

In [None]:
# Create an empty list to store the headlines
headlines = []

# Loop through the URLs in your dataframe
for url in df['url']:
    headline = get_article_headline(url)
    headlines.append(headline)

# Add the scraped headlines to your dataframe
df['headline'] = headlines

# Show the first few rows with the scraped headlines
df.head()

In [None]:
df.to_csv("scraped_headlines.csv", index=False)

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import create_repo

# Create a repository on Hugging Face Hub
repo_name = 'scraped-headlines'
create_repo(repo_name, private=True)

In [None]:
from huggingface_hub import upload_file

upload_file(
    path_or_fileobj='scraped_headlines.csv',
    path_in_repo='scraped_headlines_v4.csv',
    repo_id= 'VridhiJain/scraped-headlines'
)

Do not need to run the code above. Takes too long.

## Import scraped data

In [3]:
import pandas as pd
from huggingface_hub import hf_hub_download

repo_id = "VridhiJain/scraped-headlines"  # repo name
filename = "scraped_headlines_v4.csv"  # file name

# Download the file
file_path = hf_hub_download(repo_id=repo_id, filename=filename)

# Load into a DataFrame
df = pd.read_csv(file_path)

df.head()

scraped_headlines_v4.csv:   0%|          | 0.00/689k [00:00<?, ?B/s]

Unnamed: 0,url,headline
0,https://www.foxnews.com/lifestyle/jack-carrs-e...,Jack Carr recalls Gen. Eisenhower's D-Day memo...
1,https://www.foxnews.com/entertainment/bruce-wi...,"Bruce Willis, Demi Moore avoided doing one thi..."
2,https://www.foxnews.com/politics/blinken-meets...,
3,https://www.foxnews.com/entertainment/emily-bl...,Emily Blunt says her 'toes curl' when people t...
4,https://www.foxnews.com/media/the-view-co-host...,"'The View' co-host, CNN commentator Ana Navarr..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805 entries, 0 to 3804
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   url       3805 non-null   object
 1   headline  3352 non-null   object
dtypes: object(2)
memory usage: 59.6+ KB


## Data cleaning

- drop rows with missing headlines.

In [6]:
# Check for missing values in the dataset
print(df.isnull().sum())

# Drop any rows where the headline is missing
df = df.dropna(subset=['headline']).drop_duplicates(subset=['headline'])

# Reset index after dropping rows
df = df.reset_index(drop=True)

url         0
headline    0
dtype: int64


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3336 entries, 0 to 3335
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   url       3336 non-null   object
 1   headline  3336 non-null   object
dtypes: object(2)
memory usage: 52.3+ KB


In [8]:
df['url'].str.contains('foxnews').value_counts()

url
False    1779
True     1557
Name: count, dtype: int64

Fox News Headlines: 1779

NBC News Headlines: 1557

**need to have metric charts; also, if accuracy isn't high enough (85% for roberta, maybe add more epochs**

# Baseline Model (TF-IDF + Log Regression)

In [12]:
# For reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed()

In [13]:
%pip install scikit-learn huggingface_hub --quiet

Note: you may need to restart the kernel to use updated packages.


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the preprocessed headline data from Hugging Face
from huggingface_hub import hf_hub_download
csv_path = hf_hub_download(repo_id="VridhiJain/scraped-headlines", filename="scraped_headlines_v4.csv")
df = pd.read_csv(csv_path)

# Drop rows with missing headlines
df = df.dropna(subset=['headline'])

# Label: 1 for FoxNews, 0 for NBC
df['label'] = df['url'].apply(lambda x: 1 if "foxnews" in x else 0)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['label'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression
baseline_model = LogisticRegression(max_iter=100)
baseline_model.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = baseline_model.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6960
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.63      0.68       344
           1       0.66      0.76      0.71       327

    accuracy                           0.70       671
   macro avg       0.70      0.70      0.70       671
weighted avg       0.70      0.70      0.69       671



# Bert-based Classifier

In [2]:
%pip install transformers datasets accelerate==0.27.2 --quiet

Note: you may need to restart the kernel to use updated packages.


In [16]:
import os
import torch
from sklearn.metrics import precision_recall_fscore_support
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments)

In [17]:
df['label'] = df['url'].apply(lambda x: 1 if "foxnews" in x.lower() else 0)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

train_dataset = Dataset.from_pandas(train_df[['headline', 'label']])
test_dataset = Dataset.from_pandas(test_df[['headline', 'label']])

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["headline"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Load model
bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"
)

# Define evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = torch.argmax(torch.tensor(pred.predictions), axis=1).numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return{"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate()
print("BERT Evaluation Results:")
print(results)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2681 [00:00<?, ? examples/s]

Map:   0%|          | 0/671 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.445437,0.81073,0.780656,0.846442,0.724359
2,No log,0.420406,0.825633,0.797927,0.865169,0.740385


In [8]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
from huggingface_hub import HfFolder

HfFolder.save_token() 

In [16]:
model_path = "bert_vanilla"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('bert_vanilla/tokenizer_config.json',
 'bert_vanilla/special_tokens_map.json',
 'bert_vanilla/vocab.txt',
 'bert_vanilla/added_tokens.json',
 'bert_vanilla/tokenizer.json')

In [17]:
from huggingface_hub import HfApi

api = HfApi()

# Replace with your desired repo name
repo_name = "bert_vanilla"

# This creates a repo under your namespace (username)
api.create_repo(repo_id=repo_name, private=False, exist_ok=True)

RepoUrl('https://huggingface.co/VridhiJain/bert_vanilla', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/bert_vanilla')

In [18]:
# Push model and tokenizer
trainer.model.push_to_hub("bert_vanilla")
tokenizer.push_to_hub("bert_vanilla")


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/VridhiJain/bert_vanilla/commit/46efe04e666afd96b543228f219e03c4a64a3ff3', commit_message='Upload tokenizer', commit_description='', oid='46efe04e666afd96b543228f219e03c4a64a3ff3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/VridhiJain/bert_vanilla', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/bert_vanilla'), pr_revision=None, pr_num=None)

# RoBERTa-based Classifier

In [12]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer_roberta = RobertaTokenizer.from_pretrained("roberta-base")
train_dataset = Dataset.from_pandas(train_df[["headline", "label"]])
test_dataset = Dataset.from_pandas(test_df[["headline", "label"]])

def tokenize_roberta(batch):
    return tokenizer_roberta(batch["headline"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize_roberta, batched=True)
test_dataset = test_dataset.map(tokenize_roberta, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

training_args_roberta = TrainingArguments(
    output_dir="./roberta_results",
    eval_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"
)

trainer_roberta = Trainer(
    model=roberta_model,
    args=training_args_roberta,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer_roberta.train()
roberta_results = trainer_roberta.evaluate()
print("RoBERTa Evaluation Results:", roberta_results)

trainer_roberta.save_model("./roberta_model")

Map:   0%|          | 0/2648 [00:00<?, ? examples/s]

Map:   0%|          | 0/663 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5706,0.420179,0.812971,0.807453,0.766962,0.852459
2,0.327,0.399927,0.844646,0.83252,0.825806,0.839344
3,0.2041,0.457025,0.838612,0.811287,0.877863,0.754098


RoBERTa Evaluation Results: {'eval_loss': 0.4570247232913971, 'eval_accuracy': 0.8386123680241327, 'eval_f1': 0.8112874779541446, 'eval_precision': 0.8778625954198473, 'eval_recall': 0.7540983606557377, 'eval_runtime': 6.9864, 'eval_samples_per_second': 94.899, 'eval_steps_per_second': 6.012, 'epoch': 3.0}


In [10]:
model_path = "roberta_vanilla"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('roberta_vanilla/tokenizer_config.json',
 'roberta_vanilla/special_tokens_map.json',
 'roberta_vanilla/vocab.txt',
 'roberta_vanilla/added_tokens.json',
 'roberta_vanilla/tokenizer.json')

In [12]:
from huggingface_hub import HfApi
api = HfApi()

# Replace with your desired repo name
repo_name = "roberta_vanilla"

# This creates a repo under your namespace (username)
api.create_repo(repo_id=repo_name, private=False, exist_ok=True)

RepoUrl('https://huggingface.co/VridhiJain/roberta_vanilla', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/roberta_vanilla')

In [13]:
# Push model and tokenizer
trainer.model.push_to_hub("roberta_vanilla")
tokenizer.push_to_hub("roberta_vanilla")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/VridhiJain/roberta_vanilla/commit/08c44f399c43d358b680858804a534e1319d3aee', commit_message='Upload tokenizer', commit_description='', oid='08c44f399c43d358b680858804a534e1319d3aee', pr_url=None, repo_url=RepoUrl('https://huggingface.co/VridhiJain/roberta_vanilla', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/roberta_vanilla'), pr_revision=None, pr_num=None)

# Bert-based Classifier - Hyperparameter Tuning (don't rerun)

In [14]:
from sklearn.model_selection import train_test_split

df['label'] = df['url'].apply(lambda x: 1 if "foxnews" in x.lower() else 0)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

train_dataset = Dataset.from_pandas(train_df[['headline', 'label']])
test_dataset = Dataset.from_pandas(test_df[['headline', 'label']])

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["headline"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/2648 [00:00<?, ? examples/s]

Map:   0%|          | 0/663 [00:00<?, ? examples/s]

In [15]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = torch.argmax(torch.tensor(pred.predictions), axis=1).numpy()
  precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
  acc = accuracy_score(labels, preds)
  return{"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

## Grid Search

In [None]:
from transformers import (AutoModelForSequenceClassification, Trainer, TrainingArguments)
import numpy as np

# define search space
learning_rates = [1e-5, 2e-5, 3e-5, 5e-5]
epochs = [3, 4, 5]
weight_decays = [0.01, 0.001]

best_f1 = 0
best_config = {}

for lr in learning_rates:
  for num_epochs in epochs:
    for wd in weight_decays:
      print(f"\nTraining with lr={lr}, epochs={num_epochs}, weight_decay={wd}")

      training_args = TrainingArguments(
        output_dir=f"./bert_tuned_lr{lr}_ep{num_epochs}_wd{wd}",
        eval_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=wd,
        report_to="none"
      )

      model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

      trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
      )

      trainer.train()
      eval_results = trainer.evaluate()
      print("F1 score:", eval_results['eval_f1'])

      if eval_results['eval_f1'] > best_f1:
        best_f1 = eval_results['eval_f1']
        best_config = {
            "learning_rate": lr,
            "num_epochs": num_epochs,
            "weight_decay": wd,
            "eval_results": eval_results
        }

print("\nBest configuration:")
print(best_config)


Training with lr=1e-05, epochs=3, weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.530149,0.740573,0.733746,0.695015,0.777049
2,No log,0.474489,0.773756,0.744898,0.773852,0.718033
3,No log,0.461793,0.791855,0.762069,0.803636,0.72459


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1 score: 0.7620689655172413

Training with lr=1e-05, epochs=3, weight_decay=0.001


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.477849,0.782805,0.748252,0.801498,0.701639
2,No log,0.4705,0.785822,0.743682,0.827309,0.67541
3,No log,0.449494,0.809955,0.784247,0.820789,0.75082


F1 score: 0.7842465753424658

Training with lr=1e-05, epochs=4, weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.465809,0.791855,0.762069,0.803636,0.72459
2,No log,0.454435,0.794872,0.758007,0.828794,0.698361
3,No log,0.465762,0.79638,0.755877,0.842742,0.685246
4,0.431600,0.457268,0.809955,0.776596,0.84556,0.718033


F1 score: 0.776595744680851

Training with lr=1e-05, epochs=4, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.526315,0.737557,0.67658,0.781116,0.596721
2,No log,0.448838,0.794872,0.771044,0.792388,0.75082
3,No log,0.439395,0.806938,0.783784,0.808362,0.760656
4,0.476400,0.442901,0.81448,0.788296,0.82971,0.75082


F1 score: 0.7882960413080895

Training with lr=1e-05, epochs=5, weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.516714,0.746606,0.698925,0.770751,0.639344
2,No log,0.471526,0.784314,0.7712,0.753125,0.790164
3,No log,0.436923,0.817496,0.791738,0.833333,0.754098
4,0.469500,0.452491,0.817496,0.794567,0.823944,0.767213
5,0.469500,0.466154,0.81448,0.791171,0.820423,0.763934


F1 score: 0.7911714770797963

Training with lr=1e-05, epochs=5, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.522296,0.757164,0.734761,0.738411,0.731148
2,No log,0.474571,0.781297,0.768,0.75,0.786885
3,No log,0.45458,0.811463,0.769797,0.878151,0.685246
4,0.469700,0.449717,0.822021,0.795139,0.845018,0.75082
5,0.469700,0.447496,0.829563,0.811352,0.826531,0.796721


F1 score: 0.8113522537562604

Training with lr=2e-05, epochs=3, weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.458349,0.790347,0.772504,0.771242,0.77377
2,No log,0.429312,0.808446,0.798092,0.774691,0.822951
3,No log,0.439898,0.820513,0.789381,0.857692,0.731148


F1 score: 0.7893805309734513

Training with lr=2e-05, epochs=3, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.446684,0.806938,0.777778,0.826568,0.734426
2,No log,0.425809,0.831071,0.810811,0.836237,0.786885
3,No log,0.462641,0.837104,0.814433,0.855596,0.777049


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1 score: 0.8144329896907216

Training with lr=2e-05, epochs=4, weight_decay=0.01


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.438124,0.808446,0.777583,0.834586,0.727869
2,No log,0.40779,0.828054,0.816129,0.803175,0.829508
3,No log,0.501908,0.832579,0.80354,0.873077,0.744262
4,0.361000,0.551832,0.825038,0.797909,0.851301,0.75082


F1 score: 0.7979094076655052

Training with lr=2e-05, epochs=4, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.440777,0.791855,0.768456,0.786942,0.75082
2,No log,0.431811,0.826546,0.805415,0.832168,0.780328
3,No log,0.52877,0.811463,0.78185,0.835821,0.734426
4,0.363100,0.637383,0.819005,0.793103,0.836364,0.754098


F1 score: 0.7931034482758621

Training with lr=2e-05, epochs=5, weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.440504,0.79638,0.775374,0.787162,0.763934
2,No log,0.421668,0.817496,0.800659,0.804636,0.796721
3,No log,0.549829,0.820513,0.797274,0.829787,0.767213
4,0.354400,0.757735,0.825038,0.800687,0.841155,0.763934
5,0.354400,0.815299,0.826546,0.80737,0.825342,0.790164


F1 score: 0.8073701842546064

Training with lr=2e-05, epochs=5, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.438154,0.799397,0.791209,0.759036,0.82623
2,No log,0.436764,0.820513,0.81493,0.775148,0.859016
3,No log,0.58249,0.817496,0.77634,0.889831,0.688525
4,0.371800,0.679864,0.829563,0.804836,0.850365,0.763934
5,0.371800,0.761189,0.822021,0.793706,0.850187,0.744262


F1 score: 0.7937062937062938

Training with lr=3e-05, epochs=3, weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.410588,0.811463,0.784111,0.828467,0.744262
2,No log,0.438658,0.828054,0.797153,0.871595,0.734426
3,No log,0.565008,0.826546,0.793537,0.876984,0.72459


F1 score: 0.7935368043087971

Training with lr=3e-05, epochs=3, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.413913,0.815988,0.805732,0.783282,0.829508
2,No log,0.456278,0.825038,0.797909,0.851301,0.75082
3,No log,0.567989,0.828054,0.801394,0.855019,0.754098


F1 score: 0.8013937282229965

Training with lr=3e-05, epochs=4, weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.417676,0.817496,0.810047,0.777108,0.845902
2,No log,0.40266,0.831071,0.818182,0.810289,0.82623
3,No log,0.628311,0.831071,0.800712,0.875486,0.737705
4,0.307100,0.70262,0.844646,0.82087,0.874074,0.77377


F1 score: 0.8208695652173913

Training with lr=3e-05, epochs=4, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.417169,0.815988,0.797342,0.808081,0.786885
2,No log,0.471792,0.81448,0.81106,0.763006,0.865574
3,No log,0.663276,0.826546,0.795737,0.868217,0.734426
4,0.316000,0.73443,0.822021,0.797251,0.837545,0.760656


F1 score: 0.7972508591065293

Training with lr=3e-05, epochs=5, weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.413925,0.820513,0.801997,0.814189,0.790164
2,No log,0.420093,0.828054,0.810631,0.821549,0.8
3,No log,0.738285,0.825038,0.785185,0.902128,0.695082
4,0.309000,0.862537,0.825038,0.795775,0.859316,0.740984
5,0.309000,0.951828,0.820513,0.797274,0.829787,0.767213


F1 score: 0.797274275979557

Training with lr=3e-05, epochs=5, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.399572,0.822021,0.8,0.82807,0.77377
2,No log,0.422495,0.831071,0.821656,0.798762,0.845902
3,No log,0.759811,0.811463,0.765478,0.894737,0.668852
4,0.314000,0.88366,0.834087,0.802158,0.888446,0.731148
5,0.314000,0.888654,0.832579,0.809605,0.848921,0.77377


F1 score: 0.8096054888507719

Training with lr=5e-05, epochs=3, weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.399813,0.819005,0.785714,0.862745,0.721311
2,No log,0.533225,0.822021,0.802013,0.821306,0.783607
3,No log,0.765737,0.81448,0.786087,0.837037,0.740984


F1 score: 0.7860869565217391

Training with lr=5e-05, epochs=3, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.404507,0.828054,0.817891,0.797508,0.839344
2,No log,0.491427,0.828054,0.814332,0.809061,0.819672
3,No log,0.726552,0.826546,0.800693,0.849265,0.757377


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1 score: 0.8006932409012132

Training with lr=5e-05, epochs=4, weight_decay=0.01


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.399962,0.831071,0.815789,0.818482,0.813115
2,No log,0.397817,0.823529,0.81399,0.790123,0.839344
3,No log,0.753987,0.819005,0.802632,0.805281,0.8
4,0.300900,0.854951,0.81448,0.791878,0.818182,0.767213


F1 score: 0.7918781725888325

Training with lr=5e-05, epochs=4, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.434113,0.812971,0.789831,0.817544,0.763934
2,No log,0.545909,0.834087,0.816667,0.830508,0.803279
3,No log,0.748522,0.828054,0.810631,0.821549,0.8
4,0.289800,0.861147,0.825038,0.8,0.843636,0.760656


F1 score: 0.8

Training with lr=5e-05, epochs=5, weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.405308,0.81448,0.782301,0.85,0.72459
2,No log,0.548331,0.826546,0.802065,0.844203,0.763934
3,No log,0.768244,0.828054,0.8125,0.815182,0.809836
4,0.275900,0.903819,0.832579,0.824645,0.795732,0.855738
5,0.275900,0.908338,0.846154,0.829431,0.846416,0.813115


F1 score: 0.8294314381270903

Training with lr=5e-05, epochs=5, weight_decay=0.001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.43536,0.811463,0.796748,0.790323,0.803279
2,No log,0.507169,0.819005,0.797297,0.8223,0.77377
3,No log,0.741862,0.829563,0.808799,0.835664,0.783607
4,0.323800,0.90194,0.841629,0.820513,0.857143,0.786885
5,0.323800,0.946775,0.838612,0.817094,0.853571,0.783607


F1 score: 0.8170940170940171

Best configuration:
{'learning_rate': 5e-05, 'num_epochs': 5, 'weight_decay': 0.01, 'eval_results': {'eval_loss': 0.9083382487297058, 'eval_accuracy': 0.8461538461538461, 'eval_f1': 0.8294314381270903, 'eval_precision': 0.8464163822525598, 'eval_recall': 0.8131147540983606, 'eval_runtime': 4.4751, 'eval_samples_per_second': 148.153, 'eval_steps_per_second': 9.385, 'epoch': 5.0}}


### Bert-based Classifier retrained w/ best hyperparameters (from grid search)

Best configuration (best F1):
- Learning rate: 5e-05 \\
- Num epochs: 5 \\
- Weight decay: 0.01

In [16]:
# Load model
bert_gridsearch_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_results_best_gridsearch",
    eval_strategy="epoch",
    learning_rate=5e-05,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="none"
)

trainer = Trainer(
    model=bert_gridsearch_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate()
print("BERT Evaluation Results:")
print(results)

trainer.save_model("./bert_gridsearch_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.440608,0.802413,0.801815,0.744382,0.868852
2,No log,0.467455,0.81448,0.798691,0.797386,0.8
3,No log,0.79663,0.823529,0.786106,0.88843,0.704918
4,0.299000,0.920148,0.841629,0.820513,0.857143,0.786885
5,0.299000,1.003066,0.831071,0.801418,0.872587,0.740984


BERT Evaluation Results:
{'eval_loss': 1.0030657052993774, 'eval_accuracy': 0.8310708898944194, 'eval_f1': 0.8014184397163121, 'eval_precision': 0.8725868725868726, 'eval_recall': 0.740983606557377, 'eval_runtime': 7.1596, 'eval_samples_per_second': 92.603, 'eval_steps_per_second': 5.866, 'epoch': 5.0}


In [17]:
model_path = "bert_gridsearch"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

api = HfApi()

# Replace with your desired repo name
repo_name = "bert_gridsearch"

# This creates a repo under your namespace (username)
api.create_repo(repo_id=repo_name, private=False, exist_ok=True)

# Push model and tokenizer
trainer.model.push_to_hub("bert_gridsearch")
tokenizer.push_to_hub("bert_gridsearch")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/VridhiJain/bert_gridsearch/commit/43a62013a9f756d6b747baf4e248584e775ccb20', commit_message='Upload tokenizer', commit_description='', oid='43a62013a9f756d6b747baf4e248584e775ccb20', pr_url=None, repo_url=RepoUrl('https://huggingface.co/VridhiJain/bert_gridsearch', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/bert_gridsearch'), pr_revision=None, pr_num=None)

## Bayesian Optimization (using optuna)

In [19]:
%pip install optuna --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [20]:
import optuna

In [21]:
def model_init():
  return AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

def objective(trial):
  # hyperparameter search space
  learning_rate = trial.suggest_float("learning_rate", 1e-6, 5e-5, log=True)
  weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
  batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
  num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)

  args = TrainingArguments(
    output_dir=f"./bert_bayesian_tuned_lr{learning_rate}_ep{num_train_epochs}_wd{weight_decay}",
    eval_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    report_to="none"
  )

  trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
  )

  trainer.train()
  eval_result = trainer.evaluate()
  return eval_result["eval_f1"]

In [None]:
# run optimization loop
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=15)

[I 2025-04-17 02:07:02,951] A new study created in memory with name: no-name-01e54766-35de-4974-8915-9049b973ae16
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.519513,0.745098,0.709122,0.746377,0.67541
2,No log,0.487948,0.763198,0.721137,0.786822,0.665574


[I 2025-04-17 02:09:15,667] Trial 0 finished with value: 0.7211367673179396 and parameters: {'learning_rate': 9.233775118225436e-06, 'weight_decay': 0.00953859115750858, 'batch_size': 16, 'num_train_epochs': 2}. Best is trial 0 with value: 0.7211367673179396.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.554282,0.737557,0.704082,0.731449,0.678689
2,No log,0.475494,0.778281,0.752941,0.772414,0.734426
3,No log,0.468597,0.767722,0.725979,0.793774,0.668852
4,0.521600,0.460345,0.785822,0.760135,0.783972,0.737705
5,0.521600,0.463599,0.782805,0.755102,0.784452,0.727869


[I 2025-04-17 02:14:39,249] Trial 1 finished with value: 0.7551020408163265 and parameters: {'learning_rate': 5.3632495466033565e-06, 'weight_decay': 0.13155005194235506, 'batch_size': 16, 'num_train_epochs': 5}. Best is trial 1 with value: 0.7551020408163265.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.506748,0.773756,0.740484,0.783883,0.701639
2,No log,0.444823,0.793363,0.768971,0.791667,0.747541
3,No log,0.476681,0.785822,0.734082,0.855895,0.642623
4,0.466600,0.467514,0.811463,0.789916,0.810345,0.770492
5,0.466600,0.477549,0.808446,0.782161,0.820144,0.747541


[I 2025-04-17 02:20:00,399] Trial 2 finished with value: 0.7821612349914236 and parameters: {'learning_rate': 8.268633724650536e-06, 'weight_decay': 0.23589383073593467, 'batch_size': 16, 'num_train_epochs': 5}. Best is trial 2 with value: 0.7821612349914236.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.541233,0.728507,0.60177,0.92517,0.445902
2,0.465000,0.554543,0.844646,0.83087,0.832237,0.829508
3,0.465000,0.785515,0.850679,0.836364,0.843333,0.829508
4,0.149800,0.894208,0.846154,0.828859,0.848797,0.809836


[I 2025-04-17 02:25:14,309] Trial 3 finished with value: 0.8288590604026845 and parameters: {'learning_rate': 4.733978326237281e-05, 'weight_decay': 0.13720158843680744, 'batch_size': 8, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.669759,0.618401,0.349614,0.809524,0.222951
2,No log,0.653654,0.656109,0.46729,0.813008,0.327869


[I 2025-04-17 02:27:35,741] Trial 4 finished with value: 0.4672897196261682 and parameters: {'learning_rate': 2.3087079307124286e-06, 'weight_decay': 0.04312698926727332, 'batch_size': 16, 'num_train_epochs': 2}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.628808,0.686275,0.578947,0.756614,0.468852
2,No log,0.564449,0.736048,0.700855,0.732143,0.672131
3,No log,0.545105,0.739065,0.69808,0.746269,0.655738


[I 2025-04-17 02:30:56,048] Trial 5 finished with value: 0.6980802792321117 and parameters: {'learning_rate': 3.667204183797974e-06, 'weight_decay': 0.1905418669824964, 'batch_size': 16, 'num_train_epochs': 3}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.411414,0.815988,0.791096,0.827957,0.757377
2,No log,0.398,0.834087,0.814815,0.83737,0.793443
3,No log,0.549215,0.823529,0.789946,0.873016,0.721311


[I 2025-04-17 02:33:52,279] Trial 6 finished with value: 0.7899461400359067 and parameters: {'learning_rate': 4.472168243319721e-05, 'weight_decay': 0.1777656790574175, 'batch_size': 32, 'num_train_epochs': 3}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.528731,0.740573,0.671756,0.803653,0.577049
2,0.573700,0.486468,0.775264,0.737213,0.79771,0.685246


[I 2025-04-17 02:36:34,240] Trial 7 finished with value: 0.7372134038800705 and parameters: {'learning_rate': 7.625894234893907e-06, 'weight_decay': 0.04647826257975827, 'batch_size': 8, 'num_train_epochs': 2}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.555106,0.731523,0.695205,0.727599,0.665574
2,No log,0.48107,0.770739,0.736111,0.782288,0.695082
3,No log,0.467197,0.781297,0.743363,0.807692,0.688525
4,0.524200,0.461461,0.779789,0.746528,0.793358,0.704918


[I 2025-04-17 02:41:03,618] Trial 8 finished with value: 0.7465277777777778 and parameters: {'learning_rate': 5.833301270935555e-06, 'weight_decay': 0.2693765179850381, 'batch_size': 16, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.66855,0.604827,0.379147,0.683761,0.262295
2,No log,0.622181,0.68175,0.630473,0.676692,0.590164
3,No log,0.582254,0.698341,0.658703,0.686833,0.632787
4,No log,0.56101,0.713424,0.682274,0.696246,0.668852
5,No log,0.553269,0.72549,0.693603,0.712803,0.67541


[I 2025-04-17 02:45:49,830] Trial 9 finished with value: 0.6936026936026936 and parameters: {'learning_rate': 4.082487305718032e-06, 'weight_decay': 0.14721386023663655, 'batch_size': 32, 'num_train_epochs': 5}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.484692,0.782805,0.71875,0.888889,0.603279
2,0.462800,0.699817,0.828054,0.798587,0.8659,0.740984
3,0.462800,0.853045,0.823529,0.809135,0.805195,0.813115
4,0.159300,1.040766,0.825038,0.796491,0.856604,0.744262


[I 2025-04-17 02:51:07,152] Trial 10 finished with value: 0.7964912280701755 and parameters: {'learning_rate': 4.47406592791207e-05, 'weight_decay': 0.09147057073010023, 'batch_size': 8, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.566684,0.763198,0.668076,0.940476,0.518033
2,0.420800,0.790276,0.843137,0.831169,0.823151,0.839344
3,0.420800,1.015154,0.831071,0.803509,0.864151,0.75082
4,0.128700,1.062271,0.838612,0.819562,0.84375,0.796721


[I 2025-04-17 02:56:27,005] Trial 11 finished with value: 0.8195615514333895 and parameters: {'learning_rate': 4.769680622006179e-05, 'weight_decay': 0.0921683926840284, 'batch_size': 8, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.454268,0.794872,0.734375,0.908213,0.616393
2,0.440800,0.607697,0.820513,0.805873,0.801948,0.809836
3,0.440800,0.864472,0.826546,0.806071,0.829861,0.783607
4,0.168900,0.983888,0.812971,0.780919,0.846743,0.72459


[I 2025-04-17 03:01:51,320] Trial 12 finished with value: 0.7809187279151943 and parameters: {'learning_rate': 2.2145142777398132e-05, 'weight_decay': 0.09047522704707901, 'batch_size': 8, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.465764,0.791855,0.731518,0.899522,0.616393
2,0.441400,0.606604,0.835596,0.818636,0.831081,0.806557
3,0.441400,0.779302,0.843137,0.824324,0.850174,0.8
4,0.183100,0.932919,0.819005,0.786477,0.859922,0.72459


[I 2025-04-17 03:07:19,007] Trial 13 finished with value: 0.7864768683274022 and parameters: {'learning_rate': 2.025149470348353e-05, 'weight_decay': 0.10534361585467678, 'batch_size': 8, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8288590604026845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.677565,0.582202,0.351288,0.614754,0.245902
2,0.685600,0.66166,0.648567,0.517598,0.702247,0.409836
3,0.685600,0.65483,0.666667,0.574181,0.696262,0.488525


[I 2025-04-17 03:11:20,803] Trial 14 finished with value: 0.5741811175337187 and parameters: {'learning_rate': 1.0973611168209234e-06, 'weight_decay': 0.19029544907832124, 'batch_size': 8, 'num_train_epochs': 3}. Best is trial 3 with value: 0.8288590604026845.


In [None]:
print("Best trial:")
print(study.best_trial)

Best trial:
FrozenTrial(number=3, state=1, values=[0.8288590604026845], datetime_start=datetime.datetime(2025, 4, 17, 2, 20, 0, 399900), datetime_complete=datetime.datetime(2025, 4, 17, 2, 25, 14, 309105), params={'learning_rate': 4.733978326237281e-05, 'weight_decay': 0.13720158843680744, 'batch_size': 8, 'num_train_epochs': 4}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-06, step=None), 'weight_decay': FloatDistribution(high=0.3, log=False, low=0.0, step=None), 'batch_size': CategoricalDistribution(choices=(8, 16, 32)), 'num_train_epochs': IntDistribution(high=5, log=False, low=2, step=1)}, trial_id=3, value=None)


### Bert-based Classifier retrained w/ best hyperparameters (from Bayesian optimization)

Best configuration (best F1):
- Learning rate: 4.733978326237281e-05
- Weight decay: 0.13720158843680744
- Batch size: 8
- Num epochs: 4

In [22]:
# Load model
bert_bayesian_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_results_best_bayesian",
    eval_strategy="epoch",
    learning_rate=4.733978326237281e-05,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.13720158843680744,
    report_to="none"
)

trainer = Trainer(
    model=bert_bayesian_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate()
print("BERT Evaluation Results:")
print(results)

trainer.save_model("./bert_bayesian_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.534273,0.769231,0.695825,0.883838,0.57377
2,0.460700,0.644157,0.825038,0.790614,0.879518,0.718033
3,0.460700,0.806812,0.838612,0.821963,0.834459,0.809836
4,0.151500,0.920535,0.844646,0.823932,0.860714,0.790164


BERT Evaluation Results:
{'eval_loss': 0.9205345511436462, 'eval_accuracy': 0.8446455505279035, 'eval_f1': 0.8239316239316239, 'eval_precision': 0.8607142857142858, 'eval_recall': 0.7901639344262295, 'eval_runtime': 7.608, 'eval_samples_per_second': 87.145, 'eval_steps_per_second': 10.91, 'epoch': 4.0}


In [23]:
model_path = "bert_bayesian"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

api = HfApi()

# Replace with your desired repo name
repo_name = "bert_bayesian"

# This creates a repo under your namespace (username)
api.create_repo(repo_id=repo_name, private=False, exist_ok=True)

# Push model and tokenizer
trainer.model.push_to_hub("bert_bayesian")
tokenizer.push_to_hub("bert_bayesian")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/VridhiJain/bert_bayesian/commit/61cabfe169da2cb47afa7ef5c07d4445497c0846', commit_message='Upload tokenizer', commit_description='', oid='61cabfe169da2cb47afa7ef5c07d4445497c0846', pr_url=None, repo_url=RepoUrl('https://huggingface.co/VridhiJain/bert_bayesian', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/bert_bayesian'), pr_revision=None, pr_num=None)

# RoBERTa-based Classifier - Hyperparameter Tuning (don't rerun)

In [13]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import numpy as np

tokenizer_roberta = RobertaTokenizer.from_pretrained("roberta-base")
train_dataset = Dataset.from_pandas(train_df[["headline", "label"]])
test_dataset = Dataset.from_pandas(test_df[["headline", "label"]])

def tokenize_roberta(batch):
    return tokenizer_roberta(batch["headline"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize_roberta, batched=True)
test_dataset = test_dataset.map(tokenize_roberta, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/2648 [00:00<?, ? examples/s]

Map:   0%|          | 0/663 [00:00<?, ? examples/s]

## Grid Search

In [None]:
# define search space
learning_rates = [1e-5, 2e-5, 3e-5, 5e-5]
epochs = [3, 5]
weight_decays = [0.01, 0.001]

best_f1 = 0
best_config = {}

for lr in learning_rates:
  for num_epochs in epochs:
    for wd in weight_decays:
      print(f"\nTraining with lr={lr}, epochs={num_epochs}, weight_decay={wd}")

      training_args = TrainingArguments(
        output_dir=f"./roberta_tuned_lr{lr}_ep{num_epochs}_wd{wd}",
        eval_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=wd,
        report_to="none",
        logging_strategy="epoch"
      )

      model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

      trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
      )

      trainer.train()
      eval_results = trainer.evaluate()
      print("F1 score:", eval_results['eval_f1'])

      if eval_results['eval_f1'] > best_f1:
        best_f1 = eval_results['eval_f1']
        best_config = {
            "learning_rate": lr,
            "num_epochs": num_epochs,
            "weight_decay": wd,
            "eval_results": eval_results
        }

print("\nBest configuration:")
print(best_config)


Training with lr=1e-05, epochs=3, weight_decay=0.01


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.569,0.431239,0.819005,0.79798,0.820069,0.777049
2,0.3667,0.421394,0.826546,0.790528,0.889344,0.711475
3,0.2645,0.412142,0.832579,0.804921,0.867424,0.75082


F1 score: 0.804920913884007

Training with lr=1e-05, epochs=3, weight_decay=0.001


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5952,0.454104,0.808446,0.792822,0.788961,0.796721
2,0.4043,0.429782,0.820513,0.787879,0.863281,0.72459
3,0.2912,0.433335,0.815988,0.783688,0.853282,0.72459


F1 score: 0.7836879432624113

Training with lr=1e-05, epochs=5, weight_decay=0.01


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.591,0.452077,0.806938,0.797468,0.770642,0.82623
2,0.3921,0.417992,0.834087,0.828125,0.791045,0.868852
3,0.2773,0.476095,0.828054,0.789668,0.902954,0.701639
4,0.203,0.471046,0.843137,0.822526,0.857651,0.790164
5,0.1565,0.540692,0.834087,0.804965,0.876448,0.744262


F1 score: 0.8049645390070922

Training with lr=1e-05, epochs=5, weight_decay=0.001


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5792,0.437039,0.799397,0.786517,0.77044,0.803279
2,0.3446,0.396633,0.835596,0.821018,0.822368,0.819672
3,0.2374,0.48922,0.828054,0.795699,0.87747,0.727869
4,0.1746,0.523724,0.840121,0.819113,0.854093,0.786885
5,0.1331,0.551813,0.843137,0.821306,0.862816,0.783607


F1 score: 0.8213058419243986

Training with lr=2e-05, epochs=3, weight_decay=0.01


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5437,0.400045,0.809955,0.796774,0.784127,0.809836
2,0.3036,0.405654,0.853695,0.834188,0.871429,0.8
3,0.1785,0.448327,0.853695,0.836975,0.858621,0.816393


F1 score: 0.8369747899159664

Training with lr=2e-05, epochs=3, weight_decay=0.001


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5562,0.418668,0.806938,0.803077,0.756522,0.855738
2,0.3168,0.422653,0.837104,0.827476,0.806854,0.84918
3,0.203,0.426743,0.834087,0.807018,0.867925,0.754098


F1 score: 0.8070175438596491

Training with lr=2e-05, epochs=5, weight_decay=0.01


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5594,0.439889,0.790347,0.795287,0.721925,0.885246
2,0.3205,0.407966,0.838612,0.815199,0.861314,0.77377
3,0.1996,0.511333,0.844646,0.818981,0.882576,0.763934
4,0.1314,0.617112,0.861237,0.84083,0.89011,0.796721
5,0.0815,0.736664,0.852187,0.831034,0.876364,0.790164


F1 score: 0.8310344827586207

Training with lr=2e-05, epochs=5, weight_decay=0.001


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5466,0.406107,0.819005,0.807692,0.789969,0.82623
2,0.3059,0.371309,0.855204,0.836177,0.871886,0.803279
3,0.1852,0.666547,0.831071,0.791822,0.914163,0.698361
4,0.1209,0.653437,0.868778,0.852292,0.883803,0.822951
5,0.0661,0.710193,0.874811,0.858603,0.893617,0.82623


F1 score: 0.858603066439523

Training with lr=3e-05, epochs=3, weight_decay=0.01


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5396,0.387755,0.838612,0.820771,0.839041,0.803279
2,0.3168,0.437602,0.837104,0.809859,0.874525,0.754098
3,0.1576,0.461244,0.865762,0.85042,0.872414,0.829508


F1 score: 0.8504201680672269

Training with lr=3e-05, epochs=3, weight_decay=0.001


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5467,0.3981,0.826546,0.813614,0.804487,0.822951
2,0.3138,0.39654,0.846154,0.828859,0.848797,0.809836
3,0.1663,0.568183,0.850679,0.831919,0.862676,0.803279


F1 score: 0.831918505942275

Training with lr=3e-05, epochs=5, weight_decay=0.01


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5529,0.419053,0.822021,0.811502,0.791277,0.832787
2,0.3399,0.353683,0.846154,0.826531,0.858657,0.796721
3,0.2044,0.495005,0.852187,0.84345,0.82243,0.865574
4,0.1185,0.751786,0.841629,0.817391,0.87037,0.770492
5,0.0664,0.825741,0.841629,0.818024,0.867647,0.77377


F1 score: 0.8180242634315424

Training with lr=3e-05, epochs=5, weight_decay=0.001


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5374,0.39815,0.820513,0.783242,0.881148,0.704918
2,0.3139,0.408379,0.84917,0.829352,0.864769,0.796721
3,0.171,0.564832,0.856712,0.844007,0.845395,0.842623
4,0.1133,0.742153,0.84917,0.823322,0.89272,0.763934
5,0.0464,0.767152,0.859729,0.841026,0.878571,0.806557


F1 score: 0.841025641025641

Training with lr=5e-05, epochs=3, weight_decay=0.01


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6483,0.561959,0.767722,0.768769,0.709141,0.839344
2,0.4886,0.457386,0.809955,0.792079,0.797342,0.786885
3,0.3212,0.432375,0.834087,0.813559,0.842105,0.786885


F1 score: 0.8135593220338984

Training with lr=5e-05, epochs=3, weight_decay=0.001


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.564,0.421839,0.797888,0.757246,0.846154,0.685246
2,0.3223,0.379504,0.840121,0.819728,0.85159,0.790164
3,0.1678,0.525706,0.847662,0.824957,0.875,0.780328


F1 score: 0.8249566724436742

Training with lr=5e-05, epochs=5, weight_decay=0.01


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6232,0.531699,0.730015,0.74965,0.653659,0.878689
2,0.408,0.383489,0.846154,0.838608,0.810398,0.868852
3,0.2655,0.409667,0.829563,0.813223,0.82,0.806557
4,0.1412,0.669562,0.856712,0.845024,0.840909,0.84918
5,0.0684,0.818942,0.846154,0.821053,0.883019,0.767213


F1 score: 0.8210526315789474

Training with lr=5e-05, epochs=5, weight_decay=0.001


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5475,0.585848,0.763198,0.677618,0.906593,0.540984
2,0.2991,0.410852,0.85822,0.844371,0.852843,0.836066
3,0.1714,0.51118,0.850679,0.822898,0.905512,0.754098
4,0.0915,0.804835,0.852187,0.829268,0.884758,0.780328
5,0.0431,0.860752,0.859729,0.84048,0.881295,0.803279


F1 score: 0.8404802744425386

Best configuration:
{'learning_rate': 2e-05, 'num_epochs': 5, 'weight_decay': 0.001, 'eval_results': {'eval_loss': 0.7101927399635315, 'eval_accuracy': 0.8748114630467572, 'eval_f1': 0.858603066439523, 'eval_precision': 0.8936170212765957, 'eval_recall': 0.8262295081967214, 'eval_runtime': 4.0606, 'eval_samples_per_second': 163.277, 'eval_steps_per_second': 10.343, 'epoch': 5.0}}


### RoBERTa-based Classifier retrained w/ best hyperparameters (from grid search)

Best configuration (best F1):
- Learning rate: 2e-05
- Weight decay: 0.001
- Num epochs: 5

In [24]:
roberta_gridsearch_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

training_args_roberta = TrainingArguments(
    output_dir="./roberta_gridsearch_results",
    eval_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-05,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.001,
    report_to="none"
)

trainer_roberta = Trainer(
    model=roberta_gridsearch_model,
    args=training_args_roberta,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer_roberta.train()
roberta_results = trainer_roberta.evaluate()
print("RoBERTa Evaluation Results:", roberta_results)

trainer_roberta.save_model("./roberta_gridsearch_model")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5598,0.418447,0.808446,0.772809,0.850394,0.708197
2,0.3351,0.384971,0.835596,0.818636,0.831081,0.806557
3,0.2005,0.425482,0.850679,0.82662,0.887218,0.77377
4,0.1345,0.700134,0.846154,0.822917,0.874539,0.777049
5,0.0727,0.747047,0.855204,0.835052,0.877256,0.796721


RoBERTa Evaluation Results: {'eval_loss': 0.7470470070838928, 'eval_accuracy': 0.8552036199095022, 'eval_f1': 0.8350515463917526, 'eval_precision': 0.8772563176895307, 'eval_recall': 0.7967213114754098, 'eval_runtime': 6.8958, 'eval_samples_per_second': 96.146, 'eval_steps_per_second': 6.091, 'epoch': 5.0}


In [25]:
model_path = "roberta_gridsearch"
trainer_roberta.save_model(model_path)
tokenizer_roberta.save_pretrained(model_path)

api = HfApi()

# Replace with your desired repo name
repo_name = "roberta_gridsearch"

# This creates a repo under your namespace (username)
api.create_repo(repo_id=repo_name, private=False, exist_ok=True)

# Push model and tokenizer
trainer_roberta.model.push_to_hub("roberta_gridsearch")
tokenizer_roberta.push_to_hub("roberta_gridsearch")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/VridhiJain/roberta_gridsearch/commit/968a454a9d4c2ee15543a66faa2055045136e533', commit_message='Upload tokenizer', commit_description='', oid='968a454a9d4c2ee15543a66faa2055045136e533', pr_url=None, repo_url=RepoUrl('https://huggingface.co/VridhiJain/roberta_gridsearch', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/roberta_gridsearch'), pr_revision=None, pr_num=None)

## Bayesian Optimization

In [None]:
def model_init():
  return RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

def objective(trial):
  # hyperparameter search space
  learning_rate = trial.suggest_float("learning_rate", 1e-6, 7e-5, log=True)
  weight_decay = trial.suggest_float("weight_decay", 0.0, 0.05)
  batch_size = trial.suggest_categorical("batch_size", [8, 16])
  num_train_epochs = trial.suggest_int("num_train_epochs", 3, 6)

  args = TrainingArguments(
    output_dir=f"./roberta_bayesian_tuned_lr{learning_rate}_ep{num_train_epochs}_wd{weight_decay}",
    eval_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    report_to="none"
  )

  trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
  )

  trainer.train()
  eval_result = trainer.evaluate()
  return eval_result["eval_f1"]

In [None]:
# run optimization loop
study_roberta = optuna.create_study(direction="maximize")
study_roberta.optimize(objective, n_trials=15)

[I 2025-04-18 17:59:45,041] A new study created in memory with name: no-name-e5271385-fb44-4d58-afc4-ddbf463a2155
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.684385,0.53997,0.0,0.0,0.0
2,No log,0.642788,0.722474,0.674912,0.731801,0.62623
3,No log,0.546649,0.748115,0.71453,0.746429,0.685246
4,0.653400,0.508605,0.757164,0.736498,0.735294,0.737705
5,0.653400,0.500801,0.763198,0.737018,0.753425,0.721311


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[I 2025-04-18 18:05:23,179] Trial 0 finished with value: 0.7370184254606366 and parameters: {'learning_rate': 1.3704383108248495e-06, 'weight_decay': 0.004561098737803282, 'batch_size': 16, 'num_train_epochs': 5}. Best is trial 0 with value: 0.7370184254606366.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.68003,0.53997,0.0,0.0,0.0
2,No log,0.56657,0.728507,0.720497,0.684366,0.760656
3,No log,0.506493,0.754148,0.72605,0.744828,0.708197
4,0.623000,0.492244,0.757164,0.732113,0.743243,0.721311


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[I 2025-04-18 18:09:54,359] Trial 1 finished with value: 0.7321131447587355 and parameters: {'learning_rate': 1.854264397990088e-06, 'weight_decay': 0.048549755757596624, 'batch_size': 16, 'num_train_epochs': 4}. Best is trial 0 with value: 0.7370184254606366.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.58937,0.784314,0.708758,0.935484,0.570492
2,0.452900,0.618791,0.847662,0.824957,0.875,0.780328
3,0.452900,0.747763,0.843137,0.822526,0.857651,0.790164


[I 2025-04-18 18:13:56,604] Trial 2 finished with value: 0.8225255972696246 and parameters: {'learning_rate': 2.1610709460009314e-05, 'weight_decay': 0.011823374619243899, 'batch_size': 8, 'num_train_epochs': 3}. Best is trial 2 with value: 0.8225255972696246.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.532462,0.79638,0.78673,0.759146,0.816393
2,0.541600,0.490788,0.855204,0.84106,0.849498,0.832787
3,0.541600,0.530985,0.855204,0.83391,0.882784,0.790164
4,0.311500,0.664202,0.86727,0.852843,0.870307,0.836066
5,0.139300,0.829687,0.853695,0.833619,0.874101,0.796721


[I 2025-04-18 18:20:42,197] Trial 3 finished with value: 0.8336192109777015 and parameters: {'learning_rate': 5.659238046209175e-05, 'weight_decay': 0.0235097984380286, 'batch_size': 8, 'num_train_epochs': 5}. Best is trial 3 with value: 0.8336192109777015.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.668964,0.598793,0.24,0.933333,0.137705
2,0.666300,0.523572,0.757164,0.733884,0.74,0.727869
3,0.666300,0.496665,0.766214,0.739496,0.758621,0.721311


[I 2025-04-18 18:24:49,203] Trial 4 finished with value: 0.7394957983193278 and parameters: {'learning_rate': 1.6546991222470181e-06, 'weight_decay': 0.03525385910198006, 'batch_size': 8, 'num_train_epochs': 3}. Best is trial 3 with value: 0.8336192109777015.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.431467,0.806938,0.798742,0.767372,0.832787
2,No log,0.409445,0.841629,0.833068,0.808642,0.859016
3,No log,0.516742,0.828054,0.787313,0.91342,0.691803
4,0.381600,0.473125,0.846154,0.8223,0.877323,0.77377


[I 2025-04-18 18:29:25,150] Trial 5 finished with value: 0.8222996515679443 and parameters: {'learning_rate': 1.228241958773922e-05, 'weight_decay': 0.019261049406513266, 'batch_size': 16, 'num_train_epochs': 4}. Best is trial 3 with value: 0.8336192109777015.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.459297,0.803922,0.795597,0.76435,0.829508
2,No log,0.420493,0.812971,0.776978,0.860558,0.708197
3,No log,0.418063,0.825038,0.799308,0.846154,0.757377


[I 2025-04-18 18:32:48,872] Trial 6 finished with value: 0.7993079584775087 and parameters: {'learning_rate': 7.772709796703433e-06, 'weight_decay': 0.003420479933928311, 'batch_size': 16, 'num_train_epochs': 3}. Best is trial 3 with value: 0.8336192109777015.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.469686,0.793363,0.782884,0.757669,0.809836
2,0.555400,0.433957,0.811463,0.795417,0.794118,0.796721
3,0.555400,0.441763,0.815988,0.789655,0.832727,0.75082


[I 2025-04-18 18:36:52,791] Trial 7 finished with value: 0.7896551724137931 and parameters: {'learning_rate': 4.347182426558614e-06, 'weight_decay': 0.022754751230298204, 'batch_size': 8, 'num_train_epochs': 3}. Best is trial 3 with value: 0.8336192109777015.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.416202,0.806938,0.801858,0.759531,0.84918
2,No log,0.431634,0.834087,0.818482,0.82392,0.813115
3,No log,0.50409,0.865762,0.8576,0.8375,0.878689
4,0.366100,0.638725,0.856712,0.83705,0.877698,0.8
5,0.366100,0.816563,0.840121,0.814035,0.875472,0.760656
6,0.366100,0.794835,0.859729,0.84048,0.881295,0.803279


[I 2025-04-18 18:43:43,390] Trial 8 finished with value: 0.8404802744425386 and parameters: {'learning_rate': 1.584763475793302e-05, 'weight_decay': 0.039178833856049755, 'batch_size': 16, 'num_train_epochs': 6}. Best is trial 8 with value: 0.8404802744425386.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.477744,0.802413,0.756957,0.871795,0.668852
2,0.469500,0.67205,0.829563,0.794171,0.893443,0.714754
3,0.469500,0.7313,0.859729,0.839931,0.884058,0.8
4,0.229700,0.88601,0.84917,0.825175,0.883895,0.77377


[I 2025-04-18 18:49:13,532] Trial 9 finished with value: 0.8251748251748252 and parameters: {'learning_rate': 3.181251417517252e-05, 'weight_decay': 0.04515326933465343, 'batch_size': 8, 'num_train_epochs': 4}. Best is trial 8 with value: 0.8404802744425386.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.485786,0.776772,0.762058,0.747634,0.777049
2,No log,0.42455,0.808446,0.775221,0.842308,0.718033
3,No log,0.462091,0.809955,0.764045,0.89083,0.668852
4,0.469000,0.439943,0.829563,0.812604,0.822148,0.803279
5,0.469000,0.458978,0.832579,0.801431,0.88189,0.734426
6,0.469000,0.450433,0.841629,0.818653,0.864964,0.777049


[I 2025-04-18 18:56:58,444] Trial 10 finished with value: 0.8186528497409327 and parameters: {'learning_rate': 4.798616301657261e-06, 'weight_decay': 0.03559895049206602, 'batch_size': 16, 'num_train_epochs': 6}. Best is trial 8 with value: 0.8404802744425386.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.73917,0.764706,0.695312,0.859903,0.583607
2,0.564500,0.579288,0.852187,0.845912,0.812689,0.881967
3,0.564500,0.51716,0.853695,0.849612,0.805882,0.898361
4,0.329300,0.810634,0.843137,0.814947,0.891051,0.75082
5,0.139900,0.846301,0.84917,0.83871,0.825397,0.852459
6,0.139900,0.923393,0.847662,0.833058,0.84,0.82623


[I 2025-04-18 19:05:48,335] Trial 11 finished with value: 0.8330578512396695 and parameters: {'learning_rate': 6.434267208102996e-05, 'weight_decay': 0.03188136669429958, 'batch_size': 8, 'num_train_epochs': 6}. Best is trial 8 with value: 0.8404802744425386.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.584603,0.764706,0.752381,0.729231,0.777049
2,0.623000,0.602574,0.708899,0.743692,0.625,0.918033
3,0.623000,0.590728,0.797888,0.785942,0.766355,0.806557
4,0.574100,0.540121,0.812971,0.813814,0.750693,0.888525
5,0.420600,0.556936,0.826546,0.811784,0.810458,0.813115


[I 2025-04-18 19:12:53,533] Trial 12 finished with value: 0.8117839607201309 and parameters: {'learning_rate': 6.981541311196224e-05, 'weight_decay': 0.026234899595944796, 'batch_size': 8, 'num_train_epochs': 5}. Best is trial 8 with value: 0.8404802744425386.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.423504,0.81448,0.813354,0.757062,0.878689
2,No log,0.406906,0.840121,0.814685,0.872659,0.763934
3,No log,0.538389,0.85822,0.845395,0.848185,0.842623
4,0.362700,0.738546,0.85822,0.840136,0.872792,0.809836
5,0.362700,0.840279,0.847662,0.821239,0.892308,0.760656
6,0.362700,0.950833,0.855204,0.833333,0.885609,0.786885


[I 2025-04-18 19:20:16,382] Trial 13 finished with value: 0.8333333333333334 and parameters: {'learning_rate': 2.850653900213227e-05, 'weight_decay': 0.03969342985628308, 'batch_size': 16, 'num_train_epochs': 6}. Best is trial 8 with value: 0.8404802744425386.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.428636,0.79638,0.79638,0.73743,0.865574
2,No log,0.433162,0.835596,0.813675,0.85,0.780328
3,No log,0.489632,0.855204,0.838384,0.861592,0.816393
4,0.359800,0.629475,0.862745,0.849088,0.85906,0.839344
5,0.359800,0.724235,0.855204,0.83391,0.882784,0.790164


[I 2025-04-18 19:26:27,509] Trial 14 finished with value: 0.8339100346020761 and parameters: {'learning_rate': 1.6692325906638237e-05, 'weight_decay': 0.027471359951522674, 'batch_size': 16, 'num_train_epochs': 5}. Best is trial 8 with value: 0.8404802744425386.


In [None]:
print("Best trial:")
print(study_roberta.best_trial)

Best trial:
FrozenTrial(number=8, state=1, values=[0.8404802744425386], datetime_start=datetime.datetime(2025, 4, 18, 18, 36, 52, 792727), datetime_complete=datetime.datetime(2025, 4, 18, 18, 43, 43, 390425), params={'learning_rate': 1.584763475793302e-05, 'weight_decay': 0.039178833856049755, 'batch_size': 16, 'num_train_epochs': 6}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=7e-05, log=True, low=1e-06, step=None), 'weight_decay': FloatDistribution(high=0.05, log=False, low=0.0, step=None), 'batch_size': CategoricalDistribution(choices=(8, 16)), 'num_train_epochs': IntDistribution(high=6, log=False, low=3, step=1)}, trial_id=8, value=None)


### RoBERTa-based Classifier retrained w/ best hyperparameters (from Bayesian optimization)

Best configuration (best F1):
- Learning rate: 1.584763475793302e-05
- Weight decay: 0.039178833856049755
- Batch size: 16
- Num epochs: 6

In [14]:
roberta_bayesian_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

training_args_roberta = TrainingArguments(
    output_dir="./roberta_bayesian_results",
    eval_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=1.584763475793302e-05,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.039178833856049755,
    report_to="none"
)

trainer_roberta = Trainer(
    model=roberta_bayesian_model,
    args=training_args_roberta,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer_roberta.train()
roberta_results = trainer_roberta.evaluate()
print("RoBERTa Evaluation Results:", roberta_results)

trainer_roberta.save_model("./roberta_bayesian_model")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5738,0.418627,0.81448,0.802568,0.786164,0.819672
2,0.3349,0.442649,0.823529,0.782123,0.905172,0.688525
3,0.2246,0.441595,0.846154,0.811808,0.92827,0.721311
4,0.1368,0.610092,0.850679,0.820976,0.915323,0.744262
5,0.1087,0.709037,0.856712,0.830054,0.913386,0.760656
6,0.0565,0.756097,0.856712,0.829443,0.916667,0.757377


RoBERTa Evaluation Results: {'eval_loss': 0.7560969591140747, 'eval_accuracy': 0.8567119155354449, 'eval_f1': 0.829443447037702, 'eval_precision': 0.9166666666666666, 'eval_recall': 0.7573770491803279, 'eval_runtime': 6.9855, 'eval_samples_per_second': 94.911, 'eval_steps_per_second': 6.012, 'epoch': 6.0}


In [16]:
from huggingface_hub import HfApi

model_path = "roberta_bayesian"
trainer_roberta.save_model(model_path)
tokenizer_roberta.save_pretrained(model_path)

api = HfApi()

# Replace with your desired repo name
repo_name = "roberta_bayesian"

# This creates a repo under your namespace (username)
api.create_repo(repo_id=repo_name, private=False, exist_ok=True)

# Push model and tokenizer
trainer_roberta.model.push_to_hub("roberta_bayesian")
tokenizer_roberta.push_to_hub("roberta_bayesian")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/VridhiJain/roberta_bayesian/commit/d8befe68486ae920292fbb009e379c0c6b035a24', commit_message='Upload tokenizer', commit_description='', oid='d8befe68486ae920292fbb009e379c0c6b035a24', pr_url=None, repo_url=RepoUrl('https://huggingface.co/VridhiJain/roberta_bayesian', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/roberta_bayesian'), pr_revision=None, pr_num=None)

# Bert (Bayesian) + RoBERTa (Bayesian) Ensemble Model - Soft Voting (don't rerun)

In [None]:
# load models
bert_model = AutoModelForSequenceClassification.from_pretrained("./bert_bayesian_model")
roberta_model = RobertaForSequenceClassification.from_pretrained("./roberta_bayesian_model")

bert_model.eval()
roberta_model.eval()

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def soft_voting_ensemble(texts):
  bert_inputs = bert_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
  roberta_inputs = roberta_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

  with torch.no_grad():
    bert_outputs = bert_model(**bert_inputs).logits
    roberta_outputs = roberta_model(**roberta_inputs).logits

    # convert logits to probabilities
    bert_probs = F.softmax(bert_outputs, dim=1)
    roberta_probs = F.softmax(roberta_outputs, dim=1)

    # avg probabilities
    avg_probs = (bert_probs + roberta_probs) / 2

    predictions = torch.argmax(avg_probs, dim=1)

  return predictions, avg_probs

# Bert (Bayesian) + RoBERTa (Bayesian) Ensemble Model - Stacking (don't rerun)

In [None]:
import torch.nn.functional as F
# load models
bert_model = AutoModelForSequenceClassification.from_pretrained("./bert_bayesian_model")
roberta_model = RobertaForSequenceClassification.from_pretrained("./roberta_bayesian_model")

bert_model.eval()
roberta_model.eval()

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def get_model_probs(texts):
    bert_inputs = bert_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    roberta_inputs = roberta_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

    with torch.no_grad():
        bert_outputs = bert_model(**bert_inputs).logits
        roberta_outputs = roberta_model(**roberta_inputs).logits

        bert_probs = F.softmax(bert_outputs, dim=1).cpu().numpy()
        roberta_probs = F.softmax(roberta_outputs, dim=1).cpu().numpy()

    # Stack the probabilities for each example
    stacked_features = np.hstack((bert_probs, roberta_probs))  # shape: (batch_size, 2 * num_classes)
    return stacked_features

# Fit meta-classifier
def train_meta_classifier(texts, labels):
    X = get_model_probs(texts)
    meta_clf = LogisticRegression(max_iter=200, random_state = 42)
    meta_clf.fit(X, labels)
    return meta_clf

# Predict using stacking
def stacking_predict(texts, meta_clf):
    X = get_model_probs(texts)
    preds = meta_clf.predict(X)
    return preds

# Inference for the 20 Headline Sub-Testset

In [None]:
from sklearn.metrics import classification_report

# 20 headline subset
subtest_headlines = [
    "Jack Carr's take on the late Tom Clancy, born on this day in 1947",
    "Feeding America CEO asks community to help others amid today's high inflation",
    "Trump’s campaign rival decides between voting for him or Biden",
    "Could the Republicans' slim House majority slip away before November?",
    "On this day in history, Sept. 6, 1757, Marquis de Lafayette is born, hero of two revolutions",
    "Audrey Hale police bodycams released",
    "Ben Stiller calling Kamala Harris 'change' candidate goes viral",
    "AI fast-tracks dementia diagnoses by tapping into ‘hidden information’ in brain waves",
    "Republican Dave McCormick launches bid for vulnerable Senate seat in battleground state",
    "Artist at center of Supreme Court fight ‘rocked’ by Colorado law that makes her cater to same-sex weddings",
    "World Food Programme Director Cindy McCain: Northern Gaza is in a 'full-blown famine'",
    "Ohio sheriff suggests residents keep a list of homes with Harris yard signs",
    "Dozens of deaths reported in Khan Younis as Israel deepens its ground offensive 'to the west'",
    "Hostage held by Hamas in Gaza rescued by Israeli forces, IDF says",
    "Israeli hostage freed by Hamas says 'time is running out' for captives as she describes harrowing conditions",
    "5 best vacuums to help clean your hardwood floors",
    "Music festival revelers in the Israeli desert were massacred by Hamas militants",
    "The Biden admin has no firm plan to call out domestic disinformation in the 2024 election",
    "Trump campaign, RNC pledge to deploy 100,000 attorneys and volunteers to monitor the vote",
    "Months of intense lobbying persuaded Biden to go to Saudi Arabia, sources say"
]

# True labels: 1 = FoxNews, 0 = NBC
subtest_labels = [1]*10 + [0]*10

# Predict using baseline model
subtest_tfidf = vectorizer.transform(subtest_headlines)
baseline_preds = baseline_model.predict(subtest_tfidf)

# Show results
print("Baseline Model Results")
print(classification_report(subtest_labels, baseline_preds))

Baseline Model Results
              precision    recall  f1-score   support

           0       0.69      0.90      0.78        10
           1       0.86      0.60      0.71        10

    accuracy                           0.75        20
   macro avg       0.77      0.75      0.74        20
weighted avg       0.77      0.75      0.74        20



In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #CHAT SAID TO ADD
bert_model.to(device)

# Tokenize and move inputs to same device as model
bert_inputs = tokenizer(
    subtest_headlines,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)
bert_inputs = {k: v.to(device) for k, v in bert_inputs.items()}

# Predict
with torch.no_grad():
    bert_outputs = bert_model(**bert_inputs)

bert_preds = torch.argmax(bert_outputs.logits, dim=1).tolist()


# Print results
print("BERT Model Results")
print(classification_report(subtest_labels, bert_preds))

BERT Model Results
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        10
           1       1.00      0.70      0.82        10

    accuracy                           0.85        20
   macro avg       0.88      0.85      0.85        20
weighted avg       0.88      0.85      0.85        20



In [None]:
roberta_model.to(device)

# Tokenize the headlines and move inputs to GPU
roberta_inputs = tokenizer_roberta(
    subtest_headlines,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)
roberta_inputs = {k: v.to(device) for k, v in roberta_inputs.items()}

# Run prediction
with torch.no_grad():
    roberta_outputs = roberta_model(**roberta_inputs)

roberta_preds = torch.argmax(roberta_outputs.logits, dim=1).tolist()

# Print results
print("RoBERTa Model Results")
print(classification_report(subtest_labels, roberta_preds))

RoBERTa Model Results
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        10
           1       1.00      0.70      0.82        10

    accuracy                           0.85        20
   macro avg       0.88      0.85      0.85        20
weighted avg       0.88      0.85      0.85        20



In [None]:
# Bert - Grid Search
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_gridsearch_model = AutoModelForSequenceClassification.from_pretrained("./bert_gridsearch_model")
bert_gridsearch_model.to(device)

# Tokenize and move inputs to same device as model
bert_inputs_gridsearch = tokenizer(
    subtest_headlines,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)
bert_inputs_gridsearch = {k: v.to(device) for k, v in bert_inputs_gridsearch.items()}

# Predict
with torch.no_grad():
    bert_outputs_gridsearch = bert_gridsearch_model(**bert_inputs_gridsearch)

bert_preds_gridsearch = torch.argmax(bert_outputs_gridsearch.logits, dim=1).tolist()


# Print results
print("BERT (grid search) Model Results")
print(classification_report(subtest_labels, bert_preds_gridsearch))

BERT (grid search) Model Results
              precision    recall  f1-score   support

           0       0.69      0.90      0.78        10
           1       0.86      0.60      0.71        10

    accuracy                           0.75        20
   macro avg       0.77      0.75      0.74        20
weighted avg       0.77      0.75      0.74        20



In [None]:
# Bert - Bayesian optimization
bert_bayesian_model = AutoModelForSequenceClassification.from_pretrained("./bert_bayesian_model")
bert_bayesian_model.to(device)

# Tokenize and move inputs to same device as model
bert_inputs_bayesian = tokenizer(
    subtest_headlines,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)
bert_inputs_bayesian = {k: v.to(device) for k, v in bert_inputs_bayesian.items()}

# Predict
with torch.no_grad():
    bert_outputs_bayesian = bert_bayesian_model(**bert_inputs_bayesian)

bert_preds_bayesian = torch.argmax(bert_outputs_bayesian.logits, dim=1).tolist()


# Print results
print("BERT (Bayesian optimization) Model Results")
print(classification_report(subtest_labels, bert_preds_bayesian))

BERT (Bayesian optimization) Model Results
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        10
           1       1.00      0.70      0.82        10

    accuracy                           0.85        20
   macro avg       0.88      0.85      0.85        20
weighted avg       0.88      0.85      0.85        20



In [None]:
# RoBERTa - Grid Search
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
roberta_gridsearch_model = RobertaForSequenceClassification.from_pretrained("./roberta_gridsearch_model")
roberta_gridsearch_model.to(device)

# Tokenize the headlines and move inputs to GPU
roberta_inputs_gridsearch = tokenizer_roberta(
    subtest_headlines,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)
roberta_inputs_gridsearch = {k: v.to(device) for k, v in roberta_inputs_gridsearch.items()}

# Run prediction
with torch.no_grad():
    roberta_outputs_gridsearch = roberta_gridsearch_model(**roberta_inputs_gridsearch)

roberta_preds_gridsearch = torch.argmax(roberta_outputs_gridsearch.logits, dim=1).tolist()

# Print results
print("RoBERTa (grid search) Model Results")
print(classification_report(subtest_labels, roberta_preds_gridsearch))

RoBERTa (grid search) Model Results
              precision    recall  f1-score   support

           0       0.75      0.90      0.82        10
           1       0.88      0.70      0.78        10

    accuracy                           0.80        20
   macro avg       0.81      0.80      0.80        20
weighted avg       0.81      0.80      0.80        20



In [None]:
# RoBERTa - Bayesian optimization
roberta_bayesian_model = RobertaForSequenceClassification.from_pretrained("./roberta_bayesian_model")
roberta_bayesian_model.to(device)

# Tokenize the headlines and move inputs to GPU
roberta_inputs_bayesian = tokenizer_roberta(
    subtest_headlines,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)
roberta_inputs_bayesian = {k: v.to(device) for k, v in roberta_inputs_bayesian.items()}

# Run prediction
with torch.no_grad():
    roberta_outputs_bayesian = roberta_bayesian_model(**roberta_inputs_bayesian)

roberta_preds_bayesian = torch.argmax(roberta_outputs_bayesian.logits, dim=1).tolist()

# Print results
print("RoBERTa (Bayesian optimization) Model Results")
print(classification_report(subtest_labels, roberta_preds_bayesian))

RoBERTa (Bayesian optimization) Model Results
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        10
           1       1.00      0.70      0.82        10

    accuracy                           0.85        20
   macro avg       0.88      0.85      0.85        20
weighted avg       0.88      0.85      0.85        20



In [None]:
# Bert (Bayesian) + RoBERTa (Bayesian) - soft voting ensemble
ensemble_preds, _ = soft_voting_ensemble(subtest_headlines)

print("Soft Voting Ensemble Results")
print(classification_report(subtest_labels, ensemble_preds))

Soft Voting Ensemble Results
              precision    recall  f1-score   support

           0       0.75      0.90      0.82        10
           1       0.88      0.70      0.78        10

    accuracy                           0.80        20
   macro avg       0.81      0.80      0.80        20
weighted avg       0.81      0.80      0.80        20



In [None]:
import torch.nn.functional as F
# Train the meta-classifier
meta_classifier = train_meta_classifier(df.loc[X_train.index, 'headline'].tolist(), y_train)

# Get predictions from stacking ensemble
stacking_preds = stacking_predict(subtest_headlines, meta_classifier)

print("Stacking Ensemble Results")
print(classification_report(subtest_labels, stacking_preds))

Stacking Ensemble Results
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        10
           1       1.00      0.70      0.82        10

    accuracy                           0.85        20
   macro avg       0.88      0.85      0.85        20
weighted avg       0.88      0.85      0.85        20



In [17]:
%pip install transformers openpyxl ftfy --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [18]:
import pandas as pd
import torch
import numpy as np
from ftfy import fix_text
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.nn.functional import softmax

In [21]:
path = "https://raw.githubusercontent.com/VridhiJ/CIS519/refs/heads/main/Dataset/News%20Classification_Test%20Data_Project.csv"
test_df = pd.read_csv(path)
test_df = test_df.dropna(subset=['Headline'])
test_df['Headline'] = test_df['Headline'].astype(str).apply(fix_text)
print(f"loaded headlines {len(test_df)}")

loaded headlines 1173


In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained("VridhiJain/roberta_bayesian")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model.to(device).eval()
def predict_roberta(texts, batch_size=32):
    all_preds = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
            preds = torch.argmax(softmax(logits, dim=1), dim=1)
        all_preds.extend(preds.cpu().numpy())
    return all_preds

test_df['Label'] = predict_roberta(test_df['Headline'].tolist())
final_dataset = test_df[['ID', 'Headline', 'Label']]
final_dataset.to_csv("submission_roberta_bayesian.csv", index=False)

config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

# Load & Run Model on HuggingFace

In [23]:
test_df.head()

Unnamed: 0,ID,Headline,Label(FoxNews/NBC),Label
0,1,Democrats' boiling pot: A look at their 2026 g...,,0
1,2,Appeals court restores hold on Trump admin's p...,,0
2,3,"David Perdue, former senator and longtime Trum...",,0
3,4,Tesla arson suspect arrested in Arizona after ...,,0
4,5,Trump wants Eagles' tush push to remain in NFL...,,1


In [None]:
!pip install geopy > delete.txt
!pip install datasets > delete.txt
!pip install torch torchvision datasets > delete.txt
!pip install huggingface_hub > delete.txt
!rm delete.txt

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-runtime-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-runtime-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_sy

In [None]:
!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install geopy datasets huggingface_hub

Found existing installation: torch 2.6.0+cu118
Uninstalling torch-2.6.0+cu118:
  Successfully uninstalled torch-2.6.0+cu118
Found existing installation: torchvision 0.21.0+cu118
Uninstalling torchvision-0.21.0+cu118:
  Successfully uninstalled torchvision-0.21.0+cu118
Found existing installation: torchaudio 2.6.0+cu118
Uninstalling torchaudio-2.6.0+cu118:
  Successfully uninstalled torchaudio-2.6.0+cu118
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Using cached https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (27 kB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (6.6 kB)
Using cached https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.

In [None]:
# TODO: create a token on huggingface that allows access to everything (Fine-grained -> check all boxes)
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineG

## Load Model from HuggingFace

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
import pandas as pd
from sklearn.model_selection import train_test_split
import os

## If Students used their own model

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download

# TODO: fill in student repo name
REPO_NAME = "newsclassification/bert"
model = CustomModel.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## If Students used models from huggingface

In [None]:
from huggingface_hub import hf_hub_download
import torch

# TODO: Specify the repository and the filename of the model you want to load
# repo_id =  "newsclassification/bert"
# filename = "<model>.pth"

# Download the model from the Hugging Face Hub
model_path = hf_hub_download(repo_id=repo_id, filename=filename)

# Load the model using torch
model_test = torch.load(model_path)
model_test.eval()  # Set the model to evaluation mode
model = model_test

IndexError: pop from empty list

In [None]:
from huggingface_hub import hf_hub_download
import joblib
model = joblib.load(
	hf_hub_download("NewsSourceClassification/models", "logistic_model_and_vectorizer.pkl")
)

## Below is to test a base bert model

In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
model_name = "news-classification-cis5190/roberta-3epoch-model"
model = RobertaForSequenceClassification.from_pretrained(model_name)
tokenizer = RobertaTokenizer.from_pretrained(model_name)

404 Client Error. (Request ID: Root=1-67f2b906-7bd39c3434b177031f1a9d04;89756773-dacb-4bec-97d4-e7583a47c89d)

Repository Not Found for url: https://huggingface.co/news-classification-cis5190/roberta-3epoch-model/resolve/main/tf_model.h5.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication


OSError: news-classification-cis5190/roberta-3epoch-model is not a local folder or a valid repository name on 'https://hf.co'.

In [None]:
print(model)
print(tokenizer)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## DO NOT MODIFY CODES BELOW

This code is for us to test their model using our test data.
Steps:
- load their data
- preprocess our data and their data


In [None]:
#dataset_train = load_dataset("newsclassification/train_data", split="train")
dataset_test = load_dataset("newsclassification/test_data")

AttributeError: module 'requests' has no attribute 'exceptions'

In [None]:
label_map = {"FoxNews": 1,"NBC": 0}
#dataset_train = dataset_train.map(lambda x: {"labels": label_map[x["outlet"]]})

In [None]:
#print(dataset_train)
print(dataset_test)

In [None]:
#train_test_split = dataset_train.train_test_split(test_size=0.1, seed=42)
#train_data = train_test_split["train"]
#eval_data = train_test_split["test"]

In [None]:
def preprocess_function(examples):
    return tokenizer(
        examples["title"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

In [None]:
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_eval = eval_data.map(preprocess_function, batched=True)

In [None]:
#tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
#tokenized_eval.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True
)

In [None]:
# Load model and vectorizer
loaded_model = model["model"]
loaded_vectorizer = model["vectorizer"]


In [None]:
trainer = Trainer(
    model=loaded_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=loaded_vectorizer,
    compute_metrics=compute_metrics
)

In [None]:
os.environ["WANDB_DISABLED"] = "true"

In [None]:
trainer.train()

In [None]:
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

In [None]:
model.save_pretrained("./bert_news_classifier")
tokenizer.save_pretrained("./bert_news_classifier")

In [None]:
model.push_to_hub(REPO_NAME)
tokenizer.push_to_hub(REPO_NAME)

print(f"Model and tokenizer pushed to Hugging Face Hub under the repository: {REPO_NAME}")