### Install dependencies


In [3]:
!pip install ftfy huggingface_hub scikit-learn transformers datasets optuna accelerate==0.27.2 --quiet


### Import Libraries

In [5]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
import random
import numpy as np
import torch
import ftfy

### Load data

In [3]:
url = "https://raw.githubusercontent.com/VridhiJ/CIS519/refs/heads/main/Dataset/news_urls.csv"

# Load the dataset
df = pd.read_csv(url)

# Display the first few rows to verify the data
df.head()

Unnamed: 0,url
0,https://www.foxnews.com/lifestyle/jack-carrs-e...
1,https://www.foxnews.com/entertainment/bruce-wi...
2,https://www.foxnews.com/politics/blinken-meets...
3,https://www.foxnews.com/entertainment/emily-bl...
4,https://www.foxnews.com/media/the-view-co-host...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805 entries, 0 to 3804
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     3805 non-null   object
dtypes: object(1)
memory usage: 29.9+ KB


### Headline Collection Method

Collect the news headlines by scraping multiple news websites using BeautifulSoup libraries. The scraping process involved:

1. Fetching Webpages:

  - Sending HTTP requests to news article URLs.

  - Using appropriate headers to mimic a real browser and avoid blocking.
    - User-Agent: Identifies the client making request. Helps avoid bot detection by mimicking real browser behavior.
    - Accept-Charset:  Specifies the character encodings that the client can process. Helps ensure proper text rendering.
    - Accept: Defines the type of content the client expects from the server.
    - Accept-Language: Specifies the preferred language for the response content. Helps receive content in a readable format when a website supports multiple languages.
    - referer: Indicates the URL of the page that made the request.
    

2. Extracting Headlines:

  - Parsing the webpage content with BeautifulSoup.

  - Identifying and extracting headlines using H1 tags and class attributes related to headlines.

  - Handling variations in website structures dynamically.

3. Error Handling & Optimization:

  - Implementing error handling to skip unavailable pages.

4. Storing Data:

  - Storing extracted headlines in a structured pandas DataFrame.

 - Saving the data in CSV format for further processing.

This method ensures efficient and scalable data collection while minimizing disruptions caused by website restrictions.

### Data Scraping (don't rerun)

In [6]:
# Helper function to get headline from a single URL
def get_article_headline(url):
  try:
    user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    ]

    session = requests.Session()

    headers = {
    'user-agent': random.choice(user_agents),
    "Accept-Charset": "utf-8",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9",
    "referer": "https://www.google.com/",
    }
    time.sleep(2)

    response = requests.get(url, headers = headers)

    if response.status_code != 200:
      print(f"Warning: Failed to load page {url} (Status Code: {response.status_code})")
      return None  # Don't stop execution, just return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # To find headline of various types of classes
    headline = soup.find("h1", class_=lambda c: c and "headline" in c)

    if headline:
      headline = ftfy.fix_text(headline.get_text())  # Fix any encoding issues
      return headline.strip()  # Return the cleaned headline
    else:
      return None  # Return None if no headline is found
  except Exception as e:
    print(f"Error processing {url}: {e}")
    return None  # Return None in case of an error

In [7]:
# Create an empty list to store the headlines
headlines = []

# Loop through the URLs in your dataframe
for url in df['url']:
    headline = get_article_headline(url)
    headlines.append(headline)

# Add the scraped headlines to your dataframe
df['headline'] = headlines

# Show the first few rows with the scraped headlines
df.head()



In [None]:
df.to_csv("scraped_headlines.csv", index=False)

In [None]:
from huggingface_hub import login
login()

In [None]:
from huggingface_hub import create_repo

# Create a repository on Hugging Face Hub
repo_name = 'scraped-headlines'
create_repo(repo_name, private=True)

In [None]:
from huggingface_hub import upload_file

upload_file(
    path_or_fileobj='scraped_headlines.csv',
    path_in_repo='scraped_headlines_v4.csv',
    repo_id= 'VridhiJain/scraped-headlines'
)

### Cleaning Data

In [17]:
from huggingface_hub import notebook_login

notebook_login()  # enter your Hugging Face token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
import pandas as pd
from huggingface_hub import hf_hub_download

repo_id = "VridhiJain/scraped-headlines"  # repo name
filename = "scraped_headlines_v4.csv"  # file name

# Download the file
file_path = hf_hub_download(repo_id=repo_id, filename=filename)

# Load into a DataFrame
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,url,headline
0,https://www.foxnews.com/lifestyle/jack-carrs-e...,Jack Carr recalls Gen. Eisenhower's D-Day memo...
1,https://www.foxnews.com/entertainment/bruce-wi...,"Bruce Willis, Demi Moore avoided doing one thi..."
2,https://www.foxnews.com/politics/blinken-meets...,
3,https://www.foxnews.com/entertainment/emily-bl...,Emily Blunt says her 'toes curl' when people t...
4,https://www.foxnews.com/media/the-view-co-host...,"'The View' co-host, CNN commentator Ana Navarr..."


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805 entries, 0 to 3804
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   url       3805 non-null   object
 1   headline  3352 non-null   object
dtypes: object(2)
memory usage: 59.6+ KB


In [9]:
# Check for missing values in the dataset
print(df.isnull().sum())

# Drop any rows where the headline is missing/duplicates
df = df.dropna(subset=['headline']).drop_duplicates(subset=['headline'])

# Reset index after dropping rows
df = df.reset_index(drop=True)

url           0
headline    453
dtype: int64


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3336 entries, 0 to 3335
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   url       3336 non-null   object
 1   headline  3336 non-null   object
dtypes: object(2)
memory usage: 52.3+ KB


In [11]:
df['url'].str.contains('foxnews').value_counts()

url
False    1779
True     1557
Name: count, dtype: int64

Fox News Headlines: 1779

NBC News Headlines: 1557

### Baseline Model(TF-IDF + Log Regression)

In [8]:
# For reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed()

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the preprocessed headline data from Hugging Face
from huggingface_hub import hf_hub_download
csv_path = hf_hub_download(repo_id="VridhiJain/scraped-headlines", filename="scraped_headlines_v4.csv")
df = pd.read_csv(csv_path)

# Drop rows with missing headlines
df = df.dropna(subset=['headline']).drop_duplicates(subset=['headline'])

# Label: 1 for FoxNews, 0 for NBC
df['label'] = df['url'].apply(lambda x: 1 if "foxnews" in x else 0)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['label'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression
baseline_model = LogisticRegression(max_iter=100)
baseline_model.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = baseline_model.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7036
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.65      0.69       335
           1       0.68      0.76      0.72       333

    accuracy                           0.70       668
   macro avg       0.71      0.70      0.70       668
weighted avg       0.71      0.70      0.70       668



### Bert-based Classifier

In [13]:
import os
import torch
from sklearn.metrics import precision_recall_fscore_support
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments)

In [14]:
df['label'] = df['url'].apply(lambda x: 1 if "foxnews" in x.lower() else 0)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

train_dataset = Dataset.from_pandas(train_df[['headline', 'label']])
test_dataset = Dataset.from_pandas(test_df[['headline', 'label']])

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["headline"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Load model
bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"
)

# Define evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = torch.argmax(torch.tensor(pred.predictions), axis=1).numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return{"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate()
print("BERT Evaluation Results:")
print(results)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2668 [00:00<?, ? examples/s]

Map:   0%|          | 0/668 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.455493,0.791916,0.778309,0.774603,0.782051
2,No log,0.404934,0.832335,0.818182,0.828947,0.807692
3,0.373600,0.438885,0.821856,0.793043,0.86692,0.730769


BERT Evaluation Results:
{'eval_loss': 0.43888476490974426, 'eval_accuracy': 0.8218562874251497, 'eval_f1': 0.7930434782608695, 'eval_precision': 0.8669201520912547, 'eval_recall': 0.7307692307692307, 'eval_runtime': 5.7994, 'eval_samples_per_second': 115.184, 'eval_steps_per_second': 7.242, 'epoch': 3.0}


In [14]:
from huggingface_hub import HfApi

api = HfApi()

# Repo name
repo_name = "bert_vanilla_2"

# This creates a repo under your namespace (username)
api.create_repo(repo_id=repo_name, private=False, exist_ok=True)


RepoUrl('https://huggingface.co/VridhiJain/bert_vanilla_2', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/bert_vanilla_2')

In [25]:
from huggingface_hub import notebook_login, login

notebook_login()  # enter your Hugging Face token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
# Push model and tokenizer
trainer.model.push_to_hub("bert_vanilla_2")
tokenizer.push_to_hub("bert_vanilla_2")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/VridhiJain/bert_vanilla_2/commit/e6f7f5925674cb968f47a819960aaedfd7a73511', commit_message='Upload tokenizer', commit_description='', oid='e6f7f5925674cb968f47a819960aaedfd7a73511', pr_url=None, repo_url=RepoUrl('https://huggingface.co/VridhiJain/bert_vanilla_2', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/bert_vanilla_2'), pr_revision=None, pr_num=None)

### RoBERTa-based Classifier

In [28]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer_roberta = RobertaTokenizer.from_pretrained("roberta-base")
train_dataset = Dataset.from_pandas(train_df[["headline", "label"]])
test_dataset = Dataset.from_pandas(test_df[["headline", "label"]])

def tokenize_roberta(batch):
    return tokenizer_roberta(batch["headline"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize_roberta, batched=True)
test_dataset = test_dataset.map(tokenize_roberta, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

training_args_roberta = TrainingArguments(
    output_dir="./roberta_results",
    eval_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"
)

trainer_roberta = Trainer(
    model=roberta_model,
    args=training_args_roberta,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer_roberta.train()
roberta_results = trainer_roberta.evaluate()
print("RoBERTa Evaluation Results:", roberta_results)

trainer_roberta.save_model("./roberta_model")

Map:   0%|          | 0/2668 [00:00<?, ? examples/s]

Map:   0%|          | 0/668 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5647,0.428925,0.806886,0.774869,0.850575,0.711538
2,0.3517,0.384277,0.850299,0.84472,0.819277,0.871795


In [18]:
# Repo name
repo_name = "roberta_vanilla_2"

# This creates a repo under your namespace (username)
api.create_repo(repo_id=repo_name, private=False, exist_ok=True)

RepoUrl('https://huggingface.co/VridhiJain/roberta_vanilla_2', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/roberta_vanilla_2')

In [None]:
# Push model and tokenizer
trainer.model.push_to_hub("roberta_vanilla_2")
tokenizer.push_to_hub("roberta_vanilla_2")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/VridhiJain/roberta_vanilla_2/commit/9363dd0596fd13ff974bc4ae19eb893809bb37d6', commit_message='Upload tokenizer', commit_description='', oid='9363dd0596fd13ff974bc4ae19eb893809bb37d6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/VridhiJain/roberta_vanilla_2', endpoint='https://huggingface.co', repo_type='model', repo_id='VridhiJain/roberta_vanilla_2'), pr_revision=None, pr_num=None)

### Bert-based Classifier - Hyperparameter Tuning

In [None]:
from sklearn.model_selection import train_test_split

df['label'] = df['url'].apply(lambda x: 1 if "foxnews" in x.lower() else 0)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

train_dataset = Dataset.from_pandas(train_df[['headline', 'label']])
test_dataset = Dataset.from_pandas(test_df[['headline', 'label']])

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["headline"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/2648 [00:00<?, ? examples/s]

Map:   0%|          | 0/663 [00:00<?, ? examples/s]

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = torch.argmax(torch.tensor(pred.predictions), axis=1).numpy()
  precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
  acc = accuracy_score(labels, preds)
  return{"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

### Grid Search

In [None]:
from transformers import (AutoModelForSequenceClassification, Trainer, TrainingArguments)
import numpy as np

# define search space
learning_rates = [1e-5, 2e-5, 3e-5, 5e-5]
epochs = [3, 4, 5]
weight_decays = [0.01, 0.001]

best_f1 = 0
best_config = {}

for lr in learning_rates:
  for num_epochs in epochs:
    for wd in weight_decays:
      print(f"\nTraining with lr={lr}, epochs={num_epochs}, weight_decay={wd}")

      training_args = TrainingArguments(
        output_dir=f"./bert_tuned_lr{lr}_ep{num_epochs}_wd{wd}",
        eval_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=wd,
        report_to="none"
      )

      model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

      trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
      )

      trainer.train()
      eval_results = trainer.evaluate()
      print("F1 score:", eval_results['eval_f1'])

      if eval_results['eval_f1'] > best_f1:
        best_f1 = eval_results['eval_f1']
        best_config = {
            "learning_rate": lr,
            "num_epochs": num_epochs,
            "weight_decay": wd,
            "eval_results": eval_results
        }

print("\nBest configuration:")
print(best_config)

ModuleNotFoundError: No module named 'transformers'

### Bert-based Classifier retrained w/ best hyperparameters (from grid search)

Best configuration (best F1):
- Learning rate: 5e-05 
- Num epochs: 5 
- Weight decay: 0.01

### Bayesian Optimization (using optuna)

In [None]:
import optuna
def model_init():
  return AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

def objective(trial):
  # hyperparameter search space
  learning_rate = trial.suggest_float("learning_rate", 1e-6, 5e-5, log=True)
  weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
  batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
  num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)

  args = TrainingArguments(
    output_dir=f"./bert_bayesian_tuned_lr{learning_rate}_ep{num_train_epochs}_wd{weight_decay}",
    eval_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    report_to="none"
  )

  trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
  )

  trainer.train()
  eval_result = trainer.evaluate()
  return eval_result["eval_f1"]


In [None]:
# run optimization loop
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=15)