### Install dependencies


In [1]:
!pip install ftfy huggingface_hub scikit-learn transformers datasets optuna --quiet


### Import Libraries

In [5]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
import random
import numpy as np
import torch
import ftfy

### Load data

In [3]:
url = "https://raw.githubusercontent.com/VridhiJ/CIS519/refs/heads/main/Dataset/news_urls.csv"

# Load the dataset
df = pd.read_csv(url)

# Display the first few rows to verify the data
df.head()

Unnamed: 0,url
0,https://www.foxnews.com/lifestyle/jack-carrs-e...
1,https://www.foxnews.com/entertainment/bruce-wi...
2,https://www.foxnews.com/politics/blinken-meets...
3,https://www.foxnews.com/entertainment/emily-bl...
4,https://www.foxnews.com/media/the-view-co-host...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805 entries, 0 to 3804
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     3805 non-null   object
dtypes: object(1)
memory usage: 29.9+ KB


### Headline Collection Method

Collect the news headlines by scraping multiple news websites using BeautifulSoup libraries. The scraping process involved:

1. Fetching Webpages:

  - Sending HTTP requests to news article URLs.

  - Using appropriate headers to mimic a real browser and avoid blocking.
    - User-Agent: Identifies the client making request. Helps avoid bot detection by mimicking real browser behavior.
    - Accept-Charset:  Specifies the character encodings that the client can process. Helps ensure proper text rendering.
    - Accept: Defines the type of content the client expects from the server.
    - Accept-Language: Specifies the preferred language for the response content. Helps receive content in a readable format when a website supports multiple languages.
    - referer: Indicates the URL of the page that made the request.
    

2. Extracting Headlines:

  - Parsing the webpage content with BeautifulSoup.

  - Identifying and extracting headlines using H1 tags and class attributes related to headlines.

  - Handling variations in website structures dynamically.

3. Error Handling & Optimization:

  - Implementing error handling to skip unavailable pages.

4. Storing Data:

  - Storing extracted headlines in a structured pandas DataFrame.

 - Saving the data in CSV format for further processing.

This method ensures efficient and scalable data collection while minimizing disruptions caused by website restrictions.

### Data Scraping (don't rerun)

In [6]:
# Helper function to get headline from a single URL
def get_article_headline(url):
  try:
    user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    ]

    session = requests.Session()

    headers = {
    'user-agent': random.choice(user_agents),
    "Accept-Charset": "utf-8",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9",
    "referer": "https://www.google.com/",
    }
    time.sleep(2)

    response = requests.get(url, headers = headers)

    if response.status_code != 200:
      print(f"Warning: Failed to load page {url} (Status Code: {response.status_code})")
      return None  # Don't stop execution, just return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # To find headline of various types of classes
    headline = soup.find("h1", class_=lambda c: c and "headline" in c)

    if headline:
      headline = ftfy.fix_text(headline.get_text())  # Fix any encoding issues
      return headline.strip()  # Return the cleaned headline
    else:
      return None  # Return None if no headline is found
  except Exception as e:
    print(f"Error processing {url}: {e}")
    return None  # Return None in case of an error

In [7]:
# Create an empty list to store the headlines
headlines = []

# Loop through the URLs in your dataframe
for url in df['url']:
    headline = get_article_headline(url)
    headlines.append(headline)

# Add the scraped headlines to your dataframe
df['headline'] = headlines

# Show the first few rows with the scraped headlines
df.head()



In [None]:
df.to_csv("scraped_headlines.csv", index=False)

In [None]:
from huggingface_hub import login
login()

In [None]:
from huggingface_hub import create_repo

# Create a repository on Hugging Face Hub
repo_name = 'scraped-headlines'
create_repo(repo_name, private=True)

In [None]:
from huggingface_hub import upload_file

upload_file(
    path_or_fileobj='scraped_headlines.csv',
    path_in_repo='scraped_headlines_v4.csv',
    repo_id= 'VridhiJain/scraped-headlines'
)

### Cleaning Data

In [23]:
from huggingface_hub import login

login()  # enter your Hugging Face token

In [24]:
import pandas as pd
from huggingface_hub import hf_hub_download

repo_id = "VridhiJain/scraped-headlines"  # repo name
filename = "scraped_headlines_v4.csv"  # file name

# Download the file
file_path = hf_hub_download(repo_id=repo_id, filename=filename)

# Load into a DataFrame
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,url,headline
0,https://www.foxnews.com/lifestyle/jack-carrs-e...,Jack Carr recalls Gen. Eisenhower's D-Day memo...
1,https://www.foxnews.com/entertainment/bruce-wi...,"Bruce Willis, Demi Moore avoided doing one thi..."
2,https://www.foxnews.com/politics/blinken-meets...,
3,https://www.foxnews.com/entertainment/emily-bl...,Emily Blunt says her 'toes curl' when people t...
4,https://www.foxnews.com/media/the-view-co-host...,"'The View' co-host, CNN commentator Ana Navarr..."


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805 entries, 0 to 3804
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   url       3805 non-null   object
 1   headline  3352 non-null   object
dtypes: object(2)
memory usage: 59.6+ KB


In [26]:
# Check for missing values in the dataset
print(df.isnull().sum())

# Drop any rows where the headline is missing/duplicates
df = df.dropna(subset=['headline']).drop_duplicates(subset=['headline'])

# Reset index after dropping rows
df = df.reset_index(drop=True)

url           0
headline    453
dtype: int64


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3336 entries, 0 to 3335
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   url       3336 non-null   object
 1   headline  3336 non-null   object
dtypes: object(2)
memory usage: 52.3+ KB


In [28]:
df['url'].str.contains('foxnews').value_counts()

url
False    1779
True     1557
Name: count, dtype: int64

Fox News Headlines: 1779

NBC News Headlines: 1557

### Baseline Model(TF-IDF + Log Regression)

In [29]:
# For reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed()

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the preprocessed headline data from Hugging Face
from huggingface_hub import hf_hub_download
csv_path = hf_hub_download(repo_id="VridhiJain/scraped-headlines", filename="scraped_headlines_v2.csv")
df = pd.read_csv(csv_path)

# Drop rows with missing headlines
df = df.dropna(subset=['headline'])

# Label: 1 for FoxNews, 0 for NBC
df['label'] = df['url'].apply(lambda x: 1 if "foxnews" in x else 0)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['label'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression
baseline_model = LogisticRegression(max_iter=100)
baseline_model.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = baseline_model.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

scraped_headlines_v2.csv:   0%|          | 0.00/687k [00:00<?, ?B/s]

Accuracy: 0.6682
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.77      0.70       333
           1       0.71      0.57      0.63       330

    accuracy                           0.67       663
   macro avg       0.67      0.67      0.66       663
weighted avg       0.67      0.67      0.66       663

