In [6]:
"""Data preparation notebook for cleaning BabylonBee dataset for fine-tuning.

This notebook:
1. Loads the BabylonBee CSV dataset
2. Extracts full article content from URLs using ExtractorService
3. Creates a cleaned dataset with Headline and raw article text
4. Saves the processed data to data/fake_news/processed
"""

import sys
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

# Determine project root (assuming notebook is run from project root or src/notebooks/)
# Try multiple approaches for robustness
if Path.cwd().name == "LOL-LM":
    PROJECT_ROOT = Path.cwd()
elif (Path.cwd() / "src" / "notebooks").exists():
    PROJECT_ROOT = Path.cwd()
elif (Path.cwd().parent / "src" / "notebooks").exists():
    PROJECT_ROOT = Path.cwd().parent
else:
    # Fallback: assume we're in src/notebooks/ and go up 2 levels
    PROJECT_ROOT = Path(__file__).parent.parent.parent if '__file__' in globals() else Path.cwd().parent.parent

# Add src to path for imports
SRC_DIR = PROJECT_ROOT / "src"
sys.path.insert(0, str(SRC_DIR))

from utils import env  # noqa: F401 - loads .env file
from services.extract_news import ExtractorService
from logger import log

# Set up paths
RAW_DATA_DIR = PROJECT_ROOT / "data" / "fake_news" / "raw"
PROCESSED_DATA_DIR = PROJECT_ROOT / "data" / "fake_news" / "processed"

# Create processed directory if it doesn't exist
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"Current working directory: {Path.cwd()}")
print(f"Project root: {PROJECT_ROOT}")
print(f"Raw data dir: {RAW_DATA_DIR}")
print(f"Processed data dir: {PROCESSED_DATA_DIR}")
print(f"Raw data dir exists: {RAW_DATA_DIR.exists()}")


Current working directory: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/src/notebooks
Project root: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM
Raw data dir: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/fake_news/raw
Processed data dir: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/fake_news/processed
Raw data dir exists: True


In [7]:
# Load the BabylonBee dataset
csv_path = RAW_DATA_DIR / "BabylonBee Dataset.csv"

print(f"Loading dataset from: {csv_path}")
df = pd.read_csv(csv_path)

print(f"\nDataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()


Loading dataset from: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/fake_news/raw/BabylonBee Dataset.csv

Dataset shape: (10890, 7)

Columns: ['Sl No.', 'Headline', 'Humor', 'Mechanism', 'Article link', 'Image link', 'Unnamed: 6']

First few rows:


Unnamed: 0,Sl No.,Headline,Humor,Mechanism,Article link,Image link,Unnamed: 6
0,0,John Leguizamo's Boycott Of Mario Movie Leads ...,1,GP,https://babylonbee.com/news/john-leguizamos-bo...,https://media.babylonbee.com/articles/6439bbc0...,
1,1,Brave Adventurer Discovers Long-Lost Article H...,1,RL,https://babylonbee.com/news/brave-adventurer-d...,https://media.babylonbee.com/articles/6439c1d1...,
2,2,Drunk Irishmen Say They Understood Biden's Dub...,1,RL,https://babylonbee.com/news/drunk-irishmen-ann...,https://media.babylonbee.com/articles/6439b3f3...,
3,3,Report: Crypto Is A Scam! UPDATE: We Were Wron...,1,GP,https://babylonbee.com/news/report-crypto-is-a...,https://media.babylonbee.com/articles/6439a063...,
4,4,Sports Illustrated Puts Bikini On Walrus For L...,1,CR,https://babylonbee.com/news/sports-illustrated...,https://media.babylonbee.com/articles/64399d6c...,


In [8]:
# Check for missing values and data quality
print("Missing values per column:")
print(df.isnull().sum())
print(f"\nTotal rows: {len(df)}")
print(f"Rows with Article link: {df['Article link'].notna().sum()}")
print(f"Rows with Headline: {df['Headline'].notna().sum()}")

# Filter out rows without Article link or Headline
df_clean = df[df['Article link'].notna() & df['Headline'].notna()].copy()
print(f"\nRows after filtering: {len(df_clean)}")


Missing values per column:
Sl No.              0
Headline            0
Humor               0
Mechanism        1028
Article link        0
Image link          1
Unnamed: 6      10890
dtype: int64

Total rows: 10890
Rows with Article link: 10890
Rows with Headline: 10890

Rows after filtering: 10890


In [9]:
# Initialize the article extractor
extractor = ExtractorService()
print("ExtractorService initialized")


ExtractorService initialized


In [10]:
# Extract articles from URLs in parallel
# This will be much faster than sequential processing

# Thread-safe lists for results
processed_data = []
failed_urls = []
data_lock = Lock()

def extract_single_article(row_tuple):
    """Extract a single article from a row tuple (idx, row)."""
    idx, row = row_tuple
    headline = row['Headline']
    url = row['Article link']
    
    try:
        # Extract article content
        article = extractor.extract_article(url)
        
        if article and article.content:
            return {
                'success': True,
                'data': {
                    'Headline': headline,
                    'Article': article.content  # Raw text content
                }
            }
        else:
            return {
                'success': False,
                'data': {
                    'Headline': headline,
                    'URL': url,
                    'Reason': 'Failed to extract content'
                }
            }
    except Exception as e:
        return {
            'success': False,
            'data': {
                'Headline': headline,
                'URL': url,
                'Reason': f'Error: {str(e)}'
            }
        }

# Prepare data for parallel processing
rows_to_process = list(df_clean.iterrows())
total_rows = len(rows_to_process)

print(f"Processing {total_rows} articles in parallel...")
print(f"Using ThreadPoolExecutor with max_workers=10 (adjustable)\n")

# Use ThreadPoolExecutor for I/O-bound operations (network requests)
# Adjust max_workers based on your system and network capacity
# Too many workers may overwhelm the target server or your network
max_workers = 10  # You can adjust this (5-20 is usually good)

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit all tasks
    future_to_row = {
        executor.submit(extract_single_article, row_tuple): row_tuple 
        for row_tuple in rows_to_process
    }
    
    # Process completed tasks with progress bar
    for future in tqdm(as_completed(future_to_row), total=total_rows, desc="Extracting articles"):
        result = future.result()
        
        # Thread-safe append
        with data_lock:
            if result['success']:
                processed_data.append(result['data'])
            else:
                failed_urls.append(result['data'])

print(f"\n✅ Successfully extracted: {len(processed_data)} articles")
print(f"❌ Failed to extract: {len(failed_urls)} articles")
print(f"📊 Success rate: {len(processed_data) / total_rows * 100:.1f}%")


[32m2025-12-15 04:50:20[0m | [1mINFO[0m | Extracting article from URL: https://babylonbee.com/news/john-leguizamos-boycott-of-mario-movie-leads-to-sharp-rise-in-people-googling-who-is-john-leguizamo
[32m2025-12-15 04:50:20[0m | [1mINFO[0m | Extracting article from URL: https://babylonbee.com/news/brave-adventurer-discovers-long-lost-article-hidden-beneath-labyrinth-of-pop-ups-privacy-policies
[32m2025-12-15 04:50:20[0m | [1mINFO[0m | Extracting article from URL: https://babylonbee.com/news/drunk-irishmen-announce-they-understood-bidens-dublin-speech-perfectly
[32m2025-12-15 04:50:20[0m | [1mINFO[0m | Extracting article from URL: https://babylonbee.com/news/report-crypto-is-a-scam-update-we-were-wrong-everyone-buy-crypto-update-oops-crypto-is-a-scam-update-no-maybe-its-not-a-scam


Processing 10890 articles in parallel...
Using ThreadPoolExecutor with max_workers=10 (adjustable)



[32m2025-12-15 04:50:20[0m | [1mINFO[0m | Extracting article from URL: https://babylonbee.com/news/sports-illustrated-puts-bikini-on-walrus-for-latest-body-positive-swimsuit-edition
Extracting articles:   0%|          | 0/10890 [00:00<?, ?it/s][32m2025-12-15 04:50:20[0m | [1mINFO[0m | Extracting article from URL: https://babylonbee.com/news/parents-just-relieved-teen-who-came-home-drunk-wasnt-drinking-bud-light
[32m2025-12-15 04:50:20[0m | [1mINFO[0m | Extracting article from URL: https://babylonbee.com/news/10-things-we-miss-most-about-the-trump-presidency
[32m2025-12-15 04:50:20[0m | [1mINFO[0m | Extracting article from URL: https://babylonbee.com/news/feinstein-steps-away-from-judiciary-committee-to-head-towards-bright-light
[32m2025-12-15 04:50:20[0m | [1mINFO[0m | Extracting article from URL: https://babylonbee.com/news/pentagon-leaker-kicking-himself-for-not-just-leaving-classified-documents-strewn-around-his-garage
[32m2025-12-15 04:50:20[0m | [1mINFO[0m 


✅ Successfully extracted: 10793 articles
❌ Failed to extract: 97 articles
📊 Success rate: 99.1%





In [11]:
# Create DataFrame from processed data
df_processed = pd.DataFrame(processed_data)

print(f"Processed dataset shape: {df_processed.shape}")
print(f"\nColumns: {df_processed.columns.tolist()}")
print(f"\nSample of processed data:")
df_processed.head()


Processed dataset shape: (10793, 2)

Columns: ['Headline', 'Article']

Sample of processed data:


Unnamed: 0,Headline,Article
0,Brave Adventurer Discovers Long-Lost Article H...,"MOAB, UT — A historic discovery was made today..."
1,Drunk Irishmen Say They Understood Biden's Dub...,DUBLIN — Despite claims from conservative medi...
2,John Leguizamo's Boycott Of Mario Movie Leads ...,"MANHATTAN, NY — With The Super Mario Bros. Mov..."
3,Pentagon Leaker Kicking Himself For Not Just L...,"DIGHTON, MA — Military police have arrested Ja..."
4,Parents Just Relieved Teen Who Came Home Drunk...,"NEW BRITAIN, PA — Local parents Tim and Julia ..."


In [12]:
# Check data quality
print("Data quality check:")
print(f"Rows with empty Headline: {df_processed['Headline'].isna().sum()}")
print(f"Rows with empty Article: {df_processed['Article'].isna().sum()}")
print(f"Rows with empty Article (after strip): {df_processed['Article'].str.strip().eq('').sum()}")

# Show article length statistics
article_lengths = df_processed['Article'].str.len()
print(f"\nArticle length statistics:")
print(f"  Min: {article_lengths.min()} characters")
print(f"  Max: {article_lengths.max()} characters")
print(f"  Mean: {article_lengths.mean():.0f} characters")
print(f"  Median: {article_lengths.median():.0f} characters")


Data quality check:
Rows with empty Headline: 0
Rows with empty Article: 0
Rows with empty Article (after strip): 0

Article length statistics:
  Min: 110 characters
  Max: 9283 characters
  Mean: 1354 characters
  Median: 1267 characters


In [13]:
# Filter out rows with empty articles
df_final = df_processed[df_processed['Article'].str.strip().ne('')].copy()
print(f"Final dataset after filtering empty articles: {len(df_final)} rows")
print(f"Removed {len(df_processed) - len(df_final)} rows with empty content")


Final dataset after filtering empty articles: 10793 rows
Removed 0 rows with empty content


In [14]:
# Save the processed dataset
output_path = PROCESSED_DATA_DIR / "babylonbee_processed.csv"
df_final.to_csv(output_path, index=False)
print(f"✅ Saved processed dataset to: {output_path}")
print(f"   Total rows: {len(df_final)}")
print(f"   Columns: {df_final.columns.tolist()}")


✅ Saved processed dataset to: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/fake_news/processed/babylonbee_processed.csv
   Total rows: 10793
   Columns: ['Headline', 'Article']


In [15]:
# Optionally save failed extractions for debugging
if failed_urls:
    df_failed = pd.DataFrame(failed_urls)
    failed_path = PROCESSED_DATA_DIR / "babylonbee_failed_extractions.csv"
    df_failed.to_csv(failed_path, index=False)
    print(f"\n⚠️  Saved {len(failed_urls)} failed extractions to: {failed_path}")



⚠️  Saved 97 failed extractions to: /Users/rorosaga/Documents/ie_university/year_4/advanced_ai/repos/LOL-LM/data/fake_news/processed/babylonbee_failed_extractions.csv


In [16]:
# Display a sample of the final processed data
print("Sample of processed data:")
print("=" * 80)
for idx, row in df_final.head(3).iterrows():
    print(f"\nHeadline: {row['Headline']}")
    print(f"Article preview (first 200 chars): {row['Article'][:200]}...")
    print("-" * 80)


Sample of processed data:

Headline: Brave Adventurer Discovers Long-Lost Article Hidden Beneath Labyrinth Of Ads, Pop-Ups, Privacy Policies
Article preview (first 200 chars): MOAB, UT — A historic discovery was made today as a brave adventurer uncovered an internet article long thought to be lost forever underneath layers upon layers of pop-up ads, privacy policies, and ne...
--------------------------------------------------------------------------------

Headline: Drunk Irishmen Say They Understood Biden's Dublin Speech Perfectly
Article preview (first 200 chars): DUBLIN — Despite claims from conservative media pundits that President Joe Biden's tour of Ireland was a disaster due to several verbal gaffes and nonsensical statements, a group of intoxicated Irishm...
--------------------------------------------------------------------------------

Headline: John Leguizamo's Boycott Of Mario Movie Leads To Sharp Rise In People Googling 'Who Is John Leguizamo?'
Article preview (first 200