In [9]:
import time
import logging
from pathlib import Path

from config import ScraperConfig
from wayback_scraper import WaybackMachineScraper
from url_extractor import URLExtractor
from article_fetcher import ArticleFetcher

logger = logging.getLogger(__name__)

In [None]:
"""Main function to run the news scraper"""
# Driver cell: runs the scraper pipeline once for every site in
# config.target_sites and logs how long each step and the whole run took.
#
# Durations are measured with time.perf_counter() rather than time.time():
# perf_counter() is monotonic, so the reported intervals cannot be skewed by
# system clock adjustments during a long scraping run.
# NOTE: start_time is therefore a perf_counter reading, not an epoch timestamp.
start_time = time.perf_counter()
logger.info("Starting news scraper")

# Initialize configuration
config = ScraperConfig()

# Initialize scrapers (all three are built up front, even though only the
# Wayback Machine step is currently enabled in the loop below)
wayback_scraper = WaybackMachineScraper(config)
url_extractor = URLExtractor(config)
article_fetcher = ArticleFetcher(config)

# Process each target site
for site_name in config.target_sites:
    logger.info("Processing %s", site_name)

    # Step 1: Get Wayback Machine snapshots
    step_start = time.perf_counter()
    wayback_scraper.get_snapshots(site_name)
    logger.info(
        "Step 1 (Get snapshots) completed in %.2f seconds",
        time.perf_counter() - step_start,
    )

    # Steps 2 and 3 are intentionally disabled for this run; uncomment to
    # re-enable them.
    # # Step 2: Extract URLs from snapshots
    # step_start = time.perf_counter()
    # url_extractor.extract_urls(site_name)
    # logger.info(
    #     "Step 2 (Extract URLs) completed in %.2f seconds",
    #     time.perf_counter() - step_start,
    # )

    # # Step 3: Fetch articles
    # step_start = time.perf_counter()
    # article_fetcher.fetch_articles(site_name)
    # logger.info(
    #     "Step 3 (Fetch articles) completed in %.2f seconds",
    #     time.perf_counter() - step_start,
    # )

total_time = time.perf_counter() - start_time
logger.info("News scraper completed in %.2f seconds", total_time)

2025-03-19 01:09:05,032 - INFO - Starting news scraper
2025-03-19 01:09:05,037 - INFO - Loaded configuration with 2 target sites
2025-03-19 01:09:05,040 - INFO - Processing cnn
2025-03-19 01:09:05,043 - INFO - Fetching Wayback Machine snapshots for cnn (https://cnn.com/us) from 2020 to 2023
2025-03-19 01:09:05,043 - INFO - Checking snapshots for https://cnn.com/us
2025-03-19 01:09:20,711 - INFO - Snapshot URL points to https://www.cnn.com/us, which differs from requested https://cnn.com/us
2025-03-19 01:09:20,715 - INFO - Found snapshot for https://cnn.com/us on 2020-1-2: http://web.archive.org/web/20200103001622/https://www.cnn.com/us
2025-03-19 01:09:20,716 - INFO - Saved snapshot for https://cnn.com/us on 2020-1-2
2025-03-19 01:09:22,366 - INFO - Snapshot URL points to https://www.cnn.com/us, which differs from requested https://cnn.com/us
2025-03-19 01:09:22,367 - INFO - Found snapshot for https://cnn.com/us on 2020-1-3: http://web.archive.org/web/20200104003023/https://www.cnn.com