In [None]:
import time
import logging
from pathlib import Path

from config import ScraperConfig
from wayback_scraper import WaybackMachineScraper
from url_extractor import URLExtractor
from article_fetcher import ArticleFetcher

# Emit INFO-level records so the per-step progress and timing messages logged
# by the cells below are actually displayed; with no configuration the root
# logger's default WARNING threshold silently drops them. basicConfig() is a
# no-op when the root logger already has handlers, so any logging setup done
# by the imported project modules is left untouched.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize configuration; this single instance is handed to every pipeline
# component constructed in the later cells.
config = ScraperConfig()

In [None]:
# One scraper instance, built from the shared config, is reused for every
# target site.
wayback_scraper = WaybackMachineScraper(config)

# Process each target site
for site_name in config.target_sites:
    logger.info(f"Processing {site_name}")

    # Step 1: Get Wayback Machine snapshots
    # The return value is not captured here; presumably get_snapshots() stores
    # its results itself -- TODO confirm against wayback_scraper.
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    step_start = time.perf_counter()
    wayback_scraper.get_snapshots(site_name)
    logger.info(f"Step 1 (Get snapshots) completed in {time.perf_counter() - step_start:.2f} seconds")

In [None]:
# One extractor instance, built from the shared config, is reused for every
# target site.
url_extractor = URLExtractor(config)

for site_name in config.target_sites:
    logger.info(f"Processing {site_name}")

    # Step 2: Extract URLs from snapshots
    # Nothing is passed in from the snapshot step and the return value is not
    # captured; presumably extract_urls() reads and writes its own storage --
    # TODO confirm against url_extractor.
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    step_start = time.perf_counter()
    url_extractor.extract_urls(site_name)
    logger.info(f"Step 2 (Extract URLs) completed in {time.perf_counter() - step_start:.2f} seconds")

In [None]:
# One fetcher instance, built from the shared config, is reused for every
# target site.
article_fetcher = ArticleFetcher(config)

for site_name in config.target_sites:
    logger.info(f"Processing {site_name}")

    # Step 3: Fetch articles
    # Nothing is passed in from the URL-extraction step and the return value
    # is not captured; presumably fetch_articles() reads and writes its own
    # storage -- TODO confirm against article_fetcher.
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    step_start = time.perf_counter()
    article_fetcher.fetch_articles(site_name)
    logger.info(f"Step 3 (Fetch articles) completed in {time.perf_counter() - step_start:.2f} seconds")