In [1]:
pip install requests beautifulsoup4





In [3]:
import requests
from bs4 import BeautifulSoup
import time
import os

# Pages to scrape. main() walks this list in order, one request per entry.
# Every entry is an absolute https URL on www.healthsystemtracker.org, under
# either /brief/ or /chart-collection/. save_to_file() derives each output
# file name from the URL text.
urls = [
    "https://www.healthsystemtracker.org/chart-collection/quality-u-s-healthcare-system-compare-countries/",
    "https://www.healthsystemtracker.org/brief/how-many-adults-with-private-health-insurance-could-use-glp-1-drugs/",
    "https://www.healthsystemtracker.org/chart-collection/beyond-cost-what-barriers-to-health-care-do-consumers-face/",
    "https://www.healthsystemtracker.org/chart-collection/what-are-the-recent-trends-in-health-utilization-and-spending/",
    "https://www.healthsystemtracker.org/brief/can-econsults-reduce-commercial-health-spending-learnings-from-an-arkansas-pilot-study/",
    "https://www.healthsystemtracker.org/brief/how-many-people-with-employer-sponsored-insurance-use-the-drugs-slated-for-medicare-price-negotiations/",
    "https://www.healthsystemtracker.org/brief/how-much-and-why-aca-marketplace-premiums-are-going-up-in-2025/",
    "https://www.healthsystemtracker.org/brief/how-does-medical-inflation-compare-to-inflation-in-the-rest-of-the-economy/",
    "https://www.healthsystemtracker.org/brief/what-drives-health-spending-in-the-u-s-compared-to-other-countries/",
    "https://www.healthsystemtracker.org/brief/insurer-strategies-to-control-costs-associated-with-weight-loss-drugs/",
    "https://www.healthsystemtracker.org/brief/medical-debt-among-new-mothers/",
    "https://www.healthsystemtracker.org/chart-collection/what-are-the-recent-trends-health-sector-employment/",
    "https://www.healthsystemtracker.org/brief/charges-for-emails-with-doctors-and-other-healthcare-providers/",
    "https://www.healthsystemtracker.org/brief/the-burden-of-medical-debt-in-the-united-states/",
    "https://www.healthsystemtracker.org/brief/how-financially-vulnerable-are-people-with-medical-debt/",
    "https://www.healthsystemtracker.org/brief/policy-issues-and-trends-2024/",
    "https://www.healthsystemtracker.org/chart-collection/health-spending-u-s-compare-countries/",
    "https://www.healthsystemtracker.org/chart-collection/cost-affect-access-care/",
    "https://www.healthsystemtracker.org/chart-collection/health-expenditures-vary-across-population/",
    "https://www.healthsystemtracker.org/chart-collection/trends-in-employer-based-health-coverage/",
    "https://www.healthsystemtracker.org/brief/surprise-billing-disputes-year-one/",
    "https://www.healthsystemtracker.org/brief/telehealth-payments-similar-early-in-the-pandemic/"
]

def fetch_page(url):
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop-browser ``User-Agent`` header and a
    10-second timeout. Any ``requests.RequestException`` (including the
    HTTP error raised for a 4xx/5xx status) is reported on stdout and
    turned into a ``None`` return instead of propagating.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or ``None`` if the request failed.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"
    }
    try:
        resp = requests.get(url, headers=browser_headers, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they share the handler below.
        resp.raise_for_status()
        page_text = resp.text
    except requests.RequestException as exc:
        print(f"Error fetching {url}: {exc}")
        return None
    return page_text

def parse_content(html):
    """Extract the title, paragraph texts and link targets from an HTML page.

    Args:
        html: HTML markup of the page as a string.

    Returns:
        A dict with three keys:
          * ``"title"``: text of the ``<title>`` element, or ``"No title"``
            when the document has none.
          * ``"paragraphs"``: one string per ``<p>`` element, in document
            order, with runs of whitespace collapsed to single spaces.
          * ``"links"``: the raw ``href`` value of every ``<a>`` element
            that has one, in document order (relative links are returned
            as written, not resolved).
    """
    soup = BeautifulSoup(html, "html.parser")

    title = soup.title.get_text(strip=True) if soup.title else "No title"

    # get_text(strip=True) strips each text fragment and then joins the
    # fragments with an empty separator, which glues words together around
    # inline tags ("see <a>this</a> page" -> "seethispage"). Taking the
    # unstripped text and collapsing whitespace keeps the word boundaries
    # that were present in the markup while still trimming the ends.
    paragraphs = [" ".join(p.get_text().split()) for p in soup.find_all("p")]

    # href=True restricts the search to anchors that actually carry an href.
    links = [a["href"] for a in soup.find_all("a", href=True)]

    return {
        "title": title,
        "paragraphs": paragraphs,
        "links": links
    }

def save_to_file(data, url, folder="scraped_data"):
    """Write one scraped page to a UTF-8 text file named after its URL.

    The file name is the URL without its ``http://`` / ``https://`` scheme,
    with path separators and characters that are not allowed in Windows
    file names replaced by ``_``, leading/trailing ``_`` removed, and
    ``.txt`` appended. The file contains the URL, the title, then one
    paragraph per line and one link per line.

    Saving is best-effort: any failure (including failure to create
    ``folder``) is reported on stdout and the function returns normally so
    that a caller looping over many pages can continue.

    Args:
        data: Mapping with optional ``"title"`` (str), ``"paragraphs"``
            (iterable of str) and ``"links"`` (iterable of str) entries.
        url: Source URL of the page; used for the file name and header line.
        folder: Directory to write into; created if it does not exist.

    Returns:
        None.
    """
    # Path separators, the characters Windows rejects in file names, and
    # ASCII control characters. Replacing only "/" (as before) let URLs with
    # a port, a query string, or a backslash produce invalid or nested paths.
    unsafe_chars = '<>:"/\\|?*' + "".join(map(chr, range(32)))
    to_underscore = str.maketrans(unsafe_chars, "_" * len(unsafe_chars))

    without_scheme = url.replace("https://", "").replace("http://", "")
    safe_url = without_scheme.translate(to_underscore).strip("_")
    filename = os.path.join(folder, f"{safe_url}.txt")

    try:
        # Inside the try so that an uncreatable folder is reported like any
        # other save error instead of aborting the caller.
        os.makedirs(folder, exist_ok=True)

        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"URL: {url}\n")
            f.write(f"Title: {data.get('title', 'N/A')}\n\n")

            f.write("Paragraphs:\n")
            for para in data.get("paragraphs", []):
                f.write(para + "\n")
            f.write("\nLinks:\n")
            for link in data.get("links", []):
                f.write(link + "\n")
        print(f"Data saved to {filename}")
    except Exception as e:
        # Deliberately broad: this is the best-effort boundary for one page,
        # and the error is surfaced rather than swallowed.
        print(f"Error saving data to {filename}: {e}")

def main():
    """Scrape every entry in ``urls`` and save each page to disk.

    For each URL, in list order: announce it on stdout, download it with
    ``fetch_page``, and, if a non-empty body came back, parse it with
    ``parse_content`` and store the result with ``save_to_file``. A
    two-second pause follows every URL, whether or not it succeeded.
    """
    for page_url in urls:
        print(f"Scraping: {page_url}")
        page_html = fetch_page(page_url)
        if page_html:
            save_to_file(parse_content(page_html), page_url)
        # Pause between requests to avoid hammering the site.
        time.sleep(2)

# Start the scrape only when this code is executed directly (where __name__
# is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


Scraping: https://www.healthsystemtracker.org/chart-collection/quality-u-s-healthcare-system-compare-countries/
Data saved to scraped_data\www.healthsystemtracker.org_chart-collection_quality-u-s-healthcare-system-compare-countries.txt
Scraping: https://www.healthsystemtracker.org/brief/how-many-adults-with-private-health-insurance-could-use-glp-1-drugs/
Data saved to scraped_data\www.healthsystemtracker.org_brief_how-many-adults-with-private-health-insurance-could-use-glp-1-drugs.txt
Scraping: https://www.healthsystemtracker.org/chart-collection/beyond-cost-what-barriers-to-health-care-do-consumers-face/
Data saved to scraped_data\www.healthsystemtracker.org_chart-collection_beyond-cost-what-barriers-to-health-care-do-consumers-face.txt
Scraping: https://www.healthsystemtracker.org/chart-collection/what-are-the-recent-trends-in-health-utilization-and-spending/
Data saved to scraped_data\www.healthsystemtracker.org_chart-collection_what-are-the-recent-trends-in-health-utilization-and-s