In [None]:

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_article_content(url, timeout=10):
    """
    Scrapes article content from a specified URL by extracting titles and their corresponding paragraphs.

    Only headings inside the page's <main> element are considered. For each
    <h2>, the <p> elements that follow it at the same nesting level are
    collected until the next <h2> sibling is reached.

    Parameters:
        url: The URL of the webpage to scrape.
        timeout: Seconds to wait for the server before giving up (default 10).

    Returns:
        A list of dictionaries, where each dictionary contains:
            'Title': The section title (from <h2> tags).
            'Content': The associated paragraphs (from <p> tags), joined with spaces.
        Sections with an empty title or no paragraph text are omitted. An empty
        list is returned when the page has no <main> element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    main_content = soup.find('main')
    if main_content is None:
        return []

    articles = []

    # Headings <h2> indicate sections
    for section in main_content.find_all('h2'):
        title = section.get_text(strip=True)

        # Extract content paragraphs following the title; the next <h2>
        # sibling marks the start of the following section.
        paragraphs = []
        for sibling in section.find_next_siblings():
            if sibling.name == 'h2':
                break
            if sibling.name == 'p':
                paragraphs.append(sibling.get_text(strip=True))

        content = " ".join(paragraphs)
        if title and content:
            articles.append({'Title': title, 'Content': content})

    return articles

def save_to_csv(data, file_name):
    """
    Saves the scraped data into a CSV file after cleaning rows with empty content.

    Parameters:
        data: A list of dictionaries containing 'Title' and 'Content'.
        file_name: The name of the CSV file to save the data.

    Returns:
        None. The cleaned data is saved to the specified CSV file.

    Rows whose 'Content' is missing, is not a string, or is only whitespace
    are dropped. When no rows remain, the file still gets a
    'Title,Content' header row so it can be read back as an empty table.
    """
    clean_data = []
    for row in data:
        content = row.get('Content')
        # Tolerate missing/None content instead of raising KeyError/AttributeError.
        if isinstance(content, str) and content.strip():
            clean_data.append(row)

    if clean_data:
        df = pd.DataFrame(clean_data)
    else:
        # An empty DataFrame has no columns; name them so the header is written.
        df = pd.DataFrame(columns=['Title', 'Content'])

    df.to_csv(file_name, index=False)
    print(f"Data saved to {file_name}")

# List of URLs to scrape. Every entry is on www.health.harvard.edu. Most are
# individual article pages; a few (the /topics/... and /diseases-and-conditions
# entries) look like index pages rather than single articles.
urls = [
    "https://www.health.harvard.edu/exercise-and-fitness/essential-stretches-to-fight-stiff-winter-muscles",
    "https://www.health.harvard.edu/topics/exercise-and-fitness",
    "https://www.health.harvard.edu/diseases-and-conditions",
    "https://www.health.harvard.edu/exercise-and-fitness/how-to-love-the-exercises-you-hate",
    "https://www.health.harvard.edu/exercise-and-fitness/hiit-workouts-for-older-adults-a-guide-to-safe-and-effective-high-intensity-interval-training",
    "https://www.health.harvard.edu/exercise-and-fitness/calisthenics-an-effective-low-frills-way-to-stay-fit",
    "https://www.health.harvard.edu/exercise-and-fitness/cognitive-benefits-from-high-intensity-interval-training-may-last-for-years",
    "https://www.health.harvard.edu/blog/boosting-your-childs-immune-system-202110122614",
    "https://www.health.harvard.edu/exercise-and-fitness/fit-balance-exercises-into-a-busy-day",
    "https://www.health.harvard.edu/staying-healthy/abdominal-fat-and-what-to-do-about-it",
    "https://www.health.harvard.edu/staying-healthy/does-exercising-at-night-affect-sleep",
    "https://www.health.harvard.edu/topics/diet-and-weight-loss#diet-weight-loss0",
    "https://www.health.harvard.edu/blog/preventing-sudden-heart-death-in-children-4-questions-can-help-202107092540",
    "https://www.health.harvard.edu/staying-healthy/tips-to-avoid-constipation",
    "https://www.health.harvard.edu/heart-health/what-your-heart-rate-is-telling-you",
    "https://www.health.harvard.edu/exercise-and-fitness/walking-advice-from-a-master-walker",
    "https://www.health.harvard.edu/pain/exercise-an-effective-prescription-for-joint-pain",
    "https://www.health.harvard.edu/blog/does-running-cause-arthritis-202304262930",
    "https://www.health.harvard.edu/staying-healthy/extra-support-for-better-health",
    "https://www.health.harvard.edu/staying-healthy/dont-let-muscle-mass-go-to-waste",
    "https://www.health.harvard.edu/heart-health/exercising-when-you-have-a-heart-condition",
    "https://www.health.harvard.edu/exercise-and-fitness/age-and-muscle-loss",
    "https://www.health.harvard.edu/exercise-and-fitness/whats-the-minimum-amount-of-exercise-i-need-each-week",
    "https://www.health.harvard.edu/exercise-and-fitness/adding-strength-training-to-aerobic-exercise-may-fuel-longevity",
    "https://www.health.harvard.edu/exercise-and-fitness/rethinking-cardio-exercise",
    "https://www.health.harvard.edu/exercise-and-fitness/the-best-exercises-for-your-warm-up",
    "https://www.health.harvard.edu/mind-and-mood/a-workout-for-your-brain",
    "https://www.health.harvard.edu/nutrition/xylitol-what-to-know-about-this-popular-sugar-substitute",
    "https://www.health.harvard.edu/staying-healthy/a-healthy-lifestyle-late-in-life-still-offers-benefits",
    "https://www.health.harvard.edu/heart-health/grain-of-the-month-brown-rice",
    "https://www.health.harvard.edu/blog/weighing-in-on-weight-gain-from-antidepressants-202408023063",
    "https://www.health.harvard.edu/staying-healthy/should-i-use-a-continuous-glucose-monitor",
    "https://www.health.harvard.edu/staying-healthy/tips-to-change-your-night-owl-lifestyle",
    "https://www.health.harvard.edu/nutrition/what-are-the-differences-between-popular-low-carb-diets",
    "https://www.health.harvard.edu/staying-healthy/the-truth-about-metabolism",
    "https://www.health.harvard.edu/nutrition/what-is-the-carnivore-diet",
    "https://www.health.harvard.edu/heart-health/keto-diet-is-not-healthy-and-may-harm-the-heart",
    "https://www.health.harvard.edu/staying-healthy/can-berberine-help-me-lose-weight",
    "https://www.health.harvard.edu/staying-healthy/losing-weight-can-help-you-lose-the-pain-too",
    "https://www.health.harvard.edu/staying-healthy/calorie-counting-made-easy",
    "https://www.health.harvard.edu/nutrition/eating-high-quality-carbohydrates-may-stave-off-middle-age-weight-gain",
    "https://www.health.harvard.edu/newsletter_article/taking-aim-at-belly-fat",
    "https://www.health.harvard.edu/blog/does-drinking-water-before-meals-really-help-you-lose-weight-202402203018",
    "https://www.health.harvard.edu/staying-healthy/what-can-i-do-about-my-beer-belly",
    "https://www.health.harvard.edu/staying-healthy/questions-and-answers-about-the-new-anti-obesity-medications",
]


# Scrape every URL in turn and accumulate the resulting sections in one list.
all_articles = []

for url in urls:
    print(f"Scraping: {url}")
    try:
        scraped_data = scrape_article_content(url)
    except requests.RequestException as exc:
        # One unreachable or erroring page must not discard what was already
        # scraped; report it and move on to the next URL.
        print(f"Failed to scrape {url}: {exc}")
        continue
    all_articles.extend(scraped_data)

# Write whatever was collected, even if some URLs failed.
save_to_csv(all_articles, "articles_combined_all_1.csv")

Scraping: https://www.health.harvard.edu/exercise-and-fitness/essential-stretches-to-fight-stiff-winter-muscles
Scraping: https://www.health.harvard.edu/topics/exercise-and-fitness
Scraping: https://www.health.harvard.edu/diseases-and-conditions
Scraping: https://www.health.harvard.edu/exercise-and-fitness/how-to-love-the-exercises-you-hate
Scraping: https://www.health.harvard.edu/exercise-and-fitness/hiit-workouts-for-older-adults-a-guide-to-safe-and-effective-high-intensity-interval-training
Scraping: https://www.health.harvard.edu/exercise-and-fitness/calisthenics-an-effective-low-frills-way-to-stay-fit
Scraping: https://www.health.harvard.edu/exercise-and-fitness/cognitive-benefits-from-high-intensity-interval-training-may-last-for-years
Scraping: https://www.health.harvard.edu/blog/boosting-your-childs-immune-system-202110122614
Scraping: https://www.health.harvard.edu/exercise-and-fitness/fit-balance-exercises-into-a-busy-day
Scraping: https://www.health.harvard.edu/staying-healt