## **Data Preparation**

Scraping all relevant data needed on maternal health care and reproductive system from the appropriate websites. 

World Health Organization

In [6]:
#libraries
import requests
from bs4 import BeautifulSoup
import re
import json
import os
from urllib.parse import urlparse
#from scripts.loggingsetup import error_logger, success_logger
print("Libraries imported")

Libraries imported


In [29]:
#function to fetch and save structured data from a web page

def get_data(url):
    try:
        # Get the data from the website
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise error if bad response

        # Parse the data using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all sections with article content
        sections = soup.find_all("div", {"data-testid": "tabbed-article-section"})

        if not sections:
            print("❌ No article content found.")
            return None

        # Combine all the article text
        article_text = ""

        for section in sections:
            for tag in section.find_all(['h2', 'h3', 'p']):
                text = tag.get_text(strip=True)
                if not text:
                    continue
            #ensuring that there is space between headings and paragraphs
                if tag.name == 'h2':
                    article_text += f"\n\n## {text}\n\n"
                elif tag.name == 'h3':
                    article_text += f"\n\n### {text}\n\n"
                else:  # paragraph
                    article_text += f"{text}\n\n"

        # Generate a file name based on URL path
        path = urlparse(url).path.strip('/')
        file_name = path.replace('/', '_') or 'index'
        file_path = f"{file_name}.txt"

        # Save to file in the data folder
        data_folder = r"C:\Projects_ML\AI-for-Maternal-HealthCare\data"
        if not os.path.exists(data_folder):
            os.makedirs(data_folder)

        full_path = os.path.join(data_folder, file_path)
        with open(full_path, 'w', encoding='utf-8') as f:
            f.write(article_text)

        print(f"✅ Data saved to {full_path}")
        return article_text

    except Exception as e:
        print(f"❌ Error processing {url}: {str(e)}")
        return None


In [30]:
#data extraction
get_data("https://psychcentral.com/depression/depression-busters-for-new-moms")

✅ Data saved to C:\Projects_ML\AI-for-Maternal-HealthCare\data\depression_depression-busters-for-new-moms.txt


"Postpartum depression can affect anyone. Here are some tips for coping with it.\n\nThe birth of a baby can bring a lot of change to someone’s life. Some of these changes are expected, like lots of dirty diapers, new feeding schedules, and sleep deprivation.\n\nBut some changes are less expected — such as those that affect your mental and emotional health, likepostpartum depression (PPD).\n\nPPD is a form ofmajor depressive disorder (MDD)that develops in a parent in the year following the birth of their child. Many factors maycause postpartum depression.\n\n“It’s similar to major depression in that the main symptoms are feeling depressed and disinterested,” explains Kristin Calverley, a licensed psychologist in Texas certified in perinatal mental health and owner ofInner Balance Psychological Services.\n\nOther symptoms you might experience include, but aren’t limited, to:\n\nIf you’re experiencing any of these symptoms for several days or weeks on end, you might be experiencing PPD. W

In [None]:
#function 2 
def get_data2(url):
    try:
        import requests
        from bs4 import BeautifulSoup
        from urllib.parse import urlparse
        import os

        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Look for the main content div
        article_container = soup.find("div", class_="entry-content")
        if not article_container:
            print("❌ Could not find article container with class 'entry-content'.")
            return None

        # Extract text content from all paragraphs and headings
        article_text = ""
        for tag in article_container.find_all(['p', 'h2', 'h3']):
            text = tag.get_text(strip=True)
            if text:
                article_text += text + "\n\n"

        # Create a clean filename from the URL
        path = urlparse(url).path.strip('/')
        file_name = path.replace('/', '_') or 'index'
        file_path = f"{file_name}.txt"

        # Save to file
        data_folder = r"C:\Projects_ML\AI-for-Maternal-HealthCare\data"
        os.makedirs(data_folder, exist_ok=True)
        full_path = os.path.join(data_folder, file_path)

        with open(full_path, 'w', encoding='utf-8') as f:
            f.write(article_text)

        print(f"✅ Article saved to {full_path}")
        return article_text

    except Exception as e:
        print(f"❌ Error processing {url}: {str(e)}")
        return None
