In [5]:
# Simple WebMD Medicinal Plant Scraper
# This script discovers medicinal plants from WebMD and extracts their information

# Step 1: Import the tools we need
import time  # This helps us add delays between requests
from urllib.parse import urljoin  # This helps us turn page links into full web addresses

import pandas as pd  # This helps us organize data in tables (like Excel)
import requests  # This helps us visit websites (like opening a browser)
from bs4 import BeautifulSoup  # This helps us read and understand HTML

# Step 2: Set up our "browser"
# Both request functions below send these headers with every page fetch, so the
# site sees a User-Agent string that looks like a regular desktop browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36"
    ),
}

# Step 3: Define the main function that gets a list of herbs
def get_herb_list(limit=5):
    """
    Get herbs from WebMD's alphabetical listing and scrape each one's details.

    Only the page for the letter 'A' is visited, to keep things simple.

    Args:
        limit: Maximum number of herb links to process. Defaults to 5 so a
            test run stays short; pass None to process every link found.

    Returns:
        A list of dictionaries (one per herb, as returned by
        get_herb_details) for the herbs that were scraped successfully.
        The list is empty if the listing page could not be fetched or did
        not answer with status code 200.
    """
    print("Getting list of herbs from WebMD...")

    base_url = "https://www.webmd.com"

    # The URL where WebMD lists herbs starting with 'A'
    # NOTE(review): the output captured with this script shows a 404 for this
    # URL - confirm the current address of the listing page.
    url = base_url + "/vitamins-and-supplements/vitamins-supplements-a-z-list/a"

    # Step 3.1: Visit the webpage
    # Only the network call sits inside the try, so a bug further down is not
    # mistaken for a connection problem.
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(f"✗ Error connecting to website: {e}")
        return []

    # Check if the page loaded successfully (status code 200 means success)
    if response.status_code != 200:
        print(f"✗ Problem: Got status code {response.status_code}")
        return []
    print("✓ Successfully connected to WebMD")

    # Step 3.2: Parse (read and understand) the HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Step 3.3: Find all the herb links
    # On WebMD, herb links look like: /vitamins/ai/ingredientmono-XXX/name
    herb_links = soup.find_all(
        'a', href=lambda href: href and '/vitamins/ai/ingredientmono-' in href
    )

    print(f"✓ Found {len(herb_links)} herbs starting with 'A'")

    # Step 3.4: Pick the links to process and create a list for the results
    selected_links = herb_links[:limit]
    total = len(selected_links)  # may be smaller than limit on a short page
    herb_list = []

    # Step 3.5: Go through each selected herb link
    for i, link in enumerate(selected_links, 1):
        # Get the herb name from the link text
        herb_name = link.get_text(strip=True)

        # Get the full URL to the herb's page. urljoin copes with both
        # site-relative links and links that are already absolute.
        herb_url = urljoin(base_url, link['href'])

        print(f"\n[{i}/{total}] Processing: {herb_name}")
        print(f"    URL: {herb_url}")

        # Get detailed information about this herb
        herb_info = get_herb_details(herb_url, herb_name)

        if herb_info:
            herb_list.append(herb_info)
            print(f"    ✓ Successfully extracted information")
        else:
            print(f"    ✗ Could not extract information")

        # Wait 2 seconds before visiting the next page (being polite to the
        # server). There is nothing to wait for after the last one.
        if i < total:
            time.sleep(2)

    return herb_list


# Step 4: Define function to get details from each herb's page
def _find_heading(soup, keyword):
    """Return the first <h2>/<h3> whose string contains keyword, or None."""
    return soup.find(['h2', 'h3'], string=lambda t: t and keyword in t)


def _text_after(heading, tags, limit):
    """Return the text of the first element in tags after heading, cut to limit characters."""
    if heading is None:
        return ''
    content = heading.find_next(tags)
    if content is None:
        return ''
    return content.get_text(strip=True)[:limit]


def get_herb_details(url, herb_name):
    """
    This function visits an individual herb's page and extracts all the details.

    Args:
        url: The web address of the herb's page
        herb_name: The name of the herb

    Returns:
        A dictionary (like a mini-database) with all the herb information.
        Fields that could not be found on the page are left as empty
        strings. Returns None if the page did not answer with status code
        200 or if any error occurred while fetching or reading it.
    """

    try:
        # Visit the herb's page
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            print(f"    ✗ Problem: Got status code {response.status_code}")
            return None

        # Parse the HTML
        soup = BeautifulSoup(response.content, 'html.parser')

        # Create a dictionary to store all information.
        # Activity_Level, Part_Used and Extraction_Method are never filled in
        # by this function; they stay empty so the columns exist in the table.
        herb_data = {
            'Plant_Name': herb_name,
            'Source': 'WebMD',
            'URL': url,
            'Scientific_Name': '',
            'Other_Names': '',
            'Target_Function': '',
            'Activity_Level': '',
            'Part_Used': '',
            'Key_Phytochemicals': '',
            'Extraction_Method': '',
            'Dosage_Range': '',
            'Contraindications': ''
        }

        # Step 4.1: Look for "Also Known As" section (contains other names)
        # We search for a piece of text that contains "Also Known As"
        aka_heading = soup.find(string=lambda t: t and 'Also Known As' in t)
        if aka_heading:
            # Get the parent element (the container holding this text)
            aka_section = aka_heading.find_parent()
            if aka_section is not None:
                # Get all the text, then remove the "Also Known As:" label
                aka_text = aka_section.get_text(strip=True)
                aka_text = aka_text.replace('Also Known As:', '').strip()
                herb_data['Other_Names'] = aka_text[:500]  # Limit to 500 characters

        # Step 4.2: Look for scientific name
        # Heuristic: the first italic text made of exactly two words that
        # starts with a capital letter (format: Genus species)
        for italic in soup.find_all('i'):
            name_text = italic.get_text(strip=True)
            if len(name_text.split()) == 2 and name_text[0].isupper():
                herb_data['Scientific_Name'] = name_text
                break

        # Step 4.3: Look for "Uses" section
        herb_data['Target_Function'] = _text_after(
            _find_heading(soup, 'Uses'), ['p', 'div', 'ul'], 1000
        )

        # Step 4.4: Look for "How does it work?" section (contains phytochemicals/mechanism)
        herb_data['Key_Phytochemicals'] = _text_after(
            _find_heading(soup, 'How does it work'), ['p', 'div'], 1000
        )

        # Step 4.5: Look for dosing information
        herb_data['Dosage_Range'] = _text_after(
            _find_heading(soup, 'Dosing'), ['p', 'div', 'ul'], 1000
        )

        # Step 4.6: Look for safety/interactions/side effects
        # Check multiple possible headings. One heading can match more than
        # one keyword (for example "Precautions" and "Warnings"), so remember
        # which headings were already used to avoid repeating the same text.
        safety_keywords = ['Side Effects', 'Safety', 'Interactions', 'Precautions', 'Warnings']
        used_headings = set()
        safety_parts = []
        for keyword in safety_keywords:
            safety_heading = _find_heading(soup, keyword)
            if safety_heading is None or id(safety_heading) in used_headings:
                continue
            used_headings.add(id(safety_heading))
            safety_text = _text_after(safety_heading, ['p', 'div', 'ul'], 500)
            if safety_text:  # skip empty sections so no stray separator appears
                safety_parts.append(safety_text)
        herb_data['Contraindications'] = ' | '.join(safety_parts)

        return herb_data

    except Exception as e:
        # Best-effort scraping: one bad page is reported and skipped so the
        # caller can carry on with the next herb.
        print(f"    Error: {e}")
        return None


# Step 5: Main execution
if __name__ == "__main__":
    # Banner describing what the run is going to do
    print("="*70)
    print("SIMPLE WEBMD MEDICINAL PLANT SCRAPER")
    print("="*70)
    print("\nThis script will:")
    print("1. Visit WebMD's herb list (letter A)")
    print("2. Get the first 5 herbs")
    print("3. Extract detailed information for each herb")
    print("4. Save everything to a CSV file")
    print("\n" + "="*70 + "\n")

    # Get the list of herbs with their details
    herbs = get_herb_list()

    # Step 6: Save to CSV (like Excel)
    if herbs:
        # Convert list of dictionaries to a DataFrame (table)
        df = pd.DataFrame(herbs)

        # Save to CSV file. 'utf-8-sig' writes a byte-order mark at the start
        # of the file so that Excel recognises the text as UTF-8 and shows
        # accented or other non-English characters correctly.
        filename = 'webmd_herbs_simple.csv'
        df.to_csv(filename, index=False, encoding='utf-8-sig')

        print("\n" + "="*70)
        print(f"SUCCESS! Found {len(herbs)} herbs")
        print(f"Data saved to: {filename}")
        print("\n" + "="*70)

        # Show a preview of what we got
        print("\nPREVIEW OF DATA:")
        print("-"*70)
        for herb in herbs:
            print(f"\n🌿 {herb['Plant_Name']}")
            if herb['Scientific_Name']:
                print(f"   Scientific: {herb['Scientific_Name']}")
            if herb['Target_Function']:
                # Keep the preview short: show at most the first 100 characters
                uses = herb['Target_Function']
                if len(uses) > 100:
                    uses = uses[:100] + "..."
                print(f"   Uses: {uses}")

        print("\n" + "="*70)
        print(f"\nYou can now open '{filename}' in Excel!")

    else:
        print("\n✗ No herbs were scraped. There may be a problem with the website.")
        print("Try visiting the URL manually to check if it's accessible.")

SIMPLE WEBMD MEDICINAL PLANT SCRAPER

This script will:
1. Visit WebMD's herb list (letter A)
2. Get the first 5 herbs
3. Extract detailed information for each herb
4. Save everything to a CSV file


Getting list of herbs from WebMD...
✗ Problem: Got status code 404

✗ No herbs were scraped. There may be a problem with the website.
Try visiting the URL manually to check if it's accessible.
