In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse

In [12]:
def scrape_website(url, timeout=30):
    """Scrape question/answer pairs from the rich-text sections of a page.

    Every ``h2``/``h3``/``h4`` inside a matching content ``div`` is treated as
    a question. Its answer is the text of the ``p`` elements and ``ul`` list
    items that follow it as siblings, up to the next ``h2``/``h3``/``h4``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook cell indefinitely.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts (possibly empty),
        or ``None`` when the page has no matching content ``div`` at all.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
    """
    content_class = 'text-ckeditor aem-GridColumn aem-GridColumn--default--12'
    header_tags = ['h2', 'h3', 'h4']

    # Send a request to the given URL
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # A page can contain several content sections. Looking only at the first
    # one yields no data whenever the headers live in a later section, so
    # collect every matching container.
    containers = soup.find_all('div', class_=content_class)
    if not containers:
        print(f"Main content section not found for {url}")
        return None

    data = []

    for container in containers:
        # Loop through headers h2, h3, h4 to get questions
        for header in container.find_all(header_tags):
            question = header.get_text(strip=True)
            answer = []

            # Collect the paragraphs and list items that follow the header
            for sibling in header.find_next_siblings():
                if sibling.name in header_tags:
                    break  # Stop at the next header

                if sibling.name == 'p':
                    answer.append(sibling.get_text(strip=True))
                elif sibling.name == 'ul':
                    answer.extend(
                        li.get_text(strip=True) for li in sibling.find_all('li')
                    )

            data.append({"question": question, "answer": ' '.join(answer)})

    return data

In [13]:
# Pages to scrape: every entry lives under the same radiation-therapy section,
# so the shared prefix is spelled out once and the page names are appended.
_RADIATION_BASE_URL = "https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation"

url_list = [
    f"{_RADIATION_BASE_URL}/{page_name}"
    for page_name in (
        "external-beam-radiation-therapy.html",
        "internal-radiation-therapy-brachytherapy.html",
        "systemic-radiation-therapy.html",
        "photodynamic-therapy.html",
        "effects-on-different-parts-of-body.html",
        "safety.html",
    )
]

In [15]:
# Scrape each page in url_list and write its question/answer pairs to a CSV
# named "<domain>_<last path segment>.csv" in the current working directory.
for url in url_list:
    # A network or HTTP error on one page should not abort the whole batch.
    try:
        data = scrape_website(url)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        continue

    if data:
        # Parse the URL once and derive the CSV file name from it
        parsed_url = urlparse(url)

        # Drop only a leading "www." rather than any occurrence in the host
        domain_name = parsed_url.netloc
        if domain_name.startswith("www."):
            domain_name = domain_name[len("www."):]

        # Strip trailing slashes so ".../page/" still yields "page" instead
        # of falling back to 'default' (which would overwrite earlier files).
        path_name = parsed_url.path.rstrip('/').split('/')[-1] or 'default'
        csv_filename = f"{domain_name}_{path_name}.csv"

        # Convert data to DataFrame and save to CSV
        df = pd.DataFrame(data)
        df.to_csv(csv_filename, index=False)

        print(f"Data has been scraped and saved to '{csv_filename}'")
    else:
        # Covers both a missing content section (None) and a page that
        # produced no question/answer pairs (empty list).
        print(f"No data found for {url}")

No data found for https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/external-beam-radiation-therapy.html
No data found for https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/internal-radiation-therapy-brachytherapy.html
Data has been scraped and saved to 'cancer.org_systemic-radiation-therapy.html.csv'
Data has been scraped and saved to 'cancer.org_photodynamic-therapy.html.csv'
No data found for https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/effects-on-different-parts-of-body.html
No data found for https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/safety.html
