In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse
import re

In [9]:
def scrape_data(url):
    # Send a request to the URL
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    data = []
    
    # Find all the headings and their corresponding paragraphs
    headings = soup.find_all(['h2', 'h3', 'h4'])  # Adjust the heading tags as per the structure of the website
    for heading in headings:
        # Find the next sibling which is a paragraph or series of paragraphs
        next_sibling = heading.find_next_sibling()
        answer = ''
        
        # Collect all the text within paragraphs until we reach another heading
        while next_sibling and next_sibling.name not in ['h2', 'h3', 'h4']:
            if next_sibling.name == 'p':
                answer += next_sibling.get_text(strip=True) + " "
            next_sibling = next_sibling.find_next_sibling()
        
        # Append the question and the corresponding answer to the data list
        data.append({
            'Question': heading.get_text(strip=True),
            'Answer': answer.strip()
        })
    
    return data

In [10]:
def create_filename_from_url(url):
    # Remove special characters and limit the filename length
    filename = re.sub(r'[^\w]', '_', url)[:100] + '.csv'
    return filename

In [11]:
def scrape_multiple_sites(url_list):
    for url in url_list:
        try:
            data = scrape_data(url)
            df = pd.DataFrame(data)
            
            # Create a filename based on the URL
            output_file = create_filename_from_url(url)
            
            # Save the data into a separate CSV file
            df.to_csv(output_file, index=False)
            print(f"Data from {url} saved to {output_file}")
        except Exception as e:
            print(f"Error scraping {url}: {e}")

In [12]:
url_list = [
    "https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/external-beam-radiation-therapy.html",
    "https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/internal-radiation-therapy-brachytherapy.html",
    "https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/effects-on-different-parts-of-body.html",
    "https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/safety.html"
]

In [13]:
scrape_multiple_sites(url_list)

Data from https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/external-beam-radiation-therapy.html saved to https___www_cancer_org_cancer_managing_cancer_treatment_types_radiation_external_beam_radiation_ther.csv
Data from https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/internal-radiation-therapy-brachytherapy.html saved to https___www_cancer_org_cancer_managing_cancer_treatment_types_radiation_internal_radiation_therapy_b.csv
Data from https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/effects-on-different-parts-of-body.html saved to https___www_cancer_org_cancer_managing_cancer_treatment_types_radiation_effects_on_different_parts_o.csv
Data from https://www.cancer.org/cancer/managing-cancer/treatment-types/radiation/safety.html saved to https___www_cancer_org_cancer_managing_cancer_treatment_types_radiation_safety_html.csv
