In [90]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
from openpyxl import Workbook


def find_topics_url(base_url):
    response = requests.get(base_url)
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all div elements with class "col-sm-6"
    col_sm_6_divs = soup.find_all('div', class_='col-sm-6')

    # List to store the links
    links = []

    # Loop through each "col-sm-6" div to find anchor tags and extract their URLs
    for div in col_sm_6_divs:
        anchor_tag = div.find('a', href=True)
        if anchor_tag:
            href = anchor_tag['href']
            link_text = anchor_tag.get_text(strip=True)
            links.append((href, link_text))
    return links 


def process_siblings(heading, section_content):
    sibling = heading.find_next_sibling()
    while sibling:
        if sibling.name == 'ul':
            # Extract all list items from the unordered list
            li_elements = sibling.find_all('li')
            for li in li_elements:
                section_content['details'].append(li.get_text(strip=True))
        elif sibling.name == 'p':
            # Extract the paragraph text
            section_content['details'].append(sibling.get_text(strip=True))
        # elif sibling.name == "h3":
        #     section_content['details'].append(sibling.get_text(strip=True))
        #     section_content = process_siblings(sibling, section_content)
        elif sibling.name in ['h2', 'h3']:
            # Stop if we encounter another heading
            break
        sibling = sibling.find_next_sibling()
    return section_content

def scrape_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all the headings (h2, h3) and corresponding <ul> lists and <p> tags below them
    content = []

    # Extract sections based on headings (h2, h3)
    headings = soup.find_all(['h2'])#, 'h3'])
    for heading in headings:
        title = heading.get_text(strip=True)
        section_content = {
            'title': title,
            'details': []
        }
        for sibling in heading.find_next_siblings():
            section_content['details'].append(sibling.get_text())
        # section_content = {
        #     'title': title,
        #     'details': []
        # }
        #section_content = process_siblings(heading, section_content)
        content.append(section_content)
    return content

def filter_extracted_data(data):
    final_data = []
    for entry in data: 
        if len(entry.get('details')) == 0:
            continue
        else:
            entry['details'] = '\n'.join(entry['details'])
        final_data.append(entry)
    return final_data

base_url = "https://www.radiologyinfo.org/en/onco"
topic_urls = find_topics_url(base_url)



final_df = pd.DataFrame(columns=["topic_name", "Question", "Answer"])

total = 0
for topic_url, _ in topic_urls:
    topic_name = topic_url.split('/')[-1]
    print(f"Topic Name : {topic_name} -- Topic Url : {topic_url}")
    extracted_data = scrape_content(topic_url)
    extracted_data = filter_extracted_data(extracted_data)

    topic_df = pd.DataFrame(extracted_data).rename({"title": "Question", "details" : "Answer"}, axis=1)
    topic_df['topic_name'] = [topic_name] * len(topic_df)
    topic_df = topic_df[['topic_name', 'Question', 'Answer']]
    total += topic_df.shape[0]
    final_df = pd.concat([final_df, topic_df])

Topic Name : anal-cancer-therapy -- Topic Url : https://www.radiologyinfo.org/en/info/anal-cancer-therapy
Topic Name : lung-cancer-therapy -- Topic Url : https://www.radiologyinfo.org/en/info/lung-cancer-therapy
Topic Name : brachy -- Topic Url : https://www.radiologyinfo.org/en/info/brachy
Topic Name : lymphoma-cancer-therapy -- Topic Url : https://www.radiologyinfo.org/en/info/lymphoma-cancer-therapy
Topic Name : thera-brain -- Topic Url : https://www.radiologyinfo.org/en/info/thera-brain
Topic Name : mri-guided-linac -- Topic Url : https://www.radiologyinfo.org/en/info/mri-guided-linac
Topic Name : breast-cancer-therapy -- Topic Url : https://www.radiologyinfo.org/en/info/breast-cancer-therapy
Topic Name : pancreatic-cancer-treatment -- Topic Url : https://www.radiologyinfo.org/en/info/pancreatic-cancer-treatment
Topic Name : cervical-cancer-therapy -- Topic Url : https://www.radiologyinfo.org/en/info/cervical-cancer-therapy
Topic Name : professions-radiation-therapy -- Topic Url : 

In [97]:
print(final_df.iloc[1]['Answer'].strip())

Treatment options overview
How can I choose from among the options?
If I receive radiation therapy, will surgery still be required?
How effective is modern radiation treatment of anal cancer?


Treatment options overview
Anal cancer is highly treatable when found early. Treatment options depend on the:

type of cancer cell present
stage of the cancer
tumor location
patient's human immunodeficiency virus (HIV) status
recurrence of the cancer following treatment
patient's preference and overall health

There are three types of standard treatment for anal cancer: 

Radiation therapy—a cancer treatment that uses high-energy x-rays or other types of radiation to kill cancer cells. The type of radiation therapy used to treat anal cancer is:
    
external beam therapy, in which high-energy x-ray beams generated by a machine are directed at the tumor from outside the body (usually by a linear accelerator) and targeted at the tumor site. These x-rays can destroy the cancer cells and careful tre

In [86]:
#url = "https://www.radiologyinfo.org/en/info/esophageal-cancer-therapy"
url = "https://www.radiologyinfo.org/en/info/anal-cancer-therapy"
#anal-cancer-therapy
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

headings = soup.find_all(['h2'])
print(headings[0].get_text(strip=True))
for x in headings[0].find_next_siblings():
    print(x.get_text())

Anal cancer overview
Anal cancer is a cancer that begins in the anus, the opening at the end of the gastrointestinal tract through which stool, or solid waste, leaves the body. The anus begins at the bottom of the rectum, which is the last part of the large intestine (also called the colon).
Anal cancer usually affects adults over age 60 and women more often than men. 


In [57]:
final_df.head(40)

Unnamed: 0,topic_name,Question,Answer
0,anal-cancer-therapy,Anal cancer overview,Anal cancer is a cancer that begins in theanus...
1,anal-cancer-therapy,Treatment options overview,Anal cancer is highly treatable when found ear...
2,anal-cancer-therapy,How can I choose from among the options?,The team of physicians responsible for your ca...
3,anal-cancer-therapy,"If I receive radiation therapy, will surgery s...",The team of physicians responsible for your ca...
4,anal-cancer-therapy,How effective is modern radiation treatment of...,Anal cancer is generally responsive to radiati...
5,anal-cancer-therapy,What happens during radiation therapy?,Radiation therapy uses high energy x-rays (pho...
6,anal-cancer-therapy,What are possible side effects of radiation th...,Side effects of radiation treatment include pr...
7,anal-cancer-therapy,What kind of treatment follow-up should I expect?,"After your treatment has ended, your physician..."
8,anal-cancer-therapy,Are there any new developments in treating my ...,New treatments for anal cancer are being studi...
9,anal-cancer-therapy,Clinical Trials,For information and resources about clinical t...


In [58]:
final_df[final_df.topic_name == "esophageal-cancer-therapy"]

Unnamed: 0,topic_name,Question,Answer
0,esophageal-cancer-therapy,What is Esophageal Cancer?,Esophageal cancer develops in the lining of th...
1,esophageal-cancer-therapy,What are my treatment options?,Treatment options include:\nYour treatment pla...
2,esophageal-cancer-therapy,Surgery,Your doctor may use surgery alone for early-st...
3,esophageal-cancer-therapy,Endoscopic Treatments,Endoscopic treatments treat early and pre-canc...
4,esophageal-cancer-therapy,Chemotherapy,This treatment stops cancer cells from dividin...
5,esophageal-cancer-therapy,Monoclonal Antibody Therapy(also called target...,A small number of esophageal cancers have too ...
6,esophageal-cancer-therapy,Immunotherapy,This treatment helps activate the body’s own i...
7,esophageal-cancer-therapy,Radiation therapy,This treatment uses radiation to kill cancer c...
8,esophageal-cancer-therapy,What happens during radiation therapy?,"Before starting radiation therapy, patients wh..."
9,esophageal-cancer-therapy,What are possible side effects of radiation th...,Radiation treatment can cause side effects. Th...


In [3]:
import re

def extract_sub_questions_and_answers(text):
    # Split the text into lines
    lines = text.split('\n')
    
    questions_and_answers = []
    current_question = None
    current_answer = []
    
    # Treat the text before the first question as a separate Q&A
    first_line = lines[0].strip()
    first_question = first_line


    is_first = False
    # Iterate over each line to extract questions and answers
    for ind, line in enumerate(lines[1:]):
        # Remove extra whitespace and line breaks
        line = line.strip()

        # Check if the line is a question (ends with a question mark)
        if line.endswith('?'):
            if not is_first:
                is_first = True
                first_answer = "\n".join(lines[1:ind])
                questions_and_answers.append((first_question, first_answer))
            # If there is a current question, save the question and its answer
            if current_question:
                questions_and_answers.append((current_question, '\n'.join(current_answer)))
                current_answer = []

            # Set the new current question
            current_question = line
        else:
            # If it's not a question, it might be part of an answer
            if current_question:
                current_answer.append(line)

    # Add the last question-answer pair
    if current_question:
        questions_and_answers.append((current_question, ' '.join(current_answer)))

    return questions_and_answers

In [30]:
for line in final_df.iloc[1]['Answer'].split('\n'):
    if line.endswith('?'):
        print(line)

How can I choose from among the options?
If I receive radiation therapy, will surgery still be required?
How effective is modern radiation treatment of anal cancer?
How effective is modern radiation treatment of anal cancer?
If I receive radiation therapy, will surgery still be required?
How effective is modern radiation treatment of anal cancer?
How effective is modern radiation treatment of anal cancer?
How can I choose from among the options?
If I receive radiation therapy, will surgery still be required?
How effective is modern radiation treatment of anal cancer?
How effective is modern radiation treatment of anal cancer?
If I receive radiation therapy, will surgery still be required?
How effective is modern radiation treatment of anal cancer?
How effective is modern radiation treatment of anal cancer?


In [11]:
final_rows = []
for idx,row in final_df.iterrows():
    # Check if the row has more questions
    if len(row['Answer'].split('?')) > 1:
        # Split the row into multiple rows
        qa = extract_sub_questions_and_answers(row['Answer'])
        sub_rows = []
        for item in qa:
            sub_rows.append({
                "topic_name" : row['topic_name'], 
                "Question" : item[0], 
                "Answer" : item[1]
            })
        final_rows.extend(sub_rows)
    else:
        final_rows.append({
            "topic_name" : row['topic_name'],
            "Question" : row['Question'], 
            "Answer" : row["Answer"]
        })


In [12]:
final_df.head()

Unnamed: 0,topic_name,Question,Answer
0,anal-cancer-therapy,Anal cancer overview,Anal cancer is a cancer that begins in theanus...
1,anal-cancer-therapy,What are my treatment options?,Treatment options overview\nAnal cancer is hig...
2,anal-cancer-therapy,What happens during radiation therapy?,Radiation therapy uses high energy x-rays (pho...
3,anal-cancer-therapy,What are possible side effects of radiation th...,Side effects of radiation treatment include pr...
4,anal-cancer-therapy,What kind of treatment follow-up should I expect?,"After your treatment has ended, your physician..."


In [101]:
final_df  = final_df[final_df.Question != "Send us your feedback"]
final_df.shape

(390, 3)

In [104]:
final_df = final_df[final_df.Question != "Additional Information and Resources"]
final_df.shape[0]

368

In [102]:
import os 

os.makedirs("../data", exist_ok=True)
final_df.to_csv("../data/radiologyinfo-QA-extracted-uncleaned-v2.csv", index=False)