In [9]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# URL of the page with all questions
url = "http://127.0.0.1:5500/3.html"

# Fetch the HTML content from the local server.
# requests waits indefinitely by default, so an explicit timeout keeps the
# script from hanging if the server never answers. raise_for_status() turns a
# 4xx/5xx reply into an exception instead of silently parsing an error page
# (which would produce an empty spreadsheet).
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.content

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Prepare lists to store data (one dict per question, filled by extract_questions)
data_list = []

# Function to extract questions
def extract_questions():
    """Scrape every question card of the parsed page into ``data_list``.

    Reads the module-level ``soup`` and appends one dict per question card to
    the module-level ``data_list``. Each dict holds the keys "Question no.",
    "Question Text (HTML)", "Question image", "Correct answer(s)" and
    "Answer image", followed by one "Option N" key per answer choice found in
    the card (N starts at 1).

    A card with a missing piece (header, body paragraph, letter span, ``src``
    attribute, answer box) gets ``None`` -- or an empty string for the correct
    answer -- in that field instead of raising, so a single malformed card
    does not abort the whole export.
    """
    # NOTE: with a multi-word class_ string, Beautiful Soup compares against
    # the exact value of the class attribute, so only cards whose class is
    # exactly "card exam-question-card" are matched.
    for card in soup.find_all('div', class_='card exam-question-card'):
        row = {
            "Question no.": _question_number(card),
            "Question Text (HTML)": _question_html(card),
            "Question image": _image_src(card, 'in-exam-image'),
            "Correct answer(s)": _correct_answer(card),
            "Answer image": _image_src(card, 'answer-image'),
        }

        # Option columns are added after the fixed ones so cards with a
        # different number of choices still share the leading columns.
        for index, option in enumerate(_options(card), start=1):
            row[f"Option {index}"] = option

        data_list.append(row)


def _normalize_space(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends."""
    return " ".join(text.split())


def _question_number(card):
    """Return the second whitespace-separated token of the card header, or None."""
    header = card.find('div', class_='card-header')
    if header is None:
        return None
    tokens = header.text.split()
    return tokens[1] if len(tokens) > 1 else None


def _question_html(card):
    """Return the inner HTML of the card's question paragraph, or None if absent."""
    body = card.find('div', class_='card-body')
    paragraph = body.find('p', class_='card-text') if body is not None else None
    if paragraph is None:
        return None
    # decode_contents() serialises only the children, so the wrapping <p> is
    # dropped regardless of its attributes and nested markup stays balanced.
    # Newlines are treated as ordinary whitespace (collapsed to one space)
    # rather than deleted, so words on adjacent lines are not glued together.
    return _normalize_space(paragraph.decode_contents())


def _image_src(card, css_class):
    """Return the ``src`` of the first <img> with *css_class* in the card, or None."""
    image = card.find('img', class_=css_class)
    return image.get('src') if image is not None else None


def _options(card):
    """Return the card's answer choices as cleaned "<letter> <text>" strings."""
    choices = []
    for item in card.find_all('li', class_='multi-choice-item'):
        # Whole text of the choice, minus the 'Most Voted' badge.
        text = item.get_text(separator=" ", strip=True).replace('Most Voted', '').strip()

        letter_tag = item.find('span', class_='multi-choice-letter')
        letter = letter_tag.get_text(strip=True) if letter_tag is not None else ''

        if letter:
            # Keep only what follows the first occurrence of the letter; if it
            # does not appear verbatim, keep the whole text after the letter.
            _, found, after = text.partition(letter)
            text = f"{letter} {after.strip() if found else text}"

        choices.append(_normalize_space(text))
    return choices


def _correct_answer(card):
    """Return the text of the card's correct-answer span, or "" if there is none."""
    box = card.find('span', class_='correct-answer-box')
    answer = box.find('span', class_='correct-answer') if box is not None else None
    return answer.get_text(strip=True) if answer is not None else ""

# Run the scraper over the parsed page; the rows accumulate in data_list
extract_questions()

# Build a table from the collected rows and export it as an Excel workbook
output_file = 'exam_questions_cleaned_dynamic.xlsx'
df = pd.DataFrame(data=data_list)
df.to_excel(output_file, index=False)
print("Data successfully written to 'exam_questions_cleaned_dynamic.xlsx'")


Data successfully written to 'exam_questions_cleaned_dynamic.xlsx'
