In [38]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from urllib.parse import urljoin


In [39]:
fileName = "AwsMl/8"
# URL of the locally served page that holds all questions of this dump
url = "http://127.0.0.1:5500/SourceDump/"+fileName+".html"

# Folder where images will be saved.
# NOTE(review): this is an absolute path at the filesystem root. The saved
# path is later written verbatim into the question HTML as the <img> src, so
# it looks like it is meant to double as a site-relative URL -- confirm before
# changing it.
image_folder = "/wp-content/uploads/examdump/awsML/"
os.makedirs(image_folder, exist_ok=True)  # Create folder if it doesn't exist

# Fetch the HTML content from the local server.
# The timeout stops the cell from hanging forever if the server is not running,
# and raise_for_status() fails loudly on 4xx/5xx instead of parsing an error
# page (which would silently produce zero questions).
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Rows collected by extract_questions(); one dict per question
data_list = []


In [40]:
def download_image(image_url, image_folder, image_name):
    """Download a single image and store it under ``image_folder``.

    Args:
        image_url: Image address as found in the page. It may be relative, in
            which case it is resolved against the module-level ``url``.
        image_folder: Directory the file is written to (created if missing).
        image_name: File name to save the image as.

    Returns:
        The path of the saved file (``image_folder`` joined with
        ``image_name``), or ``None`` when the request fails, the server does
        not answer with status 200, or the file cannot be written.
    """
    try:
        # Resolve relative links against the page the HTML was fetched from.
        full_image_url = urljoin(url, image_url)
        # A timeout keeps one unresponsive image from stalling the whole run.
        img_response = requests.get(full_image_url, timeout=30)

        if img_response.status_code != 200:
            print(f"Failed to download {full_image_url}: Status code {img_response.status_code}")
            return None

        # Do not rely on another cell having created the target folder.
        os.makedirs(image_folder, exist_ok=True)
        img_path = os.path.join(image_folder, image_name)

        # Save the image locally
        with open(img_path, 'wb') as f:
            f.write(img_response.content)
        return img_path
    except Exception as e:
        # Deliberately best-effort: a broken image must not abort the scrape,
        # so any failure is reported and signalled to the caller as None.
        print(f"Failed to download {image_url}: {e}")
        return None
    

In [41]:
# Function to extract questions
def _strip_option_label(option_text):
    """Drop a leading choice label such as "A." or "B)" from an option's text."""
    # A pattern is used instead of a fixed 3-character slice so options whose
    # label is missing or not followed by exactly one space are not truncated.
    return re.sub(r'^[A-Za-z][.):]\s*', '', option_text).strip()


def _point_images_at(html_fragment, local_path):
    """Rewrite every ``src="..."`` attribute in ``html_fragment`` to ``local_path``."""
    # A callable replacement inserts the path literally; a plain replacement
    # string would have its backslashes (e.g. Windows paths produced by
    # os.path.join) interpreted as regex escape sequences.
    return re.sub(r'src="[^"]+"', lambda _match: f'src="{local_path}"', html_fragment)


def extract_questions():
    """Parse every question card in the module-level ``soup`` into a row dict.

    For each ``div.card.exam-question-card`` this collects the question
    number, the question body as whitespace-normalised HTML, the answer
    options, the correct answer letters (comma separated) and the local paths
    of the question/answer images, downloading the images via
    ``download_image`` into the module-level ``image_folder``.

    Each row is appended to the module-level ``data_list``; nothing is
    returned. Options are stored under dynamically numbered keys
    ("Option 1", "Option 2", ...), so rows may have different key sets.
    """
    questions = soup.find_all('div', class_='card exam-question-card')

    for question in questions:
        # The second whitespace-separated token of the header is taken as the
        # question number, with any "#" removed.
        question_no = question.find('div', class_='card-header').text.split()[1]
        question_no = question_no.replace("#", "")

        # Question body as HTML, without the wrapping <p> and with all runs of
        # whitespace collapsed to a single space.
        question_body = question.find('div', class_='card-body').find('p', class_='card-text')
        if question_body is None:
            # Avoid storing the literal string "None" as the question text.
            question_text_html = ""
        else:
            question_text_html = str(question_body).replace('\n', '').replace('<p class="card-text">', '').replace('</p>', '')
            question_text_html = " ".join(question_text_html.split())

        # Question image (if any): download it and remember the local path.
        question_image = None
        question_image_tag = question.find('img', class_='in-exam-image')
        if question_image_tag and question_image_tag.get('src'):
            image_name = f"question_{question_no}.jpg"
            question_image = download_image(question_image_tag['src'], image_folder, image_name)

        # Only rewrite the HTML when an image was actually saved; otherwise the
        # original src is kept instead of being replaced by src="None".
        # Note: every src in the question text is pointed at the same file.
        if question_image:
            question_text_html = _point_images_at(question_text_html, question_image)

        # Options: full text of each choice, minus the "Most Voted" badge and
        # the leading letter label.
        options = []
        for li in question.find_all('li', class_='multi-choice-item'):
            option_text = li.get_text(separator=" ", strip=True)
            option_text = option_text.replace('Most Voted', '').strip()
            options.append(_strip_option_label(option_text))

        # Correct answer(s) live outside the <li> elements. Joining the text
        # character by character inserts a comma between every character.
        correct_answers = ""
        correct_answer_block = question.find('span', class_='correct-answer-box')
        if correct_answer_block:
            correct_answer_tag = correct_answer_block.find('span', class_='correct-answer')
            if correct_answer_tag:
                correct_answers = ",".join(correct_answer_tag.get_text(strip=True))

        # Answer image (if any)
        answer_image = None
        answer_image_tag = question.find('img', class_='answer-image')
        if answer_image_tag and answer_image_tag.get('src'):
            answer_image_name = f"answer_{question_no}.jpg"
            answer_image = download_image(answer_image_tag['src'], image_folder, answer_image_name)

        # Fixed columns first, then one column per option.
        data = {
            "Question no.": question_no,
            "Question Text (HTML)": question_text_html,
            "Question image": question_image if question_image else "No image",
            "Correct answer(s)": correct_answers,
            "Answer image": answer_image if answer_image else "No image",
        }
        for i, option in enumerate(options, start=1):
            data[f"Option {i}"] = option

        data_list.append(data)

In [42]:
# Workbook that accumulates the scraped questions across runs
file_name = 'questions_data.xlsx'

# Decide once whether this run appends to an existing workbook or creates it;
# when appending, the current contents are read before anything is scraped.
append_to_existing = os.path.exists(file_name)
if append_to_existing:
    existing_df = pd.read_excel(file_name)

# Scrape the page into a fresh list (extract_questions appends to data_list)
data_list = []
extract_questions()

if append_to_existing:
    # Stack the freshly scraped rows underneath the rows already on disk
    new_df = pd.DataFrame(data_list)
    df = pd.concat([existing_df, new_df], ignore_index=True)

    # Write the combined table back into the existing workbook
    with pd.ExcelWriter(file_name, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
        df.to_excel(writer, index=False)
else:
    # First run: the scraped rows become the whole workbook
    df = pd.DataFrame(data_list)
    df.to_excel(file_name, index=False)

print(f"Data successfully written to {file_name}")

Data successfully written to questions_data.xlsx
