In [1]:
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Location of the saved exam page. A raw string keeps the Windows backslashes
# literal: in a normal string "\C" and "\I" are invalid escape sequences,
# which newer Python versions warn about and plan to reject.
file_path = r"D:\CS\ITexamAZ_900.html"

# Read the whole page into memory as UTF-8 text; the context manager closes
# the file handle even if reading fails.
with open(file_path, "r", encoding="utf-8") as file:
    html_content = file.read()

In [8]:
# Parse the saved page and collect every <div class="card"> element;
# each card is treated as one question.
soup = BeautifulSoup(html_content, "html.parser")
cards = soup.find_all("div", class_="card")

# Number of "Option N" columns written per question. Questions with fewer
# choices get '' in the unused columns.
MAX_OPTIONS = 5

# One dict per card; turned into a DataFrame in a later cell.
data = []

for card in cards:
    # The second whitespace-separated token of the <h5> heading is the number
    # (presumably headings look like "Question 12" — confirm against the HTML).
    question_no = card.find("h5").text.strip().split()[1]

    question_text_div = card.find("div", class_="question_text")
    question_text = question_text_div.text.strip()

    # Question image source, or '' when the question has no <img>.
    question_img_tag = question_text_div.find("img")
    question_img = question_img_tag["src"] if question_img_tag else ""

    # The answer is the first image's src when one exists, otherwise the
    # stripped text of the answer block.
    answer_block = card.find("div", class_="answer_block")
    answer_img_tag = answer_block.find("img")
    answer_text_or_img = (
        answer_img_tag["src"] if answer_img_tag else answer_block.text.strip()
    )

    # References come from the first <p> of the answer block. Checking the
    # split result (instead of only testing for the word "References") avoids
    # an IndexError when the paragraph mentions References without the colon.
    references = ""
    reference_tag = answer_block.find("p")
    if reference_tag:
        reference_parts = reference_tag.text.split("References:")
        if len(reference_parts) > 1:
            references = reference_parts[1].strip()

    # Every <li> inside the card is taken as an answer choice.
    option_list = [option.text.strip() for option in card.find_all("li")]

    # Assemble the row in a fixed column order. All MAX_OPTIONS option
    # columns are always present so a fifth choice is no longer dropped.
    row = {
        "Question No.": question_no,
        "Question Text": question_text,
        "Question Img": question_img,
    }
    for position in range(MAX_OPTIONS):
        row[f"Option {position + 1}"] = (
            option_list[position] if position < len(option_list) else ""
        )
    row["Answer Text/Img"] = answer_text_or_img
    row["References"] = references

    data.append(row)


In [9]:
# Build the results table: one row per record in `data`, one column per key.
df = pd.DataFrame(data=data)

# Echo the table as plain text for a quick visual check.
print(str(df))

   Question No.                                      Question Text  \
0             1  DRAG DROP -Your company intends to subscribe t...   
1             2  Your company has datacenters in Los Angeles an...   
2             3  Note: The question is included in a number of ...   
3             4  Note: The question is included in a number of ...   
4             5  Note: The question is included in a number of ...   
5             6  Note: The question is included in a number of ...   
6             7  Your developers have created 10 web applicatio...   
7             8  Note: The question is included in a number of ...   
8             9  Your developers have created a portal web app ...   
9            10  Note: The question is included in a number of ...   
10           11  Note: The question is included in a number of ...   
11           12  Note: The question is included in a number of ...   
12           13  Note: The question is included in a number of ...   
13           14  Not

In [11]:
# Destination workbook; a bare file name resolves against the current
# working directory.
output_file_path = "extracted_data2.xlsx"

# index=False keeps the DataFrame's positional index out of the sheet.
df.to_excel(excel_writer=output_file_path, index=False)

# Confirm where the file was written.
print(f"Data successfully saved to {output_file_path}")

Data successfully saved to extracted_data2.xlsx
