In [19]:
from lxml import html
import pandas as pd

# Helper: split a page heading of the form "<before>(<inside>)".
def _split_heading(heading):
    """Return ``(before, inside)`` for a heading shaped like ``before(inside)``.

    The split happens at the first "(". A single trailing ")" is removed
    from the second part. When the heading has no "(", the whole heading is
    returned as ``before`` and ``inside`` is the empty string.
    """
    before, _, inside = heading.partition("(")
    if inside.endswith(")"):
        inside = inside[:-1]
    return before, inside


# Function to extract data from HTML using XPath
def extract_data_from_html(raw_html, xpath_expressions):
    """Evaluate each XPath expression against *raw_html* and collect the results.

    Args:
        raw_html: HTML markup as a string; parsed with ``lxml.html.fromstring``.
        xpath_expressions: Mapping of column name -> XPath expression.
            Some keys get special post-processing:

            * ``'correction'``: each result is a string (the XPath is expected
              to select an attribute value); it is split on ``'`` and the
              fourth piece is kept with every "-" removed.
            * ``'session'``: second space-separated token of each element's
              stripped text.
            * ``'année'``: see the review note in the body.
            * ``'module'`` / ``'sub_module'``: the ``.text`` of the FIRST
              matched element is split at "(": the part before it is stored
              under ``'sub_module'``, the part inside the parentheses under
              ``'module'``, each repeated once per matched element. Either
              key fills both columns.
            * any other key: stripped text of each element, or the stripped
              string itself when the XPath yields strings.

    Returns:
        dict mapping column name -> list of extracted strings. Lists may
        differ in length between columns.

    Raises:
        IndexError: if a ``'correction'``, ``'session'`` or ``'année'`` value
            does not contain enough separator-delimited pieces.
    """
    tree = html.fromstring(raw_html)
    data = {}

    for key, xpath_expression in xpath_expressions.items():
        matches = tree.xpath(xpath_expression)

        if key == 'correction':
            # Index 3 is the content of the second quoted argument in the
            # matched string (pieces alternate outside/inside quotes).
            data[key] = [value.split('\'')[3].replace("-", "") for value in matches]

        elif key == 'session':
            data[key] = [node.text_content().strip().split(' ')[1] for node in matches]

        elif key == 'année':
            # NOTE(review): if the second separator really is two plain
            # spaces, the first space-separated token can never contain it,
            # so [1] raises IndexError for every matched element. The sample
            # run matched 0 elements, so this path was never exercised.
            # Left unchanged -- confirm the intended separator against the
            # real page text.
            data[key] = [node.text_content().strip().split(' ')[0].split("  ")[1] for node in matches]

        elif key in ['module', 'sub_module']:
            if not matches:
                data[key] = []
                continue
            # `.text` is None when the element starts with a child node;
            # treat that as an empty heading instead of crashing.
            heading = (matches[0].text or "").strip()
            before, inside = _split_heading(heading)
            # The text before "(" is the sub-module, the text inside the
            # parentheses is the module (same mapping as before).
            data['sub_module'] = [before] * len(matches)
            data['module'] = [inside] * len(matches)

        else:
            if matches and isinstance(matches[0], html.HtmlElement):
                data[key] = [node.text_content().strip() for node in matches]
            else:
                # XPath returned plain strings (e.g. attribute values), or
                # nothing at all.
                data[key] = [value.strip() for value in matches]

    return data

# Define your XPath expressions to extract desired data from HTML.
# The key order here is the column order of the exported sheet.
# 'session'/'année' and 'module'/'sub_module' intentionally share an XPath:
# extract_data_from_html derives each pair from the same matched nodes.
xpath_expressions = {
    'question': "//tbody//tr/td/p/b",
    'a' : "//span[contains(@id, 'choixA')]",
    'b' : "//span[contains(@id, 'choixB')]",
    'c' : "//span[contains(@id, 'choixC')]",
    'd' : "//span[contains(@id, 'choixD')]",
    'e' : "//span[contains(@id, 'choixE')]",
    'session' : "//tbody//tr//td//div/span[@class='sess color-blue' and not(contains(@style, 'top'))]",
    'année' : "//tbody//tr//td//div/span[@class='sess color-blue' and not(contains(@style, 'top'))]",
    'correction': "//button[contains(@onclick, 'getCor')]/@onclick",
    'module': "//div[@class='page-title-heading']/div/div",
    'sub_module': "//div[@class='page-title-heading']/div/div"
}

# Read the HTML content from a text file
html_file = 'html_content.txt'
with open(html_file, 'r', encoding='utf-8') as file:
    raw_html = file.read()

# Extract data using XPath
extracted_data = extract_data_from_html(raw_html, xpath_expressions)

# Print the length of each array
for key, value in extracted_data.items():
    print(f"Length of {key} array: {len(value)}")

# Get the maximum length among all arrays (0 when nothing was extracted,
# instead of max() raising on an empty sequence)
max_length = max((len(value) for value in extracted_data.values()), default=0)

# pandas needs every column to have the same length, so normalise them:
#   * a single value (e.g. the page-wide module name) is repeated on every row;
#   * a shorter column (including an empty one) keeps its values and is padded
#     with empty strings, rather than being overwritten by its first value.
# Only existing keys are reassigned, so iterating the dict here is safe.
for key, value in extracted_data.items():
    if len(value) == 1:
        extracted_data[key] = value * max_length
    elif len(value) < max_length:
        if value:
            # Padding at the end cannot know which rows were missing.
            print(f"Warning: {key} has {len(value)} of {max_length} values; rows may be misaligned")
        extracted_data[key] = value + [''] * (max_length - len(value))

# Create a DataFrame from the extracted data
df = pd.DataFrame(extracted_data)

# Export DataFrame to Excel
excel_filename = 'extracted_data.xlsx'
df.to_excel(excel_filename, index=False)

print("Data extracted and saved to", excel_filename)


Length of question array: 50
Length of a array: 50
Length of b array: 50
Length of c array: 50
Length of d array: 50
Length of e array: 50
Length of session array: 0
Length of année array: 0
Length of correction array: 50
Length of sub_module array: 1
Length of module array: 1
Data extracted and saved to extracted_data.xlsx
