In [1]:
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

In [2]:
def get_url_data(driver, url):
    """Scrape every quiz question on a page into a DataFrame.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the page to open.

    Returns:
        pandas.DataFrame with one row per ``.question_holder`` element and
        the columns ``Question`` (question text), ``Answer`` (label text of
        the checked radio input, or a placeholder when none is found) and
        ``Right`` ("No" when the question carries an
        ``.answer_arrow.incorrect`` marker, otherwise "Yes").

    Raises:
        NoSuchElementException: If a question container has no
            ``.question_text.user_content.enhanced`` element.
    """
    # Open the target page.
    driver.get(url)

    all_qa_pairs = []

    # Each question lives in its own container element.
    for question in driver.find_elements(By.CSS_SELECTOR, '.question_holder'):
        # Incorrectly answered questions are NOT skipped: they are kept and
        # flagged with "No" so the caller can separate them later.
        incorrect_marks = question.find_elements(By.CSS_SELECTOR, '.answer_arrow.incorrect')
        is_right_answer = "No" if incorrect_marks else "Yes"

        # Extract the question text.
        question_text = question.find_element(By.CSS_SELECTOR, '.question_text.user_content.enhanced').text.strip()

        # Look up the checked radio input and read the label that follows it.
        try:
            checked_input = question.find_element(By.CSS_SELECTOR, 'input[type="radio"][checked]')
            selected_answer = checked_input.find_element(By.XPATH, './following-sibling::label').text.strip()
        except NoSuchElementException:
            # Only a genuinely missing element falls back to the placeholder
            # (the literal means "selected answer not found"); any other
            # driver failure propagates instead of being recorded as data.
            selected_answer = '未找到选中的答案'

        all_qa_pairs.append((question_text, selected_answer, is_right_answer))

    return pd.DataFrame(all_qa_pairs, columns=['Question', 'Answer', 'Right'])

In [3]:
# Name of the sub-folder under ./targets that holds the saved quiz pages.
target_folder = "glg_cha15"

html_folder = Path.cwd() / "targets" / target_folder
# glob() yields files in a filesystem-dependent order; sort so that the
# concatenation order (and therefore which duplicate row is kept later by
# drop_duplicates) is the same on every run.
html_files = sorted(html_folder.glob("*.html"))
html_files

[PosixPath('/Users/scott/repos/jupyter_notebooks/extract_cavas/targets/glg_cha15/CHA15_ GLG101_ Intro to Geology (2024 Spring) (4_28_2024 10_31_16 PM).html')]

In [4]:
# One DataFrame per saved HTML page, collected for concatenation later.
result_arr = []

# A single browser session is shared by all pages and closed on exit.
with webdriver.Chrome() as driver:
    for file in html_files:
        # Local files are loaded through their file:// URI; each row is tagged
        # with the file name (without extension) it came from.
        df = get_url_data(driver=driver, url=file.as_uri()).assign(charpter=file.stem)
        result_arr.append(df)

In [5]:
# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the actual cause instead.
if not result_arr:
    raise ValueError(f"No HTML pages were parsed from {html_folder}; nothing to concatenate.")

# Stack the per-page frames into one table with a fresh 0..n-1 index.
dff = pd.concat(result_arr, ignore_index=True)


In [6]:
# Split rows by correctness, keeping only the first occurrence of each question.
is_right = dff['Right'] == 'Yes'
dff_r = dff[is_right].drop_duplicates(subset=['Question'])
dff_w = dff[~is_right].drop_duplicates(subset=['Question'])

In [7]:
# Keep only the wrong answers whose question has no correct answer on record.
already_right = dff_w['Question'].isin(dff_r['Question'])
dff_w_remain = dff_w[~already_right]

In [8]:
# Write both tables into one workbook, one sheet each, sorted by question text.
export_columns = ['Question', 'Answer', 'charpter']
with pd.ExcelWriter(html_folder / "result.xlsx") as writer:
    for sheet_name, frame in (("Right", dff_r), ("Wrong", dff_w_remain)):
        frame[export_columns].sort_values(by='Question').to_excel(writer, index=False, sheet_name=sheet_name)

In [9]:
# Export the correct question/answer pairs as pipe-separated text, once next to
# the source pages and once in the parent folder.
right_answers = dff_r[['Question', 'Answer']]
for out_dir in (html_folder, html_folder.parent):
    right_answers.to_csv(out_dir / "result.csv", index=False, sep='|')