In [1]:
import getpass
import json
import os
import re
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wrap_latex_if_needed(text):
    """Surround ``text`` with LaTeX math delimiters.

    The input is stripped of surrounding whitespace first. If what remains is
    made up solely of ASCII letters, digits, spaces and the characters
    ``- + * / ( ) = . \\ ^`` it is wrapped as ``$$ ... $$``; anything else
    (including the empty string) is wrapped as ``$ ... $``.

    Args:
        text: Raw math text, e.g. the value of an ``alttext`` attribute.

    Returns:
        The stripped text enclosed in the chosen delimiter, with a single
        space on each side.
    """
    body = text.strip()
    is_simple = re.match(r"^[-+*/()=0-9a-zA-Z.\\^ ]+$", body) is not None
    delimiter = "$$" if is_simple else "$"
    return f"{delimiter} {body} {delimiter}"

def convert_mml_to_latex(text):
    """Replace MathJax assistive-MathML nodes with delimited LaTeX text.

    ``text`` is parsed as HTML. Every ``mjx-assistive-mml`` element that has
    an ``alttext`` attribute is swapped for that attribute's value, wrapped by
    ``wrap_latex_if_needed``. Elements without ``alttext`` are left in place.

    Args:
        text: An HTML fragment (plain text is accepted and passes through the
            parser unchanged apart from whitespace normalisation).

    Returns:
        The document's text content, with text nodes joined by single spaces
        and surrounding whitespace stripped.
    """
    parsed = BeautifulSoup(text, "html.parser")
    for node in parsed.select("mjx-assistive-mml"):
        if not node.has_attr("alttext"):
            continue
        # Put the LaTeX string where the element was, then drop the element.
        replacement = wrap_latex_if_needed(node["alttext"].strip())
        node.insert_before(replacement)
        node.decompose()
    return parsed.get_text(" ", strip=True)

# Difficulty labels: `collegeboard` holds the labels shown on the scraped page,
# `tom` holds the labels written to the output files. The two lists are paired
# by position to build the lookup table `difficulty`.
collegeboard = ["Emerging", "Proficient", "Advanced"]
tom = ["Easy", "Medium", "Hard"]

difficulty = {source_label: target_label for source_label, target_label in zip(collegeboard, tom)}

# --- Browser setup and login -------------------------------------------------
# Credentials must never live in the notebook. They are read from the
# environment, falling back to an interactive prompt (the password prompt does
# not echo). NOTE(review): the password that used to be hard-coded in this cell
# should be treated as leaked and rotated.
AP_EMAIL = os.environ.get("AP_CLASSROOM_EMAIL") or input("AP Classroom email: ")
AP_PASSWORD = os.environ.get("AP_CLASSROOM_PASSWORD") or getpass.getpass("AP Classroom password: ")

ENTER_KEY = '\ue007'        # WebDriver key code for the Enter key
WAIT_TIMEOUT_SECONDS = 15   # explicit-wait timeout used for every element lookup
LOGIN_SETTLE_SECONDS = 10   # fixed pause after submitting the password; adjust as needed

options = Options()
options.add_argument('--start-maximized')
driver = webdriver.Chrome(service=Service(), options=options)

# Open the question bank; the script then fills in the sign-in form that appears.
driver.get('https://apclassroom.collegeboard.org/33/question_bank/questions')
wait = WebDriverWait(driver, WAIT_TIMEOUT_SECONDS)

# Enter the email address and submit it.
email_input = wait.until(EC.element_to_be_clickable((By.ID, 'input28')))
email_input.send_keys(AP_EMAIL)
email_input.send_keys(ENTER_KEY)

# Activate the link that appears after the email step (sent Enter, not clicked).
select = wait.until(EC.element_to_be_clickable((By.XPATH,'/html/body/div[2]/main/div[2]/div/div/div[2]/form/div[2]/div/div[2]/div[2]/div[2]/a')))
select.send_keys(ENTER_KEY)

# Enter the password and submit it.
password_input = wait.until(EC.element_to_be_clickable((By.ID, 'input83')))
password_input.send_keys(AP_PASSWORD)
password_input.send_keys(ENTER_KEY)

# --- MCQ page ----------------------------------------------------------------
# Give the post-login navigation time to finish before loading the filtered list.
time.sleep(LOGIN_SETTLE_SECONDS)
driver.get('https://apclassroom.collegeboard.org/33/question_bank/questions?tags=%2722737%27%3A%21%28356755%29')

# Open the first row of the question table.
select = wait.until(EC.element_to_be_clickable((By.XPATH,'/html/body/div[1]/div/div[4]/div/div[2]/main/div[4]/div[2]/div[2]/div[1]/div/div/table/tbody/tr[1]/td[1]/span/div/a')))
select.send_keys(ENTER_KEY)

# Click the control inside the opened question panel (a real mouse click here,
# not an Enter key press).
select = wait.until(EC.element_to_be_clickable((By.XPATH,'/html/body/div[1]/div/div[4]/div/div[2]/main/div[4]/div[2]/div[5]/div/div/div[2]/div/div[2]/div/div[2]/div[1]/span')))
select.click()



In [4]:
def _click_next(wait):
    """Click the "next question" button.

    Returns:
        True if the button became clickable and was clicked, False if the wait
        timed out (treated by the caller as "no further question").
    """
    try:
        next_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-test-id='next-button']")))
    except TimeoutException:
        return False
    next_btn.click()
    return True


def _download_images(view, image_folder, question_id):
    """Save every ``<img>`` found under ``view`` into ``image_folder``.

    Downloads are best-effort: a failure is printed and the image is skipped.

    Returns:
        List of ``images/<filename>`` paths for the files actually written.
    """
    saved = []
    for i, img in enumerate(view.find_elements(By.TAG_NAME, "img")):
        img_url = img.get_attribute("src")
        if not img_url:
            # Nothing to fetch for an <img> without a src attribute.
            continue
        ext = os.path.splitext(urlparse(img_url).path)[1] or ".png"
        local_filename = f"{question_id}-img{i+1}{ext}"
        local_path = os.path.join(image_folder, local_filename)

        try:
            # Timeout so one stalled request cannot hang the whole run;
            # raise_for_status so an HTTP error page is not saved as an image.
            response = requests.get(img_url, timeout=30)
            response.raise_for_status()
            with open(local_path, "wb") as f:
                f.write(response.content)
            saved.append(f"images/{local_filename}")
            print(f"🖼 Downloaded: {local_filename}")
        except Exception as e:
            print(f"⚠️ Image download failed: {e}")
    return saved


def scrape_question(driver, wait):
    """Scrape the question currently displayed and advance to the next one.

    Reads the metadata panel (``div.ItemMetadata``) and the question view
    (``div.lrn-assess-content``), writes the result to
    ``unit<X>/question-NNN.json`` (``NNN`` is the number of files already in
    that folder), downloads the question's images into ``unit<X>/images`` and
    finally clicks the next button. Questions whose stimulus is a "Data Set"
    are not saved; the function just advances past them.

    Args:
        driver: Selenium WebDriver positioned on a question.
        wait: ``WebDriverWait`` bound to ``driver``.

    Returns:
        True if the next question was opened, False if the next button never
        became clickable (no further question to scrape).

    Raises:
        selenium TimeoutException: if the question panels do not appear.
        IndexError / NoSuchElementException: if the page layout differs from
            what the fixed list indexes and selectors expect.
    """
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.ItemMetadata")))
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.lrn-assess-content")))

    title_words = driver.find_element(By.XPATH,'/html/body/div[1]/div/div[4]/div/div[2]/main/div[4]/div[2]/div[5]/div/div/div[1]/h2/span/span').text
    meta = driver.find_element(By.CSS_SELECTOR, "div.ItemMetadata")
    view = driver.find_element(By.CSS_SELECTOR, "div.lrn-assess-content")

    # The metadata panel is a series of <ul> lists addressed by position.
    ul = meta.find_elements(By.TAG_NAME, "ul")
    item_details = ul[0].text.split('\n')
    unit_topic = ul[1].text.split('\n')
    diff = ul[-1].text.split(' ')[2]

    # Everything after the second space is the stimulus value. Splitting on
    # every space (as before) could only yield one word, so the comparison with
    # the two-word "Data Set" never matched.
    # NOTE(review): assumes the line has a two-word label before the value - confirm.
    stimulus = item_details[1].split(" ", 2)[2].strip()
    if stimulus == "Data Set":
        # Skip this question but still move on, otherwise the caller would
        # be handed the same question again.
        return _click_next(wait)

    # Folder is keyed on the first character of the unit line.
    # NOTE(review): a multi-digit unit number would collapse to its first digit.
    folder = 'unit'+unit_topic[0][0]
    image_folder = os.path.join(folder, "images")
    # Create the folders BEFORE listing them; listing a missing folder raised
    # FileNotFoundError on the first question of every unit.
    os.makedirs(image_folder, exist_ok=True)
    existing_files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
    question_id = f"question-{len(existing_files):03d}"

    choices = driver.find_elements(By.CSS_SELECTOR, "li.lrn-mcq-option")
    correct = driver.find_element(By.CSS_SELECTOR, "div.teacher-item-preview")
    distractors = correct.find_elements(By.CSS_SELECTOR, "p.rationale_paragraph")

    # NOTE(review): `.text` is rendered text without tags, so
    # convert_mml_to_latex cannot find any MathML elements in it; only the
    # table path below passes real HTML. Confirm whether innerHTML was intended.
    paragraphs = driver.find_elements(By.CSS_SELECTOR, "p.passage_para") + driver.find_elements(By.CSS_SELECTOR, "p.stem_paragraph")

    output = {
        "question_id": question_id,
        "unit": unit_topic[0],
        "topics": unit_topic[1:],
        "title": " ".join(title_words.split(" ")[1:]),
        # Fall back to the raw label instead of aborting the run on a label
        # missing from the mapping.
        "difficulty": difficulty.get(diff, diff),
        "question_text": "\n".join(convert_mml_to_latex(p.text) for p in paragraphs),
        "image_files": [],
        "tables": [],
        "choices": [],
        "correct_answer": driver.execute_script("return arguments[0].textContent;", correct.find_element(By.TAG_NAME, "h3")),
        # Rationale paragraphs as visible text, concatenated without separator.
        "solution": "".join(convert_mml_to_latex(d.text) for d in distractors),
        # The same paragraphs via textContent, one entry per paragraph.
        "explanations": [
            convert_mml_to_latex(driver.execute_script("return arguments[0].textContent;", d))
            for d in distractors
        ],
        "tags": ul[7].text.split('\n'),
    }

    letters = ['A', 'B', 'C', 'D', 'E']
    for choice, letter in zip(choices, letters):
        text = convert_mml_to_latex(choice.find_element(By.CSS_SELECTOR, "p.choice_paragraph").text)
        output['choices'].append({"letter": letter, "text": text})

    for table in driver.find_elements(By.CSS_SELECTOR, "div.table_wrapper"):
        output["tables"].append(convert_mml_to_latex(table.get_attribute("innerHTML")))

    output["image_files"] = _download_images(view, image_folder, question_id)

    out_path = f"{folder}/{question_id}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {question_id} to {out_path}")

    # Report whether there is another question so the caller's loop can stop.
    return _click_next(wait)

In [None]:
# Drive the scraper: call scrape_question repeatedly until either the expected
# number of questions has been processed or it returns a falsy value.
total_questions = 967
i = 1
while True:
    # A total of None means "no known limit"; otherwise stop once it is exceeded.
    if total_questions is not None and i > total_questions:
        break

    # Progress label only; it is derived from the loop counter.
    question_id = f"question-{i:03d}"
    print(f"\n🧠 Scraping {question_id} of {total_questions if total_questions else '???'}")

    success = scrape_question(driver, wait)
    if success:
        i += 1
        continue

    print("✅ Finished scraping all questions or hit end.")
    break