In [1]:
import os
import json
import re
import time
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wrap_latex_if_needed(text):
    """Surround *text* with LaTeX math delimiters.

    The text is stripped first. If what remains consists solely of ASCII
    letters, digits, spaces and the characters ``- + * / ( ) = . \\ ^``,
    it is wrapped as ``$$ ... $$``; anything else (including an empty
    string) is wrapped as ``$ ... $``.
    """
    expr = text.strip()
    is_simple = re.match(r"^[-+*/()=0-9a-zA-Z.\\^ ]+$", expr) is not None
    delimiter = "$$" if is_simple else "$"
    return f"{delimiter} {expr} {delimiter}"

def convert_mml_to_latex(text):
    """Return the plain text of an HTML fragment with math shown as LaTeX.

    Every ``mjx-assistive-mml`` element that carries an ``alttext``
    attribute is replaced by that attribute's value, wrapped in math
    delimiters by ``wrap_latex_if_needed``. Elements without ``alttext``
    are left in place. The remaining document is flattened to text, with
    pieces stripped and joined by single spaces.
    """
    document = BeautifulSoup(text, "html.parser")
    for node in document.select("mjx-assistive-mml"):
        if not node.has_attr("alttext"):
            continue
        replacement = wrap_latex_if_needed(node["alttext"].strip())
        # Put the LaTeX string where the element was, then drop the element.
        node.insert_before(replacement)
        node.decompose()
    return document.get_text(" ", strip=True)

def scrape_question(driver, question_id):
    """Scrape the question currently shown in *driver* and save it as JSON.

    The function reveals the solution, then collects the title, difficulty,
    question text, supplementary images, answer choices, explanation
    sections, tags and standards. The result is written to
    ``<unit_folder>/<question_id>.json`` and images are downloaded to
    ``<unit_folder>/images``. Finally it tries to advance to the next
    question.

    Every extraction step is best-effort: a failure is reported on stdout
    and the affected fields keep their empty defaults.

    Args:
        driver: Selenium WebDriver already displaying a question page.
        question_id: Identifier used for the JSON file name and as the
            prefix of downloaded image file names.

    Returns:
        True if an enabled "next" button was found and clicked and a
        question body is present afterwards; False otherwise.

    Raises:
        OSError: If the output folders or the JSON file cannot be written.
    """
    output = {
        "question_id": question_id,
        "unit": "",
        "title": "",
        "difficulty": "",
        "question_text": "",
        "image_files": [],
        "choices": [],
        "correct_answer": "",
        "solution": "",
        "distractor_explanations": {},
        "tags": [],
        "standards": [],
    }

    wait = WebDriverWait(driver, 10)

    # --- Unit -------------------------------------------------------------
    # The folder name comes from the "Unit N" text in the title bar; when
    # that text is missing or has no unit number, "unit1" is used.
    unit_folder = "unit1"
    try:
        unit_el = driver.find_element(By.CSS_SELECTOR, ".pv-title-bar__title .paragraph")
        unit_text = unit_el.text.strip()
        unit_match = re.search(r"Unit (\d+)", unit_text)
        if unit_match:
            output["unit"] = unit_text
            unit_folder = f"unit{unit_match.group(1)}"
    except Exception as e:
        print(f"⚠️ Could not determine unit, using '{unit_folder}': {e}")

    # makedirs creates intermediate folders, so this also creates unit_folder.
    image_folder = os.path.join(unit_folder, "images")
    os.makedirs(image_folder, exist_ok=True)

    # --- Reveal the solution ---------------------------------------------
    try:
        show_btn = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-testid='practice-view__toggle-solution-btn']"))
        )
        show_btn.click()
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".markdown-renderer-v2")))
        print("✅ Solution revealed.")
    except Exception as e:
        print(f"⚠️ Could not click 'Show Solution': {e}")

    # --- Title and difficulty --------------------------------------------
    try:
        output["title"] = driver.find_element(By.CSS_SELECTOR, "h1.u-mar_0").text
        output["difficulty"] = driver.find_element(By.CSS_SELECTOR, "span.a-text.content-primary.a-text--size-xs").text
    except Exception as e:
        print(f"⚠️ Could not extract title or difficulty: {e}")

    # --- Question text and images ----------------------------------------
    try:
        question_body = driver.find_element(By.CSS_SELECTOR, "div.question-wrapper__body")
        markdown = question_body.find_element(By.CSS_SELECTOR, "div.markdown-renderer-v2")
        output["question_text"] = convert_mml_to_latex(markdown.get_attribute("innerHTML"))

        # Fixed pause before looking for images (presumably to let them
        # finish loading — TODO confirm an explicit wait is not possible).
        time.sleep(1.5)

        img_tags = driver.find_elements(By.CSS_SELECTOR, "img.image-supplement__image")
        for index, img in enumerate(img_tags, start=1):
            img_url = img.get_attribute("src")
            ext = os.path.splitext(urlparse(img_url).path)[1] or ".png"
            local_filename = f"{question_id}-img{index}{ext}"
            local_path = os.path.join(image_folder, local_filename)

            try:
                # A timeout keeps one stalled download from hanging the whole
                # scrape; raise_for_status avoids saving an HTTP error page
                # as if it were image data.
                response = requests.get(img_url, timeout=30)
                response.raise_for_status()
                with open(local_path, "wb") as f:
                    f.write(response.content)
                output["image_files"].append(f"images/{local_filename}")
                print(f"🖼 Downloaded: {local_filename}")
            except Exception as e:
                print(f"⚠️ Image download failed: {e}")

    except Exception as e:
        print(f"⚠️ Failed to extract question text or images: {e}")

    # --- Answer choices ---------------------------------------------------
    try:
        for el in driver.find_elements(By.CSS_SELECTOR, "label.mcq-option"):
            letter = el.find_element(By.CSS_SELECTOR, ".mcq-option__letter").text.strip()
            content_blocks = el.find_elements(By.CSS_SELECTOR, ".mcq-option__content .paragraph")
            content_html = " ".join(p.get_attribute("innerHTML") for p in content_blocks)
            output["choices"].append({"letter": letter, "text": convert_mml_to_latex(content_html)})
    except Exception as e:
        print(f"⚠️ Could not extract choices: {e}")

    # --- Explanation: correct answer, solution, distractors ---------------
    try:
        explanation_heading = driver.find_element(By.XPATH, "//h2[contains(text(), 'Explanation')]")
        explanation_block = explanation_heading.find_element(
            By.XPATH, "following-sibling::div[contains(@class, 'markdown-renderer-v2')]"
        )
        paragraphs = explanation_block.find_elements(By.CSS_SELECTOR, "div.paragraph")

        correct_answer_text = ""
        solution_parts = []
        distractors = {}

        # The first paragraph is taken as the correct answer. Underlined
        # "Solution" / "Explanation of Distractors" headings switch which
        # section the following paragraphs are assigned to.
        section = None
        for index, p in enumerate(paragraphs):
            raw_html = p.get_attribute("innerHTML")
            lowered_html = raw_html.strip().lower()

            if index == 0:
                correct_answer_text = convert_mml_to_latex(raw_html)
            elif "<u>solution</u>" in lowered_html:
                section = "solution"
            elif "<u>explanation of distractors</u>" in lowered_html:
                section = "distractors"
            elif section == "solution":
                solution_parts.append(convert_mml_to_latex(raw_html))
            elif section == "distractors":
                # `\s` must be a single backslash in a raw string; a doubled
                # backslash would look for a literal "\" and never match.
                letter_match = re.search(r"choice\s+'([a-e])'", p.text.strip().lower())
                if letter_match:
                    distractors[letter_match.group(1).upper()] = convert_mml_to_latex(raw_html)

        output["correct_answer"] = correct_answer_text
        output["solution"] = "\n\n".join(solution_parts)
        output["distractor_explanations"] = distractors

    except Exception as e:
        print(f"⚠️ Failed to extract solution, distractors, or correct answer: {e}")

    # --- Tags and standards ----------------------------------------------
    try:
        tag_section = driver.find_element(By.CSS_SELECTOR, "div.tags-standards-list.u-mar-b_2")
        tag_links = tag_section.find_elements(By.CSS_SELECTOR, "a.a-chip")
        output["tags"] = [t.text.strip() for t in tag_links]

        standard_section = driver.find_element(By.CSS_SELECTOR, "div.tags-standards-list.u-mar-b_2.u-mar-t_6")
        standard_links = standard_section.find_elements(By.CSS_SELECTOR, "a.a-chip")
        output["standards"] = [s.text.strip() for s in standard_links]

    except Exception as e:
        print(f"⚠️ Failed to extract tags or standards: {e}")

    # --- Save -------------------------------------------------------------
    out_path = f"{unit_folder}/{question_id}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {question_id} to {out_path}")

    # --- Advance to the next question -------------------------------------
    try:
        # First button that contains a right-chevron icon and is not marked
        # disabled. Missing attributes are treated as empty strings so a
        # button without a class attribute cannot abort the search.
        next_chevron = next(
            (
                btn
                for btn in driver.find_elements(By.CSS_SELECTOR, "button")
                if "fa-chevron-right" in (btn.get_attribute("innerHTML") or "")
                and "--disabled" not in (btn.get_attribute("class") or "")
            ),
            None,
        )
        if next_chevron is None:
            print("⚠️ Could not click chevron-right: no enabled next button found")
            return False

        driver.execute_script("arguments[0].scrollIntoView(true);", next_chevron)
        wait.until(EC.element_to_be_clickable(next_chevron))
        next_chevron.click()
        # NOTE(review): this selector already matches on the current question
        # (it is used above), so this wait may return before the next
        # question has actually rendered — confirm against the live page.
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.question-wrapper__body")))
        print("➡️ Clicked chevron-right to load next question.")
        return True

    except Exception as e:
        print(f"⚠️ Could not click chevron-right: {e}")
        return False


In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import re




# Setup Selenium
# NOTE: the setup below is commented out, so this cell does not create
# `driver` itself. A WebDriver named `driver` must already exist in the
# session (presumably from an earlier cell — confirm before running).
# options = Options()
# options.add_argument("--start-maximized")
# driver = webdriver.Chrome(service=Service(), options=options)

# MANUALLY LOG IN FIRST (Google login + navigate to first question)

input("Log in manually and navigate to the first question. Press ENTER to begin scraping...")

# Step 1: Get total number of questions from header.
# The count is parsed from text of the form "<N> questions"; when it cannot
# be read, total_questions stays None and the loop below runs until
# scrape_question reports that it could not advance.
try:
    total_text = driver.find_element(By.CSS_SELECTOR, "span.questions-list-header__heading").text
    match = re.search(r"(\d+)\s+questions", total_text.lower())
    total_questions = int(match.group(1)) if match else None
    print(f"📊 Total questions found: {total_questions}")
except Exception as e:
    print(f"⚠️ Failed to extract total question count: {e}")
    total_questions = None

# Step 2: Scrape questions one by one, numbering them question-001, -002, ...
# Stop when the known total is reached or when scrape_question returns False.
i = 1
while total_questions is None or i <= total_questions:
    question_id = f"question-{i:03d}"
    print(f"\n🧠 Scraping {question_id} of {total_questions if total_questions else '???'}")
    success = scrape_question(driver, question_id)

    if not success:
        print("✅ Finished scraping all questions or hit end.")
        break

    i += 1


Log in manually and navigate to the first question. Press ENTER to begin scraping... 


📊 Total questions found: 71

🧠 Scraping question-001 of 71
✅ Solution revealed.
🖼 Downloaded: question-001-img1.jpeg
✅ Saved question-001 to unit9/question-001.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping question-002 of 71
✅ Solution revealed.
✅ Saved question-002 to unit9/question-002.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping question-003 of 71
✅ Solution revealed.
✅ Saved question-003 to unit9/question-003.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping question-004 of 71
✅ Solution revealed.
✅ Saved question-004 to unit9/question-004.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping question-005 of 71
✅ Solution revealed.
✅ Saved question-005 to unit9/question-005.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping question-006 of 71
✅ Solution revealed.
✅ Saved question-006 to unit9/question-006.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping question-007 of 71
✅ Solution revea