In [1]:
import os
import json
import re
import time
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def get_total_question_count(soup):
    """
    Extract the total number of questions from the page's progress header.

    The text of the first element matching the fixed-height header selector
    is searched for the pattern "of <digits>", and the digits are returned
    as an int.

    Returns None when the pattern is not present, or when anything raises
    along the way (for example, the header element is missing); in the
    latter case a warning is printed.
    """
    total = None
    try:
        header_text = soup.select_one("div.h-\\[32px\\]").text.strip()
        found = re.search(r"of\s+(\d+)", header_text)
        if found is not None:
            total = int(found.group(1))
    except Exception as e:
        print(f"⚠️ Failed to get total question count: {e}")
    return total

def wrap_latex_if_needed(text):
    """
    Surround a math expression with LaTeX dollar delimiters.

    The input is trimmed first. If what remains consists solely of ASCII
    letters, digits, spaces and the characters - + * / ( ) = . ^ or a
    backslash, it is returned as "$$ expr $$". Anything else (including an
    empty string) is returned as "$ expr $".
    """
    expr = text.strip()
    is_simple = re.match(r"^[-+*/()=0-9a-zA-Z.\\^ ]+$", expr) is not None
    delimiter = "$$" if is_simple else "$"
    return f"{delimiter} {expr} {delimiter}"

def convert_mml_to_latex(text):
    """
    Flatten an HTML fragment to plain text, substituting LaTeX for MathJax nodes.

    Each ``mjx-assistive-mml`` element that carries an ``alttext`` attribute
    is replaced by that attribute's value, wrapped by ``wrap_latex_if_needed``.
    Elements without the attribute are left untouched. The text of the
    resulting document is returned with its pieces stripped and joined by
    single spaces.
    """
    fragment = BeautifulSoup(text, "html.parser")
    for node in fragment.select("mjx-assistive-mml"):
        if not node.has_attr("alttext"):
            continue
        wrapped = wrap_latex_if_needed(node["alttext"].strip())
        # Put the LaTeX string where the node was, then drop the node itself.
        node.insert_before(wrapped)
        node.decompose()
    return fragment.get_text(" ", strip=True)

def scrape_all_questions(driver, total_questions):
    """
    Scrape questions one after another, numbering them per unit folder.

    For each page the unit number is read from the "Unit And Topic" list to
    pick a ``unit<N>`` folder ("unitX" when no number is found). The first
    time a folder is seen, it and its ``images`` subfolder are created and
    existing ``q<number>.json`` files are inspected so numbering resumes
    after the highest existing id. ``scrape_question`` is then called with
    the next id for that folder.

    Parameters:
        driver: Selenium WebDriver already showing the first question.
        total_questions: upper bound on how many questions to scrape, or
            None to keep going until ``scrape_question`` reports failure.

    The loop also stops when the "Unit And Topic" heading is missing.
    """
    scraped = 1
    unit_question_counts = {}

    while total_questions is None or scraped <= total_questions:
        # Determine unit number from current page.
        # NOTE(review): this reads page_source before scrape_question's own
        # pause; if the previous Next click has not finished rendering, the
        # unit detected here may differ from the one scrape_question derives
        # for the output path — confirm against the live site.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        unit_tag = soup.find("h3", string="Unit And Topic")
        if not unit_tag:
            print("⚠️ Unit not found, skipping.")
            break

        # find_next() returns None when no <ul> follows the heading.
        unit_list = unit_tag.find_next("ul")
        unit_items = unit_list.find_all("li") if unit_list else []
        unit_text = unit_items[0].text.strip() if unit_items else "unitX"
        unit_match = re.search(r"(\d+)", unit_text)
        unit_folder = f"unit{unit_match.group(1)}" if unit_match else "unitX"

        # Count existing JSON files to resume numbering
        if unit_folder not in unit_question_counts:
            os.makedirs(os.path.join(unit_folder, "images"), exist_ok=True)
            id_matches = (
                re.fullmatch(r"q(\d+)\.json", name)
                for name in os.listdir(unit_folder)
            )
            existing_ids = [int(m.group(1)) for m in id_matches if m]
            unit_question_counts[unit_folder] = max(existing_ids, default=0) + 1

        question_id = f"q{unit_question_counts[unit_folder]:04d}"

        # Announce only after question_id is known; printing it earlier
        # raised UnboundLocalError on the first iteration.
        print(f"\n🧠 Scraping {question_id} of {total_questions if total_questions else '???'}")

        success = scrape_question(driver, question_id)
        if not success:
            print("✅ Finished scraping all questions or hit end.")
            break

        unit_question_counts[unit_folder] += 1
        scraped += 1
        
def scrape_question(driver, question_id):
    """
    Scrape the question currently displayed, save it as JSON, and click Next.

    The page source is parsed for title, unit/topic, metadata tags,
    difficulty, stimulus (table HTML or a downloaded image), question text,
    answer choices, the correct answer and the solution. Each extraction
    step is best-effort: a failure prints a warning and leaves the
    corresponding fields at their empty defaults.

    The record is written to ``unit<N>/<question_id>.json`` ("unitX" when
    the unit text contains no number); images go to ``unit<N>/images``.

    Parameters:
        driver: Selenium WebDriver showing a question page.
        question_id: identifier used for the JSON file name and as the
            prefix of downloaded image file names.

    Returns:
        True if the Next button was clicked, False if it could not be
        clicked within the wait timeout. The JSON file is written in
        both cases.
    """
    output = {
        "question_id": question_id,
        "title": "",
        "difficulty": "",
        "question_text": "",
        "image_files": [],
        "choices": [],
        "correct_answer": "",
        "solution": "",
        "distractor_explanations": {},
        "tags": [],
        "standards": [],
        "unit": "",
        "topic": "",
        "stimulus_type": "",
        "table_html": ""
    }

    # Site difficulty labels mapped to the labels stored in the JSON record.
    difficulty_labels = {
        "Emerging": "Easy",
        "Proficient": "Moderate",
        "Advanced": "Difficult",
    }

    wait = WebDriverWait(driver, 10)
    # Fixed pause before reading the DOM; presumably lets the page finish
    # rendering after the previous Next click — TODO confirm.
    time.sleep(1.5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    try:
        title_tag = soup.select_one("h2 span.question-text")
        if title_tag:
            output["title"] = title_tag.text.strip().replace("Q: ", "")
    except Exception as e:
        # Previously swallowed silently by a bare except.
        print(f"⚠️ Failed to extract title: {e}")

    try:
        unit_topic_block = soup.find("h3", string="Unit And Topic")
        if unit_topic_block:
            unit_list = unit_topic_block.find_next("ul")
            unit_items = unit_list.find_all("li")
            if unit_items:
                output["unit"] = unit_items[0].text.strip()
                if len(unit_items) > 1:
                    output["topic"] = unit_items[1].text.strip()
    except Exception as e:
        print(f"⚠️ Failed to extract unit/topic: {e}")

    # output["unit"] is final from here on, so derive the folders once.
    unit_num = re.search(r"(\d+)", output["unit"])
    unit_folder = f"unit{unit_num.group(1)}" if unit_num else "unitX"
    image_folder = os.path.join(unit_folder, "images")

    def download_image(img_url, suffix):
        """Save img_url into image_folder unless the file already exists.

        Returns the path relative to the unit folder. The path is returned
        even when the download fails (a warning is printed in that case).
        """
        ext = os.path.splitext(urlparse(img_url).path)[1] or ".png"
        local_filename = f"{question_id}-{suffix}{ext}"
        local_path = os.path.join(image_folder, local_filename)
        if os.path.exists(local_path):
            print(f"🖼 Image already exists: {local_filename}")
        else:
            try:
                # A timeout keeps one stalled request from hanging the run;
                # raise_for_status keeps HTTP error pages from being saved
                # as image files.
                response = requests.get(img_url, timeout=30)
                response.raise_for_status()
                with open(local_path, "wb") as f:
                    f.write(response.content)
                print(f"🖼 Downloaded image: {local_filename}")
            except Exception as e:
                print(f"⚠️ Failed to download image: {e}")
        return f"images/{local_filename}"

    def record_image(image_tag, suffix):
        """Download the tag's src (if any) and list it in the output record."""
        if image_tag and image_tag.has_attr("src"):
            output["image_files"].append(download_image(image_tag["src"], suffix))

    try:
        tag_lis = soup.select(".ItemMetadata__tags li")
        for li in tag_lis:
            if ":" in li.text:
                output["tags"].append(li.text.split(":")[0].strip())

            if "Question Difficulty" in li.text:
                difficulty_value = li.text.split(":")[-1].strip()
                if difficulty_value in difficulty_labels:
                    output["difficulty"] = difficulty_labels[difficulty_value]
    except Exception as e:
        print(f"⚠️ Failed to extract tags: {e}")

    try:
        stim_label = soup.find("label", string=re.compile("Stimulus Type"))
        parent_li = stim_label.find_parent("li") if stim_label else None
        if parent_li:
            stim_type = parent_li.text.split(":")[-1].strip()
            output["stimulus_type"] = stim_type
            os.makedirs(image_folder, exist_ok=True)

            stim_kind = stim_type.lower()
            if stim_kind == "table":
                table_wrapper = soup.find("div", class_="table_wrapper")
                if table_wrapper:
                    output["table_html"] = str(table_wrapper)

            elif stim_kind == "graph":
                record_image(soup.select_one(".standalone_image img"), "graph")

            elif stim_kind == "plot":
                stimulus_block = soup.select_one(".lrn_stimulus_content")
                if stimulus_block:
                    record_image(stimulus_block.find("img"), "plot")

            elif stim_kind == "computer output":
                # Prefer the table markup; fall back to an image.
                table_wrapper = soup.find("div", class_="table_wrapper")
                if table_wrapper:
                    output["table_html"] = str(table_wrapper)
                else:
                    record_image(
                        soup.select_one(".stimulus_reference img"),
                        "computer-output",
                    )
    except Exception as e:
        print(f"⚠️ Failed to extract stimulus/table/graph/plot/computer output: {e}")

    try:
        question_block = soup.select_one(".lrn_stimulus_content")
        if question_block:
            output["question_text"] = convert_mml_to_latex(str(question_block))

        for choice in soup.select(".lrn-mcq-option"):
            label = choice.select_one(".lrn-possible-answer")
            if not label:
                continue
            # Use the MathJax alttext as LaTeX when present, else plain text.
            mml = label.select_one("mjx-assistive-mml")
            if mml and mml.has_attr("alttext"):
                choice_text = wrap_latex_if_needed(mml["alttext"].strip())
            else:
                choice_text = label.get_text(strip=True)
            output["choices"].append(choice_text)
            if "--correct" in choice.get("class", []):
                output["correct_answer"] = choice_text
    except Exception as e:
        print(f"⚠️ Failed to extract question/choices/answer: {e}")

    try:
        sol_block = soup.select_one(".LearnosityDistractor.--valid .content")
        if sol_block:
            output["solution"] = convert_mml_to_latex(str(sol_block))
    except Exception as e:
        print(f"⚠️ Failed to extract solution: {e}")

    os.makedirs(unit_folder, exist_ok=True)
    out_path = os.path.join(unit_folder, f"{question_id}.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {question_id} to {out_path}")

    try:
        next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[data-cy="next-button"]')))
        next_button.click()
        print("➡️ Clicked Next")
    except Exception as e:
        print(f"⚠️ Failed to click Next button: {e}")
        return False

    return True

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import re




# Setup Selenium
options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(), options=options)

# MANUALLY LOG IN FIRST (Google login + navigate to first question)

input("Log in manually and navigate to the first question. Press ENTER to begin scraping...")


# Step 1: Get total number of questions from header
try:
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    total_questions = get_total_question_count(soup)
    print(f"📊 Total questions to scrape: {total_questions}")

except Exception as e:
    print(f"⚠️ Failed to extract total question count: {e}")
    total_questions = None

# Step 2: Scrape every question. total_questions may be None, in which case
# scraping continues until a page cannot be scraped or advanced.
# (The previous call used the undefined name `scrape_all_question`.)
scrape_all_questions(driver, total_questions)


Log in manually and navigate to the first question. Press ENTER to begin scraping... 


📊 Total questions to scrape: 967

🧠 Scraping question-001 of 967
✅ Saved question-001 to unit5/question-001.json
➡️ Clicked Next

🧠 Scraping question-002 of 967
✅ Saved question-002 to unit8/question-002.json
➡️ Clicked Next

🧠 Scraping question-003 of 967
✅ Saved question-003 to unit2/question-003.json
➡️ Clicked Next

🧠 Scraping question-004 of 967
✅ Saved question-004 to unit8/question-004.json
➡️ Clicked Next

🧠 Scraping question-005 of 967
✅ Saved question-005 to unit1/question-005.json
➡️ Clicked Next

🧠 Scraping question-006 of 967
✅ Saved question-006 to unit6/question-006.json
➡️ Clicked Next

🧠 Scraping question-007 of 967
✅ Saved question-007 to unit7/question-007.json
➡️ Clicked Next

🧠 Scraping question-008 of 967
✅ Saved question-008 to unit1/question-008.json
➡️ Clicked Next

🧠 Scraping question-009 of 967
✅ Saved question-009 to unit9/question-009.json
➡️ Clicked Next

🧠 Scraping question-010 of 967
🖼 Downloaded graph image: question-010-graph.png
✅ Saved question-010 t

KeyboardInterrupt: 