In [1]:
import os
import json
import re
import time
import requests
import html
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Difficulty labels. `albert` lists the labels read from the scraped page and
# `tom` the labels written to the saved JSON; `difficulty` maps the former to
# the latter position by position.
albert = ["Easy", "Moderate", "Difficult"]
tom = ["Easy", "Medium", "Hard"]
difficulty = {site_label: saved_label for site_label, saved_label in zip(albert, tom)}

In [2]:
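# One-time setup, kept commented out: open a browser, log in manually, and
# save the session cookies to cookies_albert.json so they can be loaded later.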
# options = Options()
# options.add_argument("--start-maximized")
# driver = webdriver.Chrome(options=options)

# # Step 1: Visit a site and log in manually if needed
# driver.get("https://albert.io/log-in")  # or any target domain

# input("🔐 Log in manually and press ENTER here when done...")

# # Step 2: Save cookies to a file
# cookies = driver.get_cookies()

# with open("cookies_albert.json", "w", encoding="utf-8") as f:
#     json.dump(cookies, f, indent=2)

# print("✅ Cookies saved to cookies_albert.json")

# driver.quit()

In [3]:
# Start Chrome maximized and open the site before touching cookies: the
# browser must be on the domain before cookies can be added for it.
options = Options()
options.add_argument('--start-maximized')

driver = webdriver.Chrome(options=options)
driver.get('https://albert.io')

# Read the cookies that were saved to disk earlier.
with open('cookies_albert.json', 'r') as f:
    cookies = json.load(f)

for cookie in cookies:
    # A sameSite value outside the accepted set is dropped entirely.
    if 'sameSite' in cookie:
        if cookie['sameSite'] not in ('Strict', 'Lax', 'None'):
            del cookie['sameSite']

    # Strip fields that are removed before handing the cookie to the driver.
    for unsupported_key in ('hostOnly', 'storeId', 'session'):
        cookie.pop(unsupported_key, None)

    # Only an integer expiry is kept; any other type is discarded.
    # (A missing expiry falls back to 0, which is an int, so nothing is deleted.)
    if not isinstance(cookie.get('expiry', 0), int):
        del cookie['expiry']

    # A cookie needs at least a name and a value to be added.
    if not ('name' in cookie and 'value' in cookie):
        print(f"⚠️ Invalid cookie format skipped: {cookie}")
        continue

    try:
        driver.add_cookie(cookie)
    except Exception as e:
        print(f"❌ Skipping cookie '{cookie.get('name')}' → {e}")

# Reload so the page is requested again with the cookies in place.
driver.refresh()

In [5]:
def wrap_latex_if_needed(text):
    """Wrap ``text`` in LaTeX math delimiters after trimming whitespace.

    Text consisting only of letters, digits, spaces and the characters
    ``- + * / ( ) = . \\ ^`` is returned as ``$$ ... $$``. Anything else,
    including the empty string, is returned as ``$ ... $``.
    """
    body = text.strip()
    is_simple = re.match(r"^[-+*/()=0-9a-zA-Z.\\^ ]+$", body) is not None
    delimiter = "$$" if is_simple else "$"
    return f"{delimiter} {body} {delimiter}"

def convert_mml_to_latex(text):
    """Replace ``mjx-assistive-mml`` elements in an HTML fragment with LaTeX text.

    Each ``mjx-assistive-mml`` element that carries an ``alttext`` attribute is
    replaced by that attribute's value, wrapped by ``wrap_latex_if_needed``.
    Elements without ``alttext`` are left in place. The result is the text
    content of the fragment, with pieces stripped and joined by single spaces.
    """
    soup = BeautifulSoup(text, "html.parser")
    for node in soup.select("mjx-assistive-mml"):
        if not node.has_attr("alttext"):
            continue
        replacement = wrap_latex_if_needed(node["alttext"].strip())
        # Put the LaTeX string where the element was, then drop the element.
        node.insert_before(replacement)
        node.decompose()
    return soup.get_text(" ", strip=True)

def scrape_question(driver, wait):
    """Scrape the question currently displayed and save it as a JSON file.

    Reveals the solution, reads the question's unit, title, difficulty, text,
    images, tables, answer choices, explanation and tags from the page, writes
    the result to ``unit<N>/question-<NNN>.json`` (images are saved under
    ``unit<N>/images/``), and finally clicks the chevron-right button to move
    to the next question.

    Args:
        driver: Selenium WebDriver currently showing a question page.
        wait: WebDriverWait used for every explicit wait in this function.

    Returns:
        True if the chevron-right button was clicked and a question body is
        present afterwards; False if the button could not be found or clicked.

    Raises:
        ValueError: If no "Unit <N>" number can be found in the title bar,
            because the output folder name is derived from it.
        Selenium lookup and wait errors raised while reading the page are not
        caught here and propagate to the caller.
    """
    output = {
        "question_id": "",
        "unit": "",
        "topics": [],
        "title": "",
        "difficulty": "",
        "question_text": "",
        "image_files": [],
        "tables": [],
        "choices": [],
        "correct_answer": "",
        "solution": "",
        # Maps choice letter -> explanation text (filled in below).
        "explanations": {},
        "tags": [],
    }

    # --- Reveal the solution so the explanation is present in the DOM ---
    show_btn = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-testid='practice-view__toggle-solution-btn']"))
    )
    show_btn.click()
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".markdown-renderer-v2")))
    print("✅ Solution revealed.")

    # --- Unit info: determines the output folder ---
    unit_el = driver.find_element(By.CSS_SELECTOR, ".pv-title-bar__title .paragraph")
    unit_text = unit_el.text.strip()
    unit_match = re.search(r"Unit (\d+)", unit_text)
    if not unit_match:
        # Without a unit number there is no folder to write to; fail with a
        # clear message instead of an UnboundLocalError further down.
        raise ValueError(f"Could not find a 'Unit <N>' number in title bar text: {unit_text!r}")
    output["unit"] = unit_text
    folder = f"unit{unit_match.group(1)}"
    image_folder = os.path.join(folder, "images")
    os.makedirs(image_folder, exist_ok=True)  # also creates `folder`

    # The question number is the count of regular files already in the unit
    # folder (the images sub-directory is not a file, so it is not counted).
    num = sum(os.path.isfile(os.path.join(folder, name)) for name in os.listdir(folder))

    question_id = f"question-{num:03d}"
    output['question_id'] = question_id
    output["title"] = driver.find_element(By.CSS_SELECTOR, "h1.u-mar_0").text

    # Translate the page's difficulty label; keep the raw label if it is not
    # one of the known ones rather than failing with a KeyError.
    difficulty_label = driver.find_element(By.CSS_SELECTOR, "span.a-text.content-primary.a-text--size-xs").text
    output["difficulty"] = difficulty.get(difficulty_label.strip(), difficulty_label)

    # --- Question text ---
    question_body = driver.find_element(By.CSS_SELECTOR, "div.question-wrapper__body")
    markdown = question_body.find_element(By.CSS_SELECTOR, "div.markdown-renderer-v2")
    output["question_text"] = markdown.text

    # --- Images ---
    img_tags = driver.find_elements(By.CSS_SELECTOR, "img.image-supplement__image")
    for i, img in enumerate(img_tags):
        img_url = img.get_attribute("src")
        ext = os.path.splitext(urlparse(img_url).path)[1] or ".png"
        local_filename = f"{question_id}-img{i+1}{ext}"
        local_path = os.path.join(image_folder, local_filename)

        try:
            # A timeout keeps a stalled download from hanging the scrape, and
            # raise_for_status avoids saving an HTTP error page as an image.
            response = requests.get(img_url, timeout=30)
            response.raise_for_status()
            with open(local_path, "wb") as f:
                f.write(response.content)
            output["image_files"].append(f"images/{local_filename}")
            print(f"🖼 Downloaded: {local_filename}")
        except Exception as e:
            print(f"⚠️ Image download failed: {e}")

    # --- Tables (stored as raw inner HTML) ---
    tables = markdown.find_elements(By.CSS_SELECTOR, "div.markdown-table-wrapper")
    for table in tables:
        output["tables"].append(table.get_attribute("innerHTML"))

    # --- Answer choices ---
    choice_elements = driver.find_elements(By.CSS_SELECTOR, "label.mcq-option")
    for el in choice_elements:
        letter = el.find_element(By.CSS_SELECTOR, ".mcq-option__letter").text.strip()
        content_blocks = el.find_elements(By.CSS_SELECTOR, ".mcq-option__content .paragraph")
        content = "\n".join(p.text for p in content_blocks)
        output["choices"].append({"letter": letter, "text": content})

    # --- Explanation: first paragraph is the correct answer, then sections
    # introduced by underlined "Solution" / "Explanation of Distractors" ---
    explanation_heading = driver.find_element(By.XPATH, "//h2[contains(text(), 'Explanation')]")
    explanation_block = explanation_heading.find_element(By.XPATH, "following-sibling::div[contains(@class, 'markdown-renderer-v2')]")
    paragraphs = explanation_block.find_elements(By.CSS_SELECTOR, "div.paragraph")

    correct_answer = ""
    solution = []
    distractors = {}

    section = None

    for i, p in enumerate(paragraphs):
        inner_html = p.get_attribute("innerHTML").strip().lower()

        if i == 0:
            correct_answer = p.text

        elif "<u>solution</u>" in inner_html:
            section = "solution"

        elif "<u>explanation of distractors</u>" in inner_html:
            section = "distractors"

        elif section == "solution":
            solution.append(p.text)

        elif section == "distractors":
            plain = p.text.strip()
            # `\s+` (single backslash in a raw string) matches the whitespace
            # between "choice" and the quoted letter; a doubled backslash
            # would demand a literal "\s" in the text and never match.
            choice_match = re.search(r"choice\s+'([a-e])'", plain.lower())
            if choice_match:
                distractors[choice_match.group(1).upper()] = p.text

    output["correct_answer"] = correct_answer
    output["solution"] = "\n\n".join(solution)
    output["explanations"] = distractors

    # --- Tags (first list) and topics (second list, when present) ---
    tag_section = driver.find_elements(By.CSS_SELECTOR, "div.tags-standards-list.u-mar-b_2")
    if tag_section:
        tag_links = tag_section[0].find_elements(By.CSS_SELECTOR, "a.a-chip")
        output["tags"] = [t.text.strip() for t in tag_links]
    if len(tag_section) > 1:
        top_links = tag_section[1].find_elements(By.CSS_SELECTOR, "a.a-chip")
        output["topics"] = [t.text.strip() for t in top_links]

    # --- Save ---
    out_path = f"{folder}/{question_id}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {question_id} to {out_path}")

    # Pause for 5 seconds whenever the question number is a multiple of 20
    # (presumably to throttle requests to the site).
    if num % 20 == 0:
        time.sleep(5)
        print("I waited")

    # --- Advance to the next question ---
    # NOTE(review): the wait after the click only checks that a question body
    # is present, which is already true for the current question; confirm the
    # page has actually changed before relying on this for slow loads.
    try:
        buttons = driver.find_elements(By.CSS_SELECTOR, "button")
        next_chevron = next(
            btn for btn in buttons if "fa-chevron-right" in btn.get_attribute("innerHTML") and "--disabled" not in btn.get_attribute("class")
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", next_chevron)
        wait.until(EC.element_to_be_clickable(next_chevron))
        next_chevron.click()
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.question-wrapper__body"))
        )
        print("➡️ Clicked chevron-right to load next question.")
        return True

    except Exception as e:
        # Also reached when no enabled chevron exists (StopIteration from next()).
        print(f"⚠️ Could not click chevron-right: {e}")
        return False

        
# XPath selectors for the per-unit pages (defined here but not used by the loop below).
u_pages = [
    '//*[@id="-unit-1-exploring-onevariable-data"]',
    '//*[@id="-unit-2-exploring-twovariable-data"]',
    '//*[@id="-unit-3-collecting-data"]',
    '//*[@id="-unit-4-probability-random-variables-and-probability-distributions"]',
    '//*[@id="-unit-5-sampling-distributions"]',
    '//*[@id="-unit-6-inference-for-categorical-data-proportions"]',
    '//*[@id="-unit-7-inference-for-quantitative-data-means"]',
    '//*[@id="-unit-8-inference-for-categorical-data-chisquare"]',
    '//*[@id="-unit-9-inference-for-quantitative-data-slopes"]',
]

# XPath selectors for the unit assessments, built from the unit number so that
# every unit from 1 to 9 appears exactly once.
a_pages = [f'//*[@id="ap-statistics-unit-{unit}-assessment"]' for unit in range(1, 10)]

for page in a_pages:
    # Open the assessments overview and enter the assessment for this unit.
    driver.get('https://www.albert.io/ap-statistics/assessments')
    wait = WebDriverWait(driver, 5)
    select = wait.until(EC.element_to_be_clickable((By.XPATH, page)))
    select.click()

    # The list header carries the question count, e.g. "15 questions".
    heading = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "span.questions-list-header__heading"))
    )
    total_text = heading.text
    match = re.search(r"(\d+)\s+questions", total_text.lower())
    total_questions = int(match.group(1)) if match else None
    print(f"📊 Total questions found: {total_questions}")

    # Scrape until the known total is reached; if the total is unknown, keep
    # going until scrape_question reports it could not advance.
    i = 1
    while total_questions is None or i <= total_questions:

        print(f"\n🧠 Scraping {i} of {total_questions if total_questions else '???'}")
        success = scrape_question(driver, wait)

        if not success:
            print("✅ Finished scraping all questions or hit end.")
            break

        i += 1

📊 Total questions found: 15

🧠 Scraping 1 of 15
✅ Solution revealed.
✅ Saved question-268 to unit1/question-268.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping 2 of 15
✅ Solution revealed.
✅ Saved question-269 to unit1/question-269.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping 3 of 15
✅ Solution revealed.
✅ Saved question-270 to unit1/question-270.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping 4 of 15
✅ Solution revealed.
🖼 Downloaded: question-271-img1.png
✅ Saved question-271 to unit1/question-271.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping 5 of 15
✅ Solution revealed.
✅ Saved question-272 to unit1/question-272.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping 6 of 15
✅ Solution revealed.
🖼 Downloaded: question-273-img1.png
✅ Saved question-273 to unit1/question-273.json
➡️ Clicked chevron-right to load next question.

🧠 Scraping 7 of 15
✅ Solution revealed.
✅ Saved question-274 to unit1/questio