# Scraping Gradescope

Why is this necessary? Because Gradescope doesn't provide any way to export a complete list of all rubric items with each student's applied items, just aggregated scores on a per-problem or per-assignment basis.

We need everything. All of it.

So this is a notebook to automate some of the scraping.

Ideally this would drive an existing Chrome profile that is already logged in, but that turned out to be annoyingly difficult to set up with Selenium 4, so we'll have to settle for logging in manually each run...

In [1]:
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Version of the locally installed Chrome that the driver must match.
# This may need to be updated whenever Chrome updates;
# check the current version at chrome://version
chrome_version = "125.0.6422.142"

# Ask webdriver_manager for a chromedriver of that version; install() returns
# the path to the driver executable, which is later handed to ChromeService.
chromedriver_path = ChromeDriverManager(driver_version=chrome_version).install()
# Print the resolved path so it is visible in the notebook output.
print(chromedriver_path)

C:\Users\woods\.wdm\drivers\chromedriver\win64\125.0.6422.142\chromedriver-win32/chromedriver.exe


In [4]:
# Site root; hrefs scraped from the pages are appended to this.
BASE_URL = "https://www.gradescope.com"
# Assignment listing page of the course being scraped.
ASSIGNMENTS_URL = f"{BASE_URL}/courses/489293/assignments"

# Titles of the assignments to scrape; every other assignment is skipped.
EXTRACT_ASSIGNMENTS = ["Exam 5D"]

In [5]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.common.exceptions import TimeoutException
from collections import defaultdict
import time

# Chrome options object; nothing is customised on it here.
options = webdriver.ChromeOptions()

# Driver service built from the chromedriver executable path resolved earlier.
service = ChromeService(chromedriver_path)


def scrape_students(driver):
    """Return the text of each row of the rubric item's submissions table.

    The driver is expected to already be on a rubric item page; one string
    is returned per ``<tr>`` found inside the submissions table.
    """
    rows_xpath = "//table[contains(@class, 'js-rubricItemSubmissions')]//tr"
    students = []
    for row in driver.find_elements(By.XPATH, rows_xpath):
        students.append(row.text)
    return students

def _collect_rubric_items(driver):
    """Parse the rubric table of the question page the driver is currently on.

    Returns a mapping keyed by rubric header text. A stand-alone rubric item
    maps to ``{'href': ..., 'points': ...}``; a rubric group maps to
    ``{item text: {'href': ..., 'points': ...}}`` with one entry per child row.
    """
    rubric_items = defaultdict(dict)
    # Initialised up front so that a child row encountered before any header
    # row is filed under None rather than raising UnboundLocalError.
    current_header = None

    tr_elems = driver.find_elements(By.XPATH,
        "//table[@class='table']/tbody/tr")
    for tr_elem in tr_elems:
        column_title_elem = tr_elem.find_element(By.XPATH,
            "td[@class='statisticsTable--column questionRubricTable--column-title']")
        div_elem = column_title_elem.find_element(By.XPATH, "./div")
        class_tag = div_elem.get_dom_attribute('class')
        if not class_tag:
            # no class on the div: this row starts a new top-level entry
            # (either a stand-alone rubric item or a rubric group name)
            current_header = div_elem.text
        try:
            link = div_elem.find_element(By.XPATH, './a')
        except NoSuchElementException:
            # no link in the row, so this is a rubric group name;
            # its children follow in later rows
            continue

        href = link.get_dom_attribute('href')
        points = tr_elem.find_element(By.XPATH,
            "td[@class='statisticsTable--column questionRubricTable--column-points']").text
        if class_tag == "table--childRow":
            # this rubric item belongs to the most recent rubric group
            rubric_items[current_header][link.text] = {'href': href, 'points': points}
        else:
            # this is a regular (ungrouped) rubric item
            rubric_items[current_header] = {'href': href, 'points': points}

    return rubric_items


def _attach_students(rubric_items, driver):
    """Visit every rubric item page and store its rows under 'students'."""
    for entry in rubric_items.values():
        if "href" in entry:
            # top-level entry is itself a rubric item
            driver.get(f"{BASE_URL}{entry['href']}")
            entry['students'] = scrape_students(driver)
            continue
        # otherwise the entry is a rubric group: visit each child item
        for child in entry.values():
            driver.get(f"{BASE_URL}{child['href']}")
            child['students'] = scrape_students(driver)


def scrape_assignment(assignment_nam, assignment_href, driver):
    """Scrape one assignment and dump its rubric items, with students, to JSON.

    Args:
        assignment_nam: assignment title; used to name the output file
            ``"<title>-rubric.json"`` in the current working directory.
        assignment_href: site-relative href of the assignment
            (appended to BASE_URL).
        driver: a logged-in Selenium WebDriver.

    Raises:
        TimeoutException: if a question page shows no rubric table rows
            within 5 seconds.
    """
    # go to the assignment statistics page
    driver.get(f"{BASE_URL}{assignment_href}/statistics")
    time.sleep(1)

    # extract links to each question page; the hrefs are read out as plain
    # strings now because the elements go stale once the driver navigates away
    question_hrefs = {
        question_link.text: question_link.get_dom_attribute('href')
        for question_link in driver.find_elements(
            By.XPATH, "//div[@class='statisticsItem--title']/a")
    }

    # visit each question page
    question_rubrics = {}
    for question_title, question_href in question_hrefs.items():
        driver.get(f"{BASE_URL}{question_href}")
        try:
            WebDriverWait(driver, 5).until(
                expected_conditions.presence_of_element_located(
                    (By.XPATH, "//table[@class='table']/tbody/tr"))
            )
        except TimeoutException:
            print("timeout occured")
            raise

        # get all rubric items first, then visit each rubric item page to
        # extract the students to whom the item was applied
        rubric_items = _collect_rubric_items(driver)
        _attach_students(rubric_items, driver)

        question_rubrics[question_title] = rubric_items

    # Use the function's own parameter for the file name rather than relying
    # on a same-named global happening to exist in the caller's scope.
    with open(f"{assignment_nam}-rubric.json", 'w') as f:
        json.dump(question_rubrics, f, indent=2)
    

# Open a browser session, wait for a manual login, then scrape every
# assignment whose title is listed in EXTRACT_ASSIGNMENTS.
with webdriver.Chrome(service=service, options=options) as driver:
    driver.get(ASSIGNMENTS_URL)

    # block until the user has logged in through the browser window
    input("please log in before pressing any key to continue...")

    title_elems = driver.find_elements(By.XPATH, "//td/div[@class='assignments--rowTitle']")

    # map assignment title -> site-relative href of the assignment
    assignments = {}
    for title_elem in title_elems:
        anchor = title_elem.find_element(By.TAG_NAME, "a")
        assignments[title_elem.text] = anchor.get_dom_attribute('href')

    for assignment_name, href in assignments.items():
        if assignment_name not in EXTRACT_ASSIGNMENTS:
            continue
        print(f"scraping assignment {assignment_name}: {href}")
        scrape_assignment(assignment_name, href, driver)

please log in before pressing any key to continue... 


scraping assignment Exam 5D: /courses/489293/assignments/2739647
