In [58]:
# imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json

In [59]:
# once you have a page open, scraping the classes on that page
def scrape_class_data(driver, institution_name):
    """Read every course-equivalency row shown on the currently loaded page.

    Args:
        driver: Selenium WebDriver whose current page shows a course list.
        institution_name: Only used in the progress banner printed below.

    Returns:
        A list with one dict per row, holding "alt_course_name" (text of the
        row's detail button) and "rpi_course_name" (text of the row's
        receive-course label).
    """
    print(f"\n{'='*20} Scraping courses for {institution_name} {'='*20}")

    def read_row(index):
        # Both cells of a row are addressed through index-suffixed element ids.
        rpi_label = driver.find_element(By.ID, f"gdvCourseEQ_lblReceiveCourseCode_{index}")
        alt_button = driver.find_element(By.ID, f"gdvCourseEQ_btnViewCourseEQDetail_{index}")
        return {
            "alt_course_name": alt_button.text,
            "rpi_course_name": rpi_label.text,
        }

    # The number of rows equals the number of elements whose id starts with
    # the receive-course label prefix.
    label_selector = '[id^="gdvCourseEQ_lblReceiveCourseCode_"]'
    row_total = len(driver.find_elements(By.CSS_SELECTOR, label_selector))

    rows = [read_row(index) for index in range(row_total)]

    print(f"Found {row_total} courses on current page")
    return rows


In [60]:
# scraping an institution's course list (handles pagination across its pages)
def scrape_uni_page(driver, institution_name):
    """Scrape all course pages for the institution currently open in the browser.

    The first page is scraped as-is. If a pager (element with class
    "pagination-tes" containing a table) is present, each further page is
    opened by clicking its numbered link and scraped as well.

    Args:
        driver: Selenium WebDriver showing the institution's first course page.
        institution_name: Name used in progress output and passed through to
            scrape_class_data.

    Returns:
        The combined list of course dicts from every page that was scraped.
        If paging fails part-way, the courses collected so far are returned.
    """
    print(f"\n{'='*20} Processing {institution_name} {'='*20}")
    # The first page is already displayed when we get here.
    all_courses = scrape_class_data(driver, institution_name)

    # find_elements returns an empty list instead of raising, so "there is no
    # pager" can be told apart from a genuine failure while paging.
    pagers = driver.find_elements(By.CLASS_NAME, "pagination-tes")
    pager_tables = pagers[0].find_elements(By.TAG_NAME, "table") if pagers else []

    if not pager_tables:
        print("No pagination found - single page only")
    else:
        # One table cell per page entry in the pager.
        num_pages = len(pager_tables[0].find_elements(By.TAG_NAME, "td"))
        print(f"Found {num_pages} total pages to process")

        try:
            for page in range(2, num_pages + 1):
                print(f"\n--- Processing Page {page} ---")
                # Need to refind the pagination table and link each time since page reloads
                pagination_table = driver.find_element(By.CLASS_NAME, "pagination-tes").find_element(By.TAG_NAME, "table")
                page_link = pagination_table.find_element(By.LINK_TEXT, str(page))
                page_link.click()
                time.sleep(2)
                # get the courses and extend
                all_courses.extend(scrape_class_data(driver, institution_name))
        except Exception as exc:
            # Best effort: keep what was collected, but say what really went
            # wrong. KeyboardInterrupt/SystemExit are deliberately not caught.
            print(
                f"Stopped paging through {institution_name} early "
                f"({type(exc).__name__}: {exc}); keeping courses collected so far"
            )

    print(f"\nTotal courses found for {institution_name}: {len(all_courses)}")
    return all_courses

In [61]:

# Navigate to a specific page of the paginated list, handling the "..." link in the pager at the top.
# This is for selecting the page.
def navigate_to_page(driver, page_number):
    """Click through the pager until `page_number` is the page being shown.

    Pages up to 10 are reached by clicking their numbered link directly.
    For higher pages the "..." link is clicked first; a numbered link is then
    clicked only for pages above 11, since the "..." link itself lands on 11.

    Args:
        driver: Selenium WebDriver showing a page with a "pagination-tes" pager.
        page_number: The page number to open.
    """
    def locate_pager_table():
        # The pager is a table nested inside the "pagination-tes" element.
        pager = driver.find_element(By.CLASS_NAME, "pagination-tes")
        return pager.find_element(By.TAG_NAME, "table")

    def follow_link(table, label):
        # Click the pager link with the given text, then give the page time to update.
        table.find_element(By.LINK_TEXT, label).click()
        time.sleep(2)

    table = locate_pager_table()
    beyond_first_block = page_number > 10

    if not beyond_first_block:
        # For pages 2-10, just click the page number
        follow_link(table, str(page_number))
        return

    # Need to click dots first to see pages 11-14
    follow_link(table, "...")

    # The page updated, so the old table reference must be looked up again.
    table = locate_pager_table()

    if page_number > 11:
        follow_link(table, str(page_number))

In [62]:

# driver function
def scrape_institutions(driver, page_numbers):
    """Scrape every institution listed on the given pages of the institution list.

    For the first entry of `page_numbers` the site is loaded from scratch
    (followed by a fixed pause so a captcha can be solved by hand); later
    entries are reached through the pager. On each page up to 50 institution
    buttons are clicked in turn, their courses scraped, and the view switched
    back to the institution list.

    Args:
        driver: Selenium WebDriver to drive.
        page_numbers: Sequence of page numbers to process, in order.

    Returns:
        A list of dicts with keys "institution_name" and "courses" (the list
        returned by scrape_uni_page), one per institution scraped.
    """
    all_institutions_data = []

    for page_number in page_numbers:
        # Visit the website if first page, otherwise navigate
        if page_number == page_numbers[0]:
            url = "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
            driver.get(url)

            time.sleep(60) # in case captcha is triggered, you might have to manually solve yourself
            if page_number > 1:
                navigate_to_page(driver, page_number)
        else:
            navigate_to_page(driver, page_number)

        # Loop through buttons 0-49
        for i in range(50):
            button_id = f"gdvInstWithEQ_btnCreditFromInstName_{i}"
            # find_elements returns [] rather than raising, so a page with
            # fewer than 50 institutions ends this page instead of aborting the
            # whole run and discarding everything collected so far.
            matches = driver.find_elements(By.ID, button_id)
            if not matches:
                print(f"No institution button {button_id} on page {page_number}; moving on")
                break
            button = matches[0]
            institution_name = button.text
            print(f"\n{'#'*80}\nProcessing institution {i+1}/50: {institution_name}\n{'#'*80}")
            try:
                button.click()
                # Wait for data to change by checking for presence of course table
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.ID, "gdvCourseEQ"))
                )
            except Exception:
                # On failure: report it, pause so the page can be refreshed and
                # the captcha solved by hand, then move on to the next institution.
                print(f"Failed to click institution {institution_name} with page number {page_number}")
                time.sleep(90)
                continue

            # scrape the page
            courses = scrape_uni_page(driver, institution_name)

            all_institutions_data.append({
                "institution_name": institution_name,
                "courses": courses
            })

            try:
                # click back to the first page, click btnSwitchView
                driver.find_element(By.ID, "btnSwitchView").click()
                # Wait for the institution list to be visible again
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.ID, "gdvInstWithEQ"))
                )
            except Exception:
                # Same manual-recovery pause as above. This happens like 0.5
                # times per page depending on how many you're scraping at once.
                print(f"Failed to click back to first page after {institution_name}, with page number {page_number}")
                time.sleep(90)

    print(f"\n{'='*50}\nProcessing Complete\n{'='*50}")
    print(f"Total institutions processed: {len(all_institutions_data)}")
    for inst in all_institutions_data:
        print(f"\nInstitution: {inst['institution_name']}")
        print(f"Number of courses: {len(inst['courses'])}")

    return all_institutions_data




In [63]:
import threading
# Shared list that process_pages extends with each worker's scraped results.
all_data = []
# Held by process_pages while it extends all_data.
lock = threading.Lock()

# drivers driver function lol
def process_pages(page_range):
    """Scrape one group of pages in its own Chrome browser and store the results.

    Opens a fresh Chrome instance, runs scrape_institutions over `page_range`
    and, if anything came back, extends the shared `all_data` list while
    holding `lock`. Any exception is printed rather than raised, and the
    browser is always closed.

    Args:
        page_range: List of page numbers handed to scrape_institutions.
    """
    chrome_options = webdriver.ChromeOptions()
    browser = webdriver.Chrome(options=chrome_options)
    try:
        scraped = scrape_institutions(browser, page_range)
        if not scraped:
            # Nothing to store; the finally clause still closes the browser.
            return
        with lock:
            all_data.extend(scraped)
    except Exception as err:
        print(f"Error processing pages {page_range}: {str(err)}")
    finally:
        browser.quit()

# Page numbers assigned to each worker thread; every thread runs its own browser.
# Each inner list is one thread's workload, processed in order.
# Four threads was fast enough while still leaving time to solve a captcha
# by hand if one was triggered.
page_ranges = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
    [13, 14] 
]

# uncomment to run

# threads = []
# for page_range in page_ranges:
#     thread = threading.Thread(target=process_pages, args=(page_range,))
#     threads.append(thread)
#     thread.start()
#     print(f"Started thread for pages {page_range}")

# # Wait for all threads to complete
# for thread in threads:
#     thread.join()

# print(f"Total institutions collected: {len(all_data)}")

In [64]:
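# rerun a specific page, which can be useful if you mess up; uncomment to run
# (scrape_institutions needs a live driver and treats every list entry as a page number)
# driver = webdriver.Chrome(options=webdriver.ChromeOptions())
# page_range = [8]
# data = scrape_institutions(driver, page_range)
# all_data.extend(data)
# driver.quit()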

In [65]:
import pandas as pd
# converting to csv and excel
# Load the cleaned scrape results. The encoding is given explicitly so the
# read does not depend on the platform's locale default.
with open('all_data_clean.json', 'r', encoding='utf-8') as f:
    all_data = json.load(f)

# Flatten the nested structure into one record per (institution, course) pair.
# This runs after the file has been closed; nothing below needs it open.
flattened_data = [
    {
        'Institution': institution['institution_name'],
        'Alt Course': course['alt_course_name'],
        'RPI Course': course['rpi_course_name'],
    }
    for institution in all_data
    for course in institution['courses']
]

# Naming the columns explicitly keeps the header row even if no courses exist.
df = pd.DataFrame(flattened_data, columns=['Institution', 'Alt Course', 'RPI Course'])

# Write the same table as both a spreadsheet and a CSV file.
df.to_excel('course_data.xlsx', index=False)
df.to_csv('course_data.csv', index=False)

print(f"Data exported to course_data.xlsx with {len(df)} courses from {len(all_data)} institutions")


Data exported to course_data.xlsx with 3159 courses from 388 institutions
