Web Scraper for EWeb Courses - Canavan

In [None]:
#imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import sys
from selenium.webdriver.support.select import Select
import pandas as pd
import re
import os
from datetime import datetime

In [None]:
# Make sure every output folder exists before any CSV is written into it;
# exist_ok=True makes re-running this cell harmless.
for output_dir in ("webscrapedInfo", "intermediateFiles", "finalFiles"):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Get user input for filename prefix.
# The prefix is validated here, before any scraping happens: a prefix containing a
# path separator would point every output path at a folder that was never created,
# and that only surfaces when the first CSV is written, after the scrape has run.
while True:
    filename_prefix = input("Enter the your prefered filename + the semester you are viewing: ").strip()
    if filename_prefix and "/" not in filename_prefix and "\\" not in filename_prefix:
        break
    print("Please enter a non-empty name that does not contain '/' or '\\'.")

# Today's date (YYYY-MM-DD) is appended to every filename
date_str = datetime.today().strftime('%Y-%m-%d')

# Define filenames with respective directories
all_courses_filename = f"webscrapedInfo/{filename_prefix}_{date_str}.csv"
instructor_schedule_filename = f"intermediateFiles/{filename_prefix}_Instructor_Course_Schedule_{date_str}.csv"
instructor_split_filename = f"intermediateFiles/{filename_prefix}_Instructor_Course_Schedule_MultipleInstructorSplit_{date_str}.csv"
instructor_final_filename = f"intermediateFiles/{filename_prefix}_Instructor_Course_Schedule_Final_{date_str}.csv"
instructor_final_formatted_filename = f"intermediateFiles/{filename_prefix}_Instructor_Course_Schedule_Final_Formatted_{date_str}.csv"
instructor_final_formatted1_filename = f"intermediateFiles/{filename_prefix}_Instructor_Course_Schedule_Final_Formatted1_{date_str}.csv"
prolog_ready_filename = f"finalFiles/{filename_prefix}_prologReadyCourses_{date_str}.csv"
prolog_ready_no_tba_filename = f"finalFiles/{filename_prefix}_prologReadyCoursesWithoutTBA_{date_str}.csv"

In [None]:
# Start a Firefox session and open the EWeb term-selection page
EWEB_TERM_SELECT_URL = "https://ssb-prod.ec.easternct.edu/PROD/bwskfcls.p_termsel"
IMPLICIT_WAIT = 10  # passed to implicitly_wait(); applies to the element lookups below

driver = webdriver.Firefox()
driver.implicitly_wait(IMPLICIT_WAIT)
driver.get(EWEB_TERM_SELECT_URL)

In [None]:
# select term
# Read the term dropdown and remember which term is selected by default
elem = driver.find_elements(by='id', value='term_id')
semester = Select(elem[0])
current_semester = semester.first_selected_option.text


def selTerm():
    """Ask whether to view the current term or go one semester back.

    Keeps prompting until the answer is 'back' or 'stay'; surrounding whitespace
    and letter case are ignored.

    Returns:
        str: either 'back' or 'stay'.
    """
    while True:
        answer = input(f"The schedule currently out is for {current_semester} semester. Would you like to view the schedule from one semester back? Type 'back' to view the last semester or type 'stay'to view the current semester: ").strip().lower()
        if answer in ('back', 'stay'):
            return answer
        print('Invalid Entry, please state "back" or "stay"')


x = selTerm()

# select term
if x == 'back':
    s = Select(elem[0])
    # NOTE(review): index 2 is assumed to be the previous term's position in the
    # dropdown - confirm against the live page.
    s.select_by_index(2)


In [None]:
# Submit the term form: the second <input> element on the page is clicked
elem = driver.find_elements('tag name', 'input')
submit_input = elem[1]
submit_input.click()

In [None]:
# Click the element with id 'oc_id' (the "both" option)
both = driver.find_element('id', 'oc_id')
both.click()

In [None]:
# Run the search by clicking the element named 'SUB_BTN'
btn = driver.find_element('name', 'SUB_BTN')
btn.click()

In [None]:
    # Collect every <table> element on the results page
tbl = driver.find_elements(by="tag name", value="table")


In [None]:
# Scrape the course tables into `info` (one list per table row), build a DataFrame,
# fill in continuation rows, and export the result to all_courses_filename.
info = []

# Positions of the <td> cells that hold the wanted fields, listed in the same order
# as the DataFrame columns below (Sub, Crse, Sec, Day, Time, Room, Instructor).
indices = [3, 4, 5, 10, 11, 20, 21]

# The first two tables on the page are skipped
for table in tbl[2:]:

    # Locate the rows in the respective table
    rows = table.find_elements("tag name", "tr")

    # The first three rows of each table are skipped
    for row in rows[3:]:

        # Locate all the columns in this row
        col = row.find_elements("tag name", "td")

        # A row with fewer cells than expected gets "N/A" for each missing field
        data = [
            col[col_index].text if col_index < len(col) else "N/A"
            for col_index in indices
        ]

        # Add the extracted data to info
        info.append(data)

# Creating of Pandas data frame
df = pd.DataFrame(info, columns=["Sub", "Crse", "Sec", "Day", "Time", "Room", "Instructor"])

# Resolving blanks: a row whose Sec cell is a single space inherits Sub, Crse and Sec
# from the row above it (presumably an extra meeting line for the same section).
# The loop starts at row 1 because row 0 has no row above it: row - 1 would be -1,
# which iloc reads as the LAST row of the frame.
for row in range(1, len(df)):
    if df.iloc[row, 2] == ' ':  # Check if Sec has ' '
        df.iloc[row, 0] = df.iloc[row - 1, 0]  # Copy values from the previous row
        df.iloc[row, 1] = df.iloc[row - 1, 1]
        df.iloc[row, 2] = df.iloc[row - 1, 2]

# Export the DataFrame to a CSV file
df.to_csv(all_courses_filename, index=False)

print(f"Data exported to {all_courses_filename}")


In [None]:
# Close the webpage: end the WebDriver session now that the data has been exported.
# The remaining cells only work on the CSV files.
driver.quit()

In [None]:
# Load the previously exported CSV file
df = pd.read_csv(all_courses_filename)

# Select only the necessary columns and drop duplicates to create unique instructor-course-day-time rows
df_unique = df[["Instructor", "Sub", "Crse", "Sec", "Day", "Time", "Room"]].drop_duplicates()

# Filter out rows where the instructor is "TBA".
# .copy() gives df_filtered its own data, so the column assignment below is not an
# assignment on a slice of df_unique (which raises SettingWithCopyWarning).
df_filtered = df_unique[df_unique["Instructor"] != "TBA"].copy()

# Remove "(P)" from instructor names and trim the whitespace left behind
df_filtered["Instructor"] = (
    df_filtered["Instructor"].str.replace(r"\(P\)", "", regex=True).str.strip()
)

df_filtered.to_csv(instructor_schedule_filename, index=False)

print(f"Instructor course schedule exported to {instructor_schedule_filename}")

In [None]:
# Split classes that list multiple instructors into one row per instructor.
# Load the cleaned instructor schedule CSV (path held in instructor_schedule_filename)
df = pd.read_csv(instructor_schedule_filename)

# Function to split on every second comma while keeping instructor names intact
def split_instructors(instructor):
    """Split an instructor cell into individual "Last, First" names.

    Args:
        instructor: Cell value from the Instructor column. Expected to be a string
            such as "Smith, John, Doe, Jane"; a missing value (NaN) is accepted.

    Returns:
        list[str]: One whitespace-trimmed "Last, First" string per instructor,
        without the separating comma. A non-empty value that contains no
        "Last, First" pair is returned as a single name rather than being dropped.
        Non-string or blank input gives an empty list.
    """
    # Missing cells arrive as float NaN from read_csv; re.findall would raise on them
    if not isinstance(instructor, str):
        return []

    # Each match is one "Last, First" pair; the optional comma that separates it from
    # the next pair is consumed but kept outside the captured group.
    pairs = [name.strip() for name in re.findall(r'([^,]+,\s[^,]+),?', instructor)]
    if pairs:
        return pairs

    # No comma-separated pair found: keep the whole value so the row is not lost
    whole_name = instructor.strip()
    return [whole_name] if whole_name else []

# Build one output record per (instructor, course row) pair: every name returned by
# split_instructors gets its own copy of the row's course fields.
expanded_data = [
    [name.strip(), record["Sub"], record["Crse"], record["Sec"], record["Day"], record["Time"], record["Room"]]
    for _, record in df.iterrows()
    for name in split_instructors(record["Instructor"])
]

# Create a new DataFrame with split instructors
df_expanded = pd.DataFrame(
    expanded_data,
    columns=["Instructor", "Sub", "Crse", "Sec", "Day", "Time", "Room"],
)

# Export to a new CSV
df_expanded.to_csv(instructor_split_filename, index=False)

print(f"Exported to {instructor_split_filename}")

In [None]:
# Collapse each instructor name into one lowercase token (no commas, no spaces)

# Load the CSV with multiple instructors split
df = pd.read_csv(instructor_split_filename)

# Strip commas first, then spaces, then lowercase what is left
instructor_names = df["Instructor"]
instructor_names = instructor_names.str.replace(",", "", regex=False)
instructor_names = instructor_names.str.replace(" ", "", regex=False)
df["Instructor"] = instructor_names.str.lower()

# Export the updated DataFrame to a new CSV
df.to_csv(instructor_final_filename, index=False)

print(f"Final instructor course schedule exported to {instructor_final_filename}")

In [None]:
# Time, name, and lowercase formatting.
# Load the final instructor course schedule CSV (path held in instructor_final_filename)
df = pd.read_csv(instructor_final_filename)

# Function to convert time to military format
def format_time(time_str):
    """Convert a time range such as "9:00 am-10:15 am" to "0900, 1015".

    Args:
        time_str: Cell value from the Time column (string or missing value).

    Returns:
        str: The dash-separated times rewritten as 24-hour "HHMM" values joined by
        ", ". Returns "TBA" when the value is missing, is "tba" in any letter case,
        or has any part that does not parse as "%I:%M%p".
    """
    if pd.isna(time_str):
        return "TBA"
    if time_str.strip().lower() == "tba":
        return "TBA"

    # Lowercase and drop spaces so "9:00 AM" and "9:00am" are handled the same way
    compact = time_str.lower().replace(" ", "")

    try:
        # One entry per dash-separated piece, colon removed by the "%H%M" output format
        converted = [
            datetime.strptime(piece, "%I:%M%p").strftime("%H%M")
            for piece in compact.split('-')
        ]
    except ValueError:
        # Any piece in an unexpected format makes the whole value "TBA"
        return "TBA"

    return ", ".join(converted)

# Rewrite the Time column in military format and drop hyphens from instructor names
df["Time"] = df["Time"].apply(format_time)
df["Instructor"] = df["Instructor"].str.replace("-", "", regex=True)


def _lower_if_str(value):
    """Return value lowercased when it is a string, otherwise unchanged."""
    return value.lower() if isinstance(value, str) else value


# Lowercase every string cell in the frame (this also turns "TBA" into "tba")
df = df.apply(lambda column: column.map(_lower_if_str))

# Export modified DataFrame to a new CSV
df.to_csv(instructor_final_formatted_filename, index=False)

print(f"Final instructor schedule with formatted time exported to {instructor_final_formatted_filename}")

In [None]:
# Load the CSV file
df = pd.read_csv(instructor_final_formatted_filename)

# Define the replacements (regex pattern -> replacement) for the 'Room' column.
# This cell reads and rewrites the same file, so it must be safe to run twice.
# Patterns whose replacement merely extends the matched text carry a negative
# lookahead, so text that is already expanded is left alone ("goddard" must not
# become "goddardd").
replacements = {
    r"goddar(?!d)": "goddard",
    r"scienc(?!e)": "science",
    "faic": "fine_arts",
    "combld": "communications",
    r"sport(?!s)": "sports",
    "cece": "the_center_for_early_childhood_education",
    r"librar(?!y)": "library",
    "planet plt": "planetarium"
}

# Apply the replacements to the 'Room' column
df["Room"] = df["Room"].replace(replacements, regex=True)

# Save the modified CSV
df.to_csv(instructor_final_formatted_filename, index=False)

print(f"Replacements applied and exported to {instructor_final_formatted_filename}")


In [None]:
df = pd.read_csv(instructor_final_formatted_filename)

# Remove stars from the 'Instructor' column and a trailing '/gym' from the 'Room' column
df = df.assign(
    Instructor=df["Instructor"].str.replace(r"\*", "", regex=True),
    Room=df["Room"].str.replace(r"/gym$", "", regex=True),
)

# Write the result back to the same file
df.to_csv(instructor_final_formatted_filename, index=False)
print(f"Replacements applied and exported to {instructor_final_formatted_filename}")
 

In [41]:
# Combine each row into one string.
# Load the formatted instructor course schedule CSV (path held in instructor_final_formatted_filename)
df = pd.read_csv(instructor_final_formatted_filename)

# Function to combine the columns of one row into a single string
def combine_columns(row):
    """Build the one-line text form of a schedule row.

    Args:
        row: Mapping-like row with the keys 'Instructor', 'Sub', 'Crse', 'Sec',
            'Day', 'Time' and 'Room'.

    Returns:
        str: "<Instructor>, <Sub><Crse>,<Sec>, <Day>, <Time>, <Room>". Fields are
        separated by ", " except Sec, which follows the course code after a comma
        with no space.
    """
    course_and_section = f"{row['Sub']}{row['Crse']},{row['Sec']}"
    fields = [
        f"{row['Instructor']}",
        course_and_section,
        f"{row['Day']}",
        f"{row['Time']}",
        f"{row['Room']}",
    ]
    return ", ".join(fields)

# One formatted string per schedule row
df_combined = df.apply(combine_columns, axis=1)

# Put the strings in a single 'Courses' column, sort them in ascending string order
# and renumber the index
df_prolog_ready = (
    pd.DataFrame(df_combined, columns=["Courses"])
    .sort_values(by="Courses", ascending=True)
    .reset_index(drop=True)
)

# Export the combined and sorted DataFrame to a new CSV
df_prolog_ready.to_csv(prolog_ready_filename, index=False)

print(f"Exported to {prolog_ready_filename}")

Exported to finalFiles/testCoursesFall2025_prologReadyCourses_2025-03-12.csv


In [None]:
# Version of the Prolog-ready file without TBA entries
# Load the prolog-ready courses CSV
df = pd.read_csv(prolog_ready_filename)

# Flag every row whose 'Courses' text contains "TBA" anywhere (any letter case);
# missing values count as not containing it
has_tba = df["Courses"].str.contains("TBA", case=False, na=False)
df_filtered = df[~has_tba]

# Export the filtered DataFrame to a new CSV
df_filtered.to_csv(prolog_ready_no_tba_filename, index=False)

print(f"Filtered TBA and exported to {prolog_ready_no_tba_filename}")


GITHUB, BLANKS, Downloads, GITHUB REPOSITORY
DEVELOP WEB SCRAPER FOR OH
Done

2/21/25
Think about the questions you want to answer with Prolog, and the facts and relationships you need to create: In progress
ADD days to Courses!: Done
Format names to be one string: Done
Format both to have consistent format : Done
Separate course times out (maybe): Done
Format all into lowercase : Done
Add in the sections and rooms : Done
Get rid of "-" in the instructor names: Done

Make user inputs user-friendly: extract the current semester and ask "would you like to go back to the last semester?": Done
Make new CSV for Faculty Contact Info: Done
Get rid of the * in staff for Fall 2025: Done
Set up filenames (filenaming consistency) to come from an input variable and go into folders accordingly: Done
Update README
Get rid of the gym slash and separate the greenhouse line: Done


Format GitHub add folders