In [19]:
import pandas as pd
import numpy as np
import re

# Read all_slu_courses.csv into a DataFrame. The bare `courses_df` expression
# on the last line makes the notebook render the frame as this cell's output.
courses_df = pd.read_csv('all_slu_courses.csv')
courses_df

Unnamed: 0,Course Title,Credits,Prerequisites,Concurrent Enrollment,Major
0,AEP 0100 - Writing and Grammar Level 1: Beginner,Credit(s): 4 Credits,,,Academic English Program (AEP)
1,AEP 0120 - Reading and Vocabulary Level 1: Beg...,Credit(s): 4 Credits,,,Academic English Program (AEP)
2,AEP 0130 - Listening and Speaking Level 1: Beg...,Credit(s): 4 Credits,,,Academic English Program (AEP)
3,AEP 0200 - Writing and Grammar Level 2: High B...,Credit(s): 4 Credits,,,Academic English Program (AEP)
4,AEP 0220 - Reading and Vocabulary Level 2: Hig...,Credit(s): 4 Credits,,,Academic English Program (AEP)
...,...,...,...,...,...
6393,WGST 6790X - Feminist Approaches to Social Wor...,Credit(s): 3 Credits,,,Women's and Gender Studies (WGST)
6394,WGST 6833 - Employment Discrimination,Credit(s): 3 Credits,,,Women's and Gender Studies (WGST)
6395,WGST 6875 - Family Law,Credit(s): 3 Credits,,,Women's and Gender Studies (WGST)
6396,WGST 6930 - Special Topics: Women Studies,Credit(s): 3 Credits (Repeatable for credit),,,Women's and Gender Studies (WGST)


In [20]:
def extract_course_number_without_regex(title):
    """Return the four-digit course number found in a course title, or None.

    The title is split on whitespace and the first token that looks like a
    course number wins. A token qualifies when it is exactly four ASCII
    digits, optionally followed by an uppercase letter suffix (for example
    the "6790X" in "WGST 6790X - ..."), which is ignored.

    Args:
        title: Course title text such as "AEP 0100 - Writing and Grammar".
            Non-string values (e.g. NaN from a missing CSV cell) are accepted.

    Returns:
        The course number as an int (leading zeros are lost, so "0100"
        becomes 100), or None when no qualifying token exists or the input
        is not a string.
    """
    # Missing cells arrive as float NaN, which has no .split(); treat as "no number".
    if not isinstance(title, str):
        return None

    for part in title.split():
        digits, suffix = part[:4], part[4:]
        # isascii() guards against Unicode "digits" (e.g. superscripts) that
        # pass isdigit() but would make int() raise.
        if len(digits) != 4 or not (digits.isascii() and digits.isdigit()):
            continue
        # Accept a bare number or one with an uppercase letter suffix; tokens
        # such as "12345" or "1990s" are deliberately rejected.
        if suffix == "" or (suffix.isalpha() and suffix.isupper()):
            return int(digits)
    return None

# Attach the parsed course number to the raw frame so it can drive the filter.
courses_df['Course Number'] = courses_df['Course Title'].apply(extract_course_number_without_regex)

# Keep only rows whose course number is below 5000, then discard the helper
# column. Rows with a missing course number (NaN) fail the comparison and are
# therefore dropped as well.
filtered_courses_df = (
    courses_df
    .loc[courses_df['Course Number'] < 5000]
    .drop(columns='Course Number')
)


In [21]:
def split_course_title(title):
    """Split a course title at its first " - " separator.

    Args:
        title: Title text, e.g. "AEP 0100 - Writing and Grammar Level 1: Beginner".

    Returns:
        A 2-tuple ``(before, after)``. Only the first " - " is used as the
        split point, so later occurrences stay in ``after``. When the
        separator is absent the result is ``(title, "")``.
    """
    # partition() yields an empty tail when the separator is missing, which
    # covers the "no separator" fallback.
    before, _separator, after = title.partition(" - ")
    return before, after

# Expand each title into two new columns: the part before the first " - "
# ('Course Name') and the part after it ('Course Details').
filtered_courses_df[['Course Name', 'Course Details']] = (
    filtered_courses_df['Course Title']
    .apply(lambda course_title: pd.Series(split_course_title(course_title)))
)

# The combined title is redundant once both parts are stored separately.
filtered_courses_df = filtered_courses_df.drop(columns='Course Title')


In [22]:
import re

def process_credits(credits):
    """Parse a credits description into (max credits, repeatable flag).

    Args:
        credits: Text such as "Credit(s): 4 Credits" or
            "Credit(s): 3 Credits (Repeatable for credit)". Non-string
            values (e.g. NaN from a missing CSV cell) are accepted.

    Returns:
        A 2-tuple ``(max_credits, repeatable)``:
        * ``max_credits`` is the largest number in the text, so a range like
          "1-3 Credits" yields 3. Whole values are returned as int, and
          fractional values such as "1.5" as float. It is None when the text
          has no number or the input is not a string.
        * ``repeatable`` is True when the text contains the exact phrase
          "Repeatable for credit".
    """
    # A missing cell is a float NaN; the `in` test below would raise on it.
    if not isinstance(credits, str):
        return None, False

    repeatable = "Repeatable for credit" in credits

    # Match decimals as one token so "1.5" is read as 1.5, not as 1 and 5.
    amounts = [float(token) for token in re.findall(r'\d+(?:\.\d+)?', credits)]
    if not amounts:
        return None, repeatable

    largest = max(amounts)
    # Keep whole-number credits as int, as callers already expect.
    max_credits = int(largest) if largest.is_integer() else largest
    return max_credits, repeatable

# Turn every credits description into a numeric maximum plus a repeatable flag.
filtered_courses_df[['Number of Credits', 'Repeatable']] = (
    filtered_courses_df['Credits']
    .apply(lambda credit_text: pd.Series(process_credits(credit_text)))
)

# Cast the flag to a real boolean column, then drop the raw text it came from.
filtered_courses_df = (
    filtered_courses_df
    .astype({'Repeatable': bool})
    .drop(columns='Credits')
)


In [24]:
# Collapse the column to a flag: True where a value is present, False where it
# is missing. Re-running this cell on the already-converted column yields all
# True, because a boolean column has no missing values left.
filtered_courses_df['Concurrent Enrollment'] = filtered_courses_df['Concurrent Enrollment'].notna()

In [25]:
# Show the first 50 rows of the cleaned frame as this cell's output.
filtered_courses_df.head(50)

Unnamed: 0,Prerequisites,Concurrent Enrollment,Major,Course Name,Course Details,Number of Credits,Repeatable
0,,False,Academic English Program (AEP),AEP 0100,Writing and Grammar Level 1: Beginner,4,False
1,,False,Academic English Program (AEP),AEP 0120,Reading and Vocabulary Level 1: Beginner,4,False
2,,False,Academic English Program (AEP),AEP 0130,Listening and Speaking Level 1: Beginner,4,False
3,,False,Academic English Program (AEP),AEP 0200,Writing and Grammar Level 2: High Beginner,4,False
4,,False,Academic English Program (AEP),AEP 0220,Reading and Vocabulary Level 2: High Beginner,4,False
5,,False,Academic English Program (AEP),AEP 0230,Listening and Speaking Level 2: High Beginner,4,False
6,,False,Academic English Program (AEP),AEP 0250,English through Service Level 2: High Beginner,1,False
7,,False,Academic English Program (AEP),AEP 0300,Writing and Grammar Level 3: Intermediate,4,False
8,,False,Academic English Program (AEP),AEP 0310,Exploring Cultures Level 3: Intermediate,3,False
10,,False,Academic English Program (AEP),AEP 0320,Reading and Vocabulary Level 3: Intermediate,3,False


In [26]:
# Write the cleaned frame to a CSV file. index=False keeps the DataFrame's
# index out of the file, so only the data columns are written.
out_fileName = 'cleaned_slu_courses.csv'
filtered_courses_df.to_csv(out_fileName, index=False)