In [None]:
import pandas as pd
import numpy as np
import re

courses_df = pd.read_csv('all_slu_courses.csv')
courses_df

In [None]:
def extract_course_number_without_regex(title):
    parts = title.split()
    for part in parts:
        if part.isdigit() and len(part) == 4:
            return int(part)
    return None

courses_df['Course Number'] = courses_df['Course Title'].apply(extract_course_number_without_regex)
filtered_courses_df = courses_df[courses_df['Course Number'] < 5000]
filtered_courses_df = filtered_courses_df.drop(columns=['Course Number'])


In [None]:
def split_course_title(title):
    parts = title.split(" - ", 1)
    if len(parts) == 2:
        return parts[0], parts[1]
    return title, ""  # In case there is no " - " in the title

filtered_courses_df[['Course Name', 'Course Details']] = filtered_courses_df['Course Title'].apply(lambda x: pd.Series(split_course_title(x)))
filtered_courses_df = filtered_courses_df.drop(columns=['Course Title'])


In [None]:
import re

def process_credits(credits):
    repeatable = "Repeatable for credit" in credits
    # Extract all numeric values, including ranges (ie. 1-3 Credits)
    numeric_parts = re.findall(r'\d+', credits)
    # Convert numeric parts to integers and find the maximum value
    numeric_values = [int(part) for part in numeric_parts]
    max_credits = max(numeric_values) if numeric_values else None
    return max_credits, repeatable

filtered_courses_df[['Number of Credits', 'Repeatable']] = filtered_courses_df['Credits'].apply(lambda x: pd.Series(process_credits(x)))
filtered_courses_df['Repeatable'] = filtered_courses_df['Repeatable'].astype(bool)
filtered_courses_df = filtered_courses_df.drop(columns=['Credits'])


In [None]:
filtered_courses_df['Concurrent Enrollment'] = ~filtered_courses_df['Concurrent Enrollment'].isna()

In [None]:
prerequisites_cleaned = filtered_courses_df.query("'Course Name' == 'CSCI 2050'")['Prerequisites']
prerequisites_cleaned


In [None]:
filtered_courses_df.head(50)

In [None]:
out_fileName = 'cleaned_slu_courses.csv'
filtered_courses_df.to_csv(out_fileName, index=False)

In [None]:
courses_df = pd.read_csv('cleaned_slu_courses.csv')

def clean_prerequisites(prereq_str):
    if pd.isnull(prereq_str):
        return None
    # Remove quotation marks
    prereq_str = prereq_str.replace('"', '').replace("'", "")
    return prereq_str

courses_df['Prerequisites'] = courses_df['Prerequisites'].apply(clean_prerequisites)


In [None]:
courses_df = pd.read_csv('cleaned_slu_courses.csv')


# Define a function to clean the prerequisites
def clean_prerequisites(prereq_str):
    if pd.isnull(prereq_str):
        return None
    # Remove quotation marks
    prereq_str = prereq_str.replace('"', '').replace("'", "")
    # Split by comma
    prereq_list = prereq_str.split(',')
    # Strip whitespace and remove duplicates
    prereq_list = list(dict.fromkeys([prereq.strip() for prereq in prereq_list]))
    # Join back into a single string with commas
    return ', '.join(prereq_list)

# Apply the cleaning function to the 'Prerequisites' column
courses_df['Prerequisites'] = courses_df['Prerequisites'].apply(clean_prerequisites)



In [None]:
out_fileName = 'cleaned_slu_courses2.csv'
courses_df.to_csv(out_fileName, index=False)

In [None]:
import numpy as np
import pandas as pd

course = 'CSCI 2100'
courses_df = pd.read_csv('cleaned_slu_courses2.csv')

prerequisites_cleaned = courses_df.query('Course Name' == 'CSCI 2050')['Prerequisites']
prerequisites_cleaned


# # Replace NaN values with an empty string
# courses_df['Prerequisites'] = courses_df['Prerequisites'].replace(np.nan, '')

# # Function to clean and split prerequisites
# def clean_prerequisites(prerequisite_string):
#     if prerequisite_string:
#         # Replace non-breaking spaces with regular spaces
#         cleaned_string = prerequisite_string.replace('\xa0', ' ')
#         # Split the string into a list if there are multiple prerequisites
#         prerequisites_list = [prereq.strip() for prereq in cleaned_string.split(',')]
#         return prerequisites_list
#     return []

# # Apply the function to the 'Prerequisites' column
# courses_df['Prerequisites'] = courses_df['Prerequisites'].apply(clean_prerequisites)

# # Retrieve the cleaned prerequisites for 'CSCI 2100'

# course = 'CSCI 2050'
# # prerequisites_cleaned = courses_df[courses_df['Course Name'] == course]['Prerequisites'].values
# # prerequisites_cleaned = courses_df.loc[courses_df['Course Name'] == course, 'Prerequisites']
# # prerequisites_cleaned = courses_df.query("'Course Name' == 'CSCI 2050'")['Prerequisites']
# prerequisites_cleaned = courses_df[courses_df['Course Name'] == course].groupby('Course Name')['Prerequisites'].apply(list)

# print(prerequisites_cleaned)


In [None]:
import numpy as np
import pandas as pd

course = 'CSCI 2100'
file_path = 'cleaned_slu_courses2.csv'

# Load the CSV file
courses_df = pd.read_csv(file_path)

# Replace NaN values with an empty string
courses_df['Prerequisites'] = courses_df['Prerequisites'].replace(np.nan, '')

# Clean the 'Course Name' column by removing leading/trailing spaces and non-breaking spaces
courses_df['Course Name'] = courses_df['Course Name'].str.strip().str.replace('\xa0', ' ')

# Function to clean and split prerequisites
def clean_prerequisites(prerequisite_string):
    if prerequisite_string:
        # Replace non-breaking spaces with regular spaces
        cleaned_string = prerequisite_string.replace('\xa0', ' ')
        # Split the string into a list if there are multiple prerequisites
        prerequisites_list = [prereq.strip() for prereq in cleaned_string.split(',')]
        return prerequisites_list
    return []

# Apply the function to the 'Prerequisites' column
courses_df['Prerequisites'] = courses_df['Prerequisites'].apply(clean_prerequisites)

# Clean the target course name
cleaned_course = course.strip().replace('\xa0', ' ')

# Retrieve the cleaned prerequisites for the target course
prerequisites_cleaned = courses_df[courses_df['Course Name'] == cleaned_course]['Prerequisites']

# Check if the prerequisites series is empty
if not prerequisites_cleaned.empty:
    prerequisites_list = prerequisites_cleaned.values[0]
    print(prerequisites_list)
else:
    print(f"No prerequisites found for course {cleaned_course}")

In [26]:
import numpy as np
import pandas as pd


file_path = r'C:\Users\guffe\Desktop\Computer_Science\Projects\Optimal-Flowsheets\Projects\CSV_iterations\cleaned_slu_courses2.csv'

df = pd.read_csv(file_path)

print(df.dtypes)
# Print the data type of the 'Prerequisites' column
print(df['Prerequisites'].dtype)
# Print unique values in the 'Prerequisites' column
print(df['Prerequisites'].unique())

# Print the first 10 values in the 'Prerequisites' column
print(df['Prerequisites'].head(10))


Prerequisites            object
Concurrent Enrollment      bool
Major                    object
Course Name              object
Course Details           object
Credits                   int64
Repeatable                 bool
dtype: object
object
[nan 'BIZ\xa01000, BIZ\xa01001' 'ACCT\xa02200, BIZ\xa01002' 'ACCT\xa02200'
 'ACCT\xa02220, BTM\xa02500' 'ACCT\xa03110' 'ACCT\xa04110'
 'ACCT\xa03110, BTM\xa02000' 'ACCT\xa04110, ACCT\xa04250'
 'ACCT\xa03110, BTM\xa02500, BIZ\xa03000'
 'AENG\xa01002, MENG\xa01002, PHYS\xa01610' 'CORE\xa01000, CORE\xa01500'
 'AENG\xa02000, AENG\xa02020' 'AENG\xa03150' 'CSCI\xa01060'
 'MENG\xa03200, MENG\xa02310, MATH\xa03270' 'AENG\xa03230, MATH\xa03270'
 'MENG\xa03110' 'AENG\xa02910' 'AENG\xa03000, AENG\xa04400' 'AENG\xa04004'
 'AENG\xa03100' 'AENG\xa03000, AENG\xa04110' 'AENG\xa03230'
 'AENG\xa03000, AENG\xa03410' 'AENG\xa04400' 'AENG\xa04110' 'AENG\xa03910'
 'AES\xa01010, AES\xa02010' 'PSY\xa01010' 'ANTH\xa02210' 'ANTH\xa01200'
 'SOC\xa01100, SOC\xa01110, SOC\x

In [15]:
import pandas as pd

# Load the CSV file
df = pd.read_csv(r'C:\Users\guffe\Desktop\Computer_Science\Projects\Optimal-Flowsheets\Projects\CSV_iterations\cleaned_slu_courses2.csv')

# Function to clean up and format prerequisites into a PostgreSQL-friendly array format
def format(prereqs):
    if pd.isna(prereqs) or prereqs.strip() == 'False':
        return '{}'  # Use empty array if prerequisites are NaN or False
    # Remove any surrounding quotes and ensure it is enclosed in curly braces
    prereqs = prereqs.replace('"', '').strip()
    if not prereqs.startswith('{'):
        prereqs = '{' + prereqs + '}'
    return prereqs


# Apply the transformations
df['Prerequisites'] = df['Prerequisites'].apply(format)
df['Course Details'] = df['Course Details'].apply(format)

# Save the modified dataframe to a new CSV file
df.to_csv('cleaned_courses3.csv', index=False, quotechar="'")


In [9]:
df.head(25)

Unnamed: 0,Prerequisites,Concurrent Enrollment,Major,Course Name,Course Details,Credits,Repeatable
0,{},False,Academic English Program (AEP),AEP 0100,Writing and Grammar Level 1: Beginner,4,False
1,{},False,Academic English Program (AEP),AEP 0120,Reading and Vocabulary Level 1: Beginner,4,False
2,{},False,Academic English Program (AEP),AEP 0130,Listening and Speaking Level 1: Beginner,4,False
3,{},False,Academic English Program (AEP),AEP 0200,Writing and Grammar Level 2: High Beginner,4,False
4,{},False,Academic English Program (AEP),AEP 0220,Reading and Vocabulary Level 2: High Beginner,4,False
5,{},False,Academic English Program (AEP),AEP 0230,Listening and Speaking Level 2: High Beginner,4,False
6,{},False,Academic English Program (AEP),AEP 0250,English through Service Level 2: High Beginner,1,False
7,{},False,Academic English Program (AEP),AEP 0300,Writing and Grammar Level 3: Intermediate,4,False
8,{},False,Academic English Program (AEP),AEP 0310,Exploring Cultures Level 3: Intermediate,3,False
9,{},False,Academic English Program (AEP),AEP 0320,Reading and Vocabulary Level 3: Intermediate,3,False


In [18]:
import pandas as pd

# Load the CSV file
df = pd.read_csv(r'C:\Users\guffe\Desktop\Computer_Science\Projects\Optimal-Flowsheets\Projects\CSV_iterations\cleaned_slu_courses2.csv', encoding='utf-8')


# Make sure to use the exact column name, adjust 'Course Name' if needed
filtered_df = df[df['Course Name'].str.strip() == 'ACCT 2200']  # str.strip() helps remove any leading/trailing spaces

# Check if the filtered dataframe is empty
if filtered_df.empty:
    print("No data found for ACCT 2200.")
else:
    # Print the prerequisites for the course
    for index, row in filtered_df.iterrows():
        print(f"Course Name: {row['Course Name']}, Prerequisites: {row['Prerequisites']}")


No data found for ACCT 2200.


In [1]:

import pandas as pd

# Load the CSV file
file_path = 'path_to_your_csv_file.csv'
courses_df = pd.read_csv(file_path)

# Convert the 'prerequisites' column into a single string for each row
courses_df['prerequisites'] = courses_df['prerequisites'].apply(lambda x: ';'.join(x) if isinstance(x, list) else x)

# Save the modified DataFrame back to a CSV file
modified_file_path = 'modified_slu_courses.csv'
courses_df.to_csv(modified_file_path, index=False)

print(f"Modified CSV saved to {modified_file_path}")


ModuleNotFoundError: No module named 'pandas'