In [124]:
import os
import pandas as pd
import glob
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [125]:
# Load every cleaned CSV found under input_directory (sub-folders included)
# and stack them into a single frame with a fresh 0..n-1 index.
input_directory = "csv_cleaned"

# glob returns matches in arbitrary, OS-dependent order. Sorting makes the row
# order of combined_df reproducible, which matters for any order-sensitive
# step applied to it later (for example a groupby "first").
all_files = sorted(
    glob.glob(os.path.join(input_directory, "**/*.csv"), recursive=True)
)

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" when the directory is missing or empty.
if not all_files:
    raise FileNotFoundError(
        f"No CSV files found under {input_directory!r}; nothing to combine."
    )

combined_df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

combined_df.head()

Unnamed: 0,INSTRUCTOR,COURSE,TERM,ENROLLED,RESP RATE,AVG GRADE RECEIVED,AVG HOURS WORKED,STUDENT LEARNING,COURSE STRUCTURE,CLASS ENVIRONMENT
0,Mayra Alejandra Cortes,AWP 10 - Lang&Learning in American Acad,S223,,,3.42,6.25,4.42,4.35,4.51
1,Joseph Albert Bourdeau,AWP 10 - Lang&Learning in American Acad,S223,,,3.82,6.0,4.5,4.53,4.75
2,Celine Joana Khoury,AWP 10 - Lang&Learning in American Acad,S223,,,3.75,5.91,4.52,4.8,4.93
3,Karen Marie Gocsik,AWP 10R - Lang&Learning in American Acad,WI24,11.0,36.36,3.2,5.5,4.56,4.63,4.5
4,Peter W Gilbert,AWP 3 - Analytical Writing,WI24,22.0,77.27,2.93,8.6,4.2,3.9,4.3


In [126]:
# Keep only fully populated rows: a row with a missing value in any column
# is discarded. Surviving rows keep their original index labels.
combined_df = combined_df.dropna(axis=0, how="any")

combined_df.head(20)

Unnamed: 0,INSTRUCTOR,COURSE,TERM,ENROLLED,RESP RATE,AVG GRADE RECEIVED,AVG HOURS WORKED,STUDENT LEARNING,COURSE STRUCTURE,CLASS ENVIRONMENT
3,Karen Marie Gocsik,AWP 10R - Lang&Learning in American Acad,WI24,11.0,36.36,3.2,5.5,4.56,4.63,4.5
4,Peter W Gilbert,AWP 3 - Analytical Writing,WI24,22.0,77.27,2.93,8.6,4.2,3.9,4.3
5,Graham T Hall,AWP 3 - Analytical Writing,WI24,16.0,93.75,3.07,8.56,4.75,4.86,4.94
6,Natalie Ann Wilson,AWP 3 - Analytical Writing,WI24,15.0,93.33,2.52,6.6,4.3,4.37,4.55
7,Melinda G Grant,AWP 3 - Analytical Writing,WI24,16.0,93.75,2.62,8.33,4.5,4.5,4.67
8,Geoffrey Scott West,AWP 3 - Analytical Writing,WI24,9.0,100.0,2.2,6.11,4.63,4.44,4.64
9,Alexandra Sartor,AWP 3 - Analytical Writing,WI24,16.0,87.5,2.66,6.43,4.14,4.29,4.54
10,Schuyler E. Eastin,AWP 3 - Analytical Writing,WI24,20.0,70.0,2.16,4.33,4.42,4.58,4.67
11,Samuel Kolodezh,AWP 3 - Analytical Writing,WI24,21.0,85.71,2.48,8.0,4.38,4.69,4.94
12,William Allan Given,AWP 3 - Analytical Writing,FA23,14.0,85.71,2.67,7.08,4.58,4.63,4.79


In [127]:
def weighted_average(df, values, weights):
    """Return the mean of one column weighted by another column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    values : str
        Label of the column to average.
    weights : str
        Label of the column supplying the per-row weights.

    Returns
    -------
    The sum of ``values * weights`` divided by the sum of ``weights``.
    """
    weight_column = df[weights]
    weighted_total = df[values].mul(weight_column).sum()
    return weighted_total / weight_column.sum()


# Output column name -> source column in combined_df, for every metric that is
# collapsed to an ENROLLED-weighted mean per (INSTRUCTOR, COURSE) pair.
# The dict order fixes the column order of aggregated_df.
ENROLLMENT_WEIGHTED_COLUMNS = {
    "RESP_RATE": "RESP RATE",
    "AVG_GRADE_RECEIVED": "AVG GRADE RECEIVED",
    "AVG_HOURS_WORKED": "AVG HOURS WORKED",
    "STUDENT_LEARNING": "STUDENT LEARNING",
    "COURSE_STRUCTURE": "COURSE STRUCTURE",
    "CLASS_ENVIRONMENT": "CLASS ENVIRONMENT",
}


def _enrollment_weighted(column):
    """Build a groupby aggregator returning the ENROLLED-weighted mean of ``column``.

    A factory is used (rather than a lambda inside the comprehension below) so
    each aggregator captures its own ``column`` value.
    """

    def aggregate(group_values):
        # agg hands over only the values of one column for one group; their
        # index labels are used to fetch the full rows (including ENROLLED)
        # from combined_df. This relies on combined_df having unique labels.
        group_rows = combined_df.loc[group_values.index]
        return weighted_average(group_rows, column, "ENROLLED")

    return aggregate


# One row per (INSTRUCTOR, COURSE): TERM is taken from the first row of the
# group, ENROLLED is totalled, and every other metric goes through the same
# weighted_average helper so all of them treat the weights identically.
aggregated_df = (
    combined_df.groupby(["INSTRUCTOR", "COURSE"])
    .agg(
        TERM=("TERM", "first"),
        ENROLLED=("ENROLLED", "sum"),
        **{
            output_name: (source_name, _enrollment_weighted(source_name))
            for output_name, source_name in ENROLLMENT_WEIGHTED_COLUMNS.items()
        },
    )
    .reset_index()
)

aggregated_df.head(20)

Unnamed: 0,INSTRUCTOR,COURSE,TERM,ENROLLED,RESP_RATE,AVG_GRADE_RECEIVED,AVG_HOURS_WORKED,STUDENT_LEARNING,COURSE_STRUCTURE,CLASS_ENVIRONMENT
0,Aaron B. Coleman,BIBC 102 - Metabolic Biochemistry,FA23,221.0,56.56,3.15,7.22,4.4,4.36,4.54
1,Aaron B. Coleman,BIBC 103 - Biochemical Techniques,WI24,139.0,57.550791,3.191799,6.56223,4.591151,4.398201,4.617338
2,Aaron D Shalev,CSE 123 - Computer Networks,WI24,69.0,28.99,2.63,13.3,4.64,4.34,4.56
3,Aaron D Shalev,CSE 190 - Top/Computer Sci & Engineering,FA23,44.0,18.18,3.56,9.38,4.84,4.38,4.68
4,Aaron Daniel Finley,"HUM 4 - Enlightmnt,Romnt,Rev/1660-1848",S123,47.0,76.6,3.19,7.92,4.31,4.21,4.46
5,Aaron Drews,CENG 170 - Experimental Methods/ChemEng,FA23,68.0,33.82,3.13,7.61,4.31,4.07,4.56
6,Aaron Pollack,MATH 103B - Modern Algebra II,WI24,109.0,29.36,3.27,5.94,4.22,4.18,4.43
7,Aaron Rosengren,MAE 182 - Spacecraft Guidance&Navigation,WI24,22.0,59.09,3.73,7.46,3.9,3.98,4.12
8,Abhishek Saha,MAE 11 - Thermodynamics,FA23,115.0,82.61,3.04,8.47,4.22,4.12,4.42
9,Abigail Leslie Andrews,USP 162 - Migration and the City,FA23,22.0,77.27,3.74,6.18,4.88,4.71,4.65


In [128]:
# --- Composite RATING --------------------------------------------------------
# Columns that are z-scored before the PCA; RESP_RATE is min-max scaled instead.
survey_columns = ["STUDENT_LEARNING", "COURSE_STRUCTURE", "CLASS_ENVIRONMENT"]
factors = aggregated_df[survey_columns + ["RESP_RATE"]]

# Standardise the three survey metrics (zero mean, unit variance).
standard_scaler = StandardScaler()
standard_factors = standard_scaler.fit_transform(factors[survey_columns])

# Stretch RESP_RATE onto the interval [0, 5].
min_max_scaler = MinMaxScaler(feature_range=(0, 5))
scaled_resp_rate = min_max_scaler.fit_transform(factors[["RESP_RATE"]])

# Column order of the PCA input: the three survey metrics, then RESP_RATE.
scaled_factors = np.concatenate([standard_factors, scaled_resp_rate], axis=1)

pca = PCA(n_components=4).fit(scaled_factors)
explained_variance_ratios = pca.explained_variance_ratio_

# The unscaled columns are combined, using the explained-variance ratios as
# weights matched to the columns purely by position.
# NOTE(review): explained_variance_ratio_ holds one value per principal
# component (largest first), not one per input column -- confirm that this
# positional pairing is the intended weighting.
# RESP_RATE is divided by 20; presumably this maps a 0-100 percentage onto the
# same 0-5 range as the survey metrics -- TODO confirm.
rating_terms = [
    aggregated_df["STUDENT_LEARNING"],
    aggregated_df["COURSE_STRUCTURE"],
    aggregated_df["CLASS_ENVIRONMENT"],
    aggregated_df["RESP_RATE"] / 20,
]
weighted_terms = [
    ratio * term for ratio, term in zip(explained_variance_ratios, rating_terms)
]
# Seed the sum with the first term so the additions happen left to right.
aggregated_df["RATING"] = sum(weighted_terms[1:], weighted_terms[0])

aggregated_df.head(20)

Unnamed: 0,INSTRUCTOR,COURSE,TERM,ENROLLED,RESP_RATE,AVG_GRADE_RECEIVED,AVG_HOURS_WORKED,STUDENT_LEARNING,COURSE_STRUCTURE,CLASS_ENVIRONMENT,RATING
0,Aaron B. Coleman,BIBC 102 - Metabolic Biochemistry,FA23,221.0,56.56,3.15,7.22,4.4,4.36,4.54,4.371011
1,Aaron B. Coleman,BIBC 103 - Biochemical Techniques,WI24,139.0,57.550791,3.191799,6.56223,4.591151,4.398201,4.617338,4.536045
2,Aaron D Shalev,CSE 123 - Computer Networks,WI24,69.0,28.99,2.63,13.3,4.64,4.34,4.56,4.537266
3,Aaron D Shalev,CSE 190 - Top/Computer Sci & Engineering,FA23,44.0,18.18,3.56,9.38,4.84,4.38,4.68,4.699995
4,Aaron Daniel Finley,"HUM 4 - Enlightmnt,Romnt,Rev/1660-1848",S123,47.0,76.6,3.19,7.92,4.31,4.21,4.46,4.296931
5,Aaron Drews,CENG 170 - Experimental Methods/ChemEng,FA23,68.0,33.82,3.13,7.61,4.31,4.07,4.56,4.243063
6,Aaron Pollack,MATH 103B - Modern Algebra II,WI24,109.0,29.36,3.27,5.94,4.22,4.18,4.43,4.170778
7,Aaron Rosengren,MAE 182 - Spacecraft Guidance&Navigation,WI24,22.0,59.09,3.73,7.46,3.9,3.98,4.12,3.901465
8,Abhishek Saha,MAE 11 - Thermodynamics,FA23,115.0,82.61,3.04,8.47,4.22,4.12,4.42,4.217495
9,Abigail Leslie Andrews,USP 162 - Migration and the City,FA23,22.0,77.27,3.74,6.18,4.88,4.71,4.65,4.828074


In [148]:
# --- Composite DIFFICULTY ----------------------------------------------------
difficulty_factors = aggregated_df[
    ["AVG_GRADE_RECEIVED", "AVG_HOURS_WORKED", "RESP_RATE"]
]

# Standardise grade and workload (zero mean, unit variance).
standard_scaler = StandardScaler()
standard_difficulty_factors = standard_scaler.fit_transform(
    difficulty_factors[["AVG_GRADE_RECEIVED", "AVG_HOURS_WORKED"]]
)

# Stretch RESP_RATE onto the interval [0, 5].
min_max_scalar = MinMaxScaler(feature_range=(0, 5))
scaled_resp_rate = min_max_scalar.fit_transform(difficulty_factors[["RESP_RATE"]])

scaled_difficulty_factors = np.hstack((standard_difficulty_factors, scaled_resp_rate))

pca = PCA(n_components=3)
pca.fit(scaled_difficulty_factors)

difficulty_explained_variance_ratios = pca.explained_variance_ratio_

# Raw score: the unscaled columns weighted, by position, with the PCA's
# explained-variance ratios. It is kept as a standalone Series so no temporary
# column has to be added to (and removed from) aggregated_df.
# NOTE(review): a higher AVG_GRADE_RECEIVED increases this score -- confirm
# that is the intended direction for a "difficulty" measure.
difficulty_raw = (
    difficulty_explained_variance_ratios[0] * aggregated_df["AVG_GRADE_RECEIVED"]
    + difficulty_explained_variance_ratios[1] * aggregated_df["AVG_HOURS_WORKED"]
    + difficulty_explained_variance_ratios[2] * (aggregated_df["RESP_RATE"] / 20)
).rename("DIFFICULTY_RAW")

# Rescale the raw score to [0, 5]. fit_transform returns an (n, 1) float
# array; ravel() flattens it into a plain float column.
min_max_difficulty = MinMaxScaler(feature_range=(0, 5))
aggregated_df["DIFFICULTY"] = min_max_difficulty.fit_transform(
    difficulty_raw.to_frame()
).ravel()

# errors="ignore": no cell shown here creates DEPARTMENT (or, now,
# DIFFICULTY_RAW), so a plain drop raised KeyError on a fresh
# Restart & Run All and whenever this cell was re-run. Columns left over from
# an earlier session are still removed when present.
aggregated_df.drop(
    columns=["DIFFICULTY_RAW", "DEPARTMENT"], inplace=True, errors="ignore"
)

aggregated_df.head(20)

Unnamed: 0,INSTRUCTOR,COURSE,TERM,ENROLLED,RESP_RATE,AVG_GRADE_RECEIVED,AVG_HOURS_WORKED,STUDENT_LEARNING,COURSE_STRUCTURE,CLASS_ENVIRONMENT,RATING,DIFFICULTY
0,Aaron B. Coleman,BIBC 102 - Metabolic Biochemistry,FA23,221.0,56.56,3.15,7.22,4.4,4.36,4.54,4.371011,1.945485
1,Aaron B. Coleman,BIBC 103 - Biochemical Techniques,WI24,139.0,57.550791,3.191799,6.56223,4.591151,4.398201,4.617338,4.536045,1.775834
2,Aaron D Shalev,CSE 123 - Computer Networks,WI24,69.0,28.99,2.63,13.3,4.64,4.34,4.56,4.537266,3.247769
3,Aaron D Shalev,CSE 190 - Top/Computer Sci & Engineering,FA23,44.0,18.18,3.56,9.38,4.84,4.38,4.68,4.699995,2.544133
4,Aaron Daniel Finley,"HUM 4 - Enlightmnt,Romnt,Rev/1660-1848",S123,47.0,76.6,3.19,7.92,4.31,4.21,4.46,4.296931,2.383677
5,Aaron Drews,CENG 170 - Experimental Methods/ChemEng,FA23,68.0,33.82,3.13,7.61,4.31,4.07,4.56,4.243063,1.839459
6,Aaron Pollack,MATH 103B - Modern Algebra II,WI24,109.0,29.36,3.27,5.94,4.22,4.18,4.43,4.170778,1.36582
7,Aaron Rosengren,MAE 182 - Spacecraft Guidance&Navigation,WI24,22.0,59.09,3.73,7.46,3.9,3.98,4.12,3.901465,2.441961
8,Abhishek Saha,MAE 11 - Thermodynamics,FA23,115.0,82.61,3.04,8.47,4.22,4.12,4.42,4.217495,2.511624
9,Abigail Leslie Andrews,USP 162 - Migration and the City,FA23,22.0,77.27,3.74,6.18,4.88,4.71,4.65,4.828074,2.217006
