In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Read the data and one-hot encode the college column. drop_first=True drops
# one college category, which becomes the reference level absorbed by the
# intercept.
clean_data = pd.read_csv('clean_data.csv')
clean_data_dummies = pd.get_dummies(clean_data, columns=["NEU_Colleges"], drop_first=True)

# Single source of truth for the feature order; reused below when printing so
# the names can never drift out of sync with the weight vector.
feature_names = ["Level of Difficulty (Out of 5)", "Would Take Again (Percent)"] + [
    col for col in clean_data_dummies.columns if "NEU_Colleges_" in col
]
X = clean_data_dummies[feature_names].values.astype(float)
y = clean_data_dummies["Average Rating (Out of 5)"].values.astype(float)

# 70/30 train/test split with a fixed seed, then prepend a column of ones so
# the first fitted coefficient is the intercept.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_real = np.c_[np.ones(X_train.shape[0]), X_train]
X_test_real = np.c_[np.ones(X_test.shape[0]), X_test]

# Ordinary least squares. lstsq solves the same problem as the normal
# equations inv(X'X) X'y but without forming an explicit inverse, so it is
# more stable and does not fail when X'X is singular or ill-conditioned.
coefficients, *_ = np.linalg.lstsq(X_train_real, y_train, rcond=None)
intercept = coefficients[0]
weights = coefficients[1:]
y_pred = X_test_real @ coefficients

# Test-set R^2 (1 - SS_res / SS_tot) and mean squared error.
square_sum = np.sum((y_test - np.mean(y_test))**2)
square_sum_resid = np.sum((y_test - y_pred)**2)
r2_score = 1 - (square_sum_resid / square_sum)
mse = np.mean((y_test - y_pred)**2)

# Report the metrics and the fitted parameters.
print(f"Mean Squared Error (MSE): {mse:.4f}")
# Print the r2_score computed above; the previous name (r2_score_numpy) is
# never defined in this cell.
print(f"R² Score: {r2_score:.4f}")
print("Coefficients:")
for name, weight in zip(feature_names, weights):
    print(f"{name}: {weight:.4f}")
print(f"Intercept: {intercept:.4f}")


Mean Squared Error (MSE): 0.3860
R² Score: 0.7278
Coefficients:
Level of Difficulty (Out of 5): -0.1325
Would Take Again (Percent): 2.5786
NEU_Colleges_College of Arts, Media, and Design (CAMD): 0.0305
NEU_Colleges_College of Engineering (COE): 0.0415
NEU_Colleges_College of Professional Studies (CPS): 0.0343
NEU_Colleges_College of Science (COS): -0.0292
NEU_Colleges_College of Social Sciences and Humanities (CSSH): 0.0270
NEU_Colleges_D’Amore-McKim School of Business (DMSB): 0.0585
NEU_Colleges_Khoury College of Computer Sciences: -0.0778
NEU_Colleges_School of Law: -0.1695
Intercept: 2.3520


Here is what the values mean. R² and MSE are the usual goodness-of-fit metrics on the test set. The intercept is the predicted rating when every numeric feature is 0 and the professor belongs to the baseline college (the one dropped by `drop_first=True`); it comes out to about 2.35, which is close to 2.5, which is what we want. For the colleges, each coefficient implies that, with all else equal, being in that college shifts the average rating by that amount relative to the baseline college. For the level of difficulty, a professor's average rating changes by about -0.13 for every extra difficulty level. For the would-take-again percentage, every one-unit increase raises the professor's rating by 2.5786. This is slightly misleading: because the column is stored as a fraction (0 to 1), a one-unit increase is the full jump from 0% to 100%. The effect of a 1% increase is really 0.01 × 2.5786 = 0.025786, not 1 × 2.5786.

In [27]:
def predict_average_rating(difficulty, would_take_again, college, coefficients, baseline_college=None):
    """
    Predicts the average rating based on difficulty, would-take-again percentage, and college.

    The prediction is the intercept plus each feature value times its weight,
    plus the offset of the selected college.

    Parameters:
        difficulty (float): The level of difficulty (out of 5).
        would_take_again (float): The percentage of students willing to take the class again (as a fraction, ex., 0.8 for 80%).
        college (string): The name of the college, either bare
            (e.g., 'Khoury College of Computer Sciences') or already carrying the
            dummy-column prefix (e.g., 'NEU_Colleges_Khoury College of Computer Sciences').
        coefficients (dict): Dictionary of model coefficients, including "Intercept",
            the two numeric feature weights, and one 'NEU_Colleges_<name>' entry per college.
        baseline_college (string, optional): Name of the reference college that has no
            entry in `coefficients` (for example the category dropped by one-hot
            encoding with drop_first=True). It contributes an offset of 0 instead of
            raising. Defaults to None, meaning no baseline is recognised.

    Returns:
        float: Predicted average rating.

    Raises:
        ValueError: If `college` has no coefficient and is not the baseline college.
    """
    prefix = "NEU_Colleges_"

    def _bare(name):
        # Strip the dummy-column prefix so both spellings compare equal.
        return name[len(prefix):] if name.startswith(prefix) else name

    prediction = coefficients["Intercept"]
    prediction += coefficients["Level of Difficulty (Out of 5)"] * difficulty
    prediction += coefficients["Would Take Again (Percent)"] * would_take_again

    # Try the bare-name lookup first, then fall back to a name that already
    # carries the prefix. The prefix check keeps non-college keys such as
    # "Intercept" from being accepted as a college.
    college_key = f"{prefix}{college}"
    if college_key not in coefficients and college.startswith(prefix) and college in coefficients:
        college_key = college

    if college_key in coefficients:
        prediction += coefficients[college_key]
    elif baseline_college is not None and _bare(college) == _bare(baseline_college):
        # Reference category: its effect is already folded into the intercept.
        pass
    else:
        raise ValueError(f"Invalid college name: {college}")

    return prediction


In [29]:
# Build the coefficient lookup from the fitted model (`weights` and
# `intercept` from the regression cell) instead of hand-copying rounded
# numbers from the printed output, so it cannot go stale when the data or the
# fit changes.
# NOTE: this rebinds `coefficients` from the fitted array to a dict keyed by
# feature name, which is the form predict_average_rating expects.
feature_names = ["Level of Difficulty (Out of 5)", "Would Take Again (Percent)"] + [
    col for col in clean_data_dummies.columns if "NEU_Colleges_" in col
]
coefficients = {name: float(weight) for name, weight in zip(feature_names, weights)}
coefficients["Intercept"] = float(intercept)

# Example: a difficulty-4 Khoury course where 85% of students would take it again.
predicted_rating = predict_average_rating(
    difficulty=4.0,
    would_take_again=0.85,
    college="Khoury College of Computer Sciences",
    coefficients=coefficients
)

print(f"Predicted Average Rating: {predicted_rating:.4f}")


Predicted Average Rating: 3.9360
