In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#for models
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
df =pd.read_csv('data/stud.csv')
df.head()


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
X=df.drop(columns=['math_score'],axis=1)
y=df['math_score']

In [4]:
X.head()


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [5]:
numerical_features=[f for f in df.columns if df[f].dtype!='O']
categorical_features=[f for f in df.columns if df[f].dtype=='O']
numerical_features.remove('math_score')

In [6]:
numeric_transformer=StandardScaler()
ohe_transformer=OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",ohe_transformer,categorical_features),
        ("StandardScaler",numeric_transformer,numerical_features)
    ]
)

In [7]:
X=preprocessor.fit_transform(X)

In [8]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [13]:
def eval_model(actual,predicted):
    mae=mean_absolute_error(actual,predicted)
    mse=mean_squared_error(actual,predicted)
    rmse=np.sqrt(mse)
    r2_scr=r2_score(actual,predicted)
    return mae,mse,rmse,r2_scr

In [15]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list=[]
r2_list=[]

# Loop directly through dict items (cleaner than indexing)
for name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate
    train_mae,train_mse ,train_rmse, train_r2 = eval_model(y_train, y_train_pred)
    test_mae, train_mse,test_rmse, test_r2 = eval_model(y_test, y_test_pred)

    # Store results
    model_list.append(name)
    r2_list.append(test_r2)

    # Print nicely
    print(f"\n{name}")
    print("Training set performance:")
    print(f"- RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}, R²: {train_r2:.4f}")
    
    print("Test set performance:")
    print(f"- RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}, R²: {test_r2:.4f}")
    print("="*40)



Linear Regression
Training set performance:
- RMSE: 5.2972, MAE: 4.2383, R²: 0.8743
Test set performance:
- RMSE: 5.4825, MAE: 4.3379, R²: 0.8778

Lasso
Training set performance:
- RMSE: 6.5515, MAE: 5.1837, R²: 0.8077
Test set performance:
- RMSE: 6.6541, MAE: 5.2217, R²: 0.8200

Ridge
Training set performance:
- RMSE: 5.2976, MAE: 4.2368, R²: 0.8743
Test set performance:
- RMSE: 5.4788, MAE: 4.3354, R²: 0.8780

K-Neighbors Regressor
Training set performance:
- RMSE: 5.7920, MAE: 4.5864, R²: 0.8497
Test set performance:
- RMSE: 7.1327, MAE: 5.5208, R²: 0.7932

Decision Tree
Training set performance:
- RMSE: 0.2887, MAE: 0.0200, R²: 0.9996
Test set performance:
- RMSE: 7.6333, MAE: 6.0600, R²: 0.7632

Random Forest Regressor
Training set performance:
- RMSE: 2.2898, MAE: 1.8230, R²: 0.9765
Test set performance:
- RMSE: 6.0847, MAE: 4.7506, R²: 0.8495

XGBRegressor
Training set performance:
- RMSE: 0.8783, MAE: 0.5772, R²: 0.9965
Test set performance:
- RMSE: 6.3464, MAE: 4.9605, R²: 0