In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#modeling libraries
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings




In [80]:
df=pd.read_csv('C:\\Users\\admin\\Desktop\\MACHINE_LEARNING_CODES\\ML_PROJECT\\notebook\\data\\StudentsPerformance1.csv')

In [81]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total_score,Average_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [82]:
x=df.drop(columns=["Average_score"],axis=1)

In [83]:
x

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218
1,female,group C,some college,standard,completed,69,90,88,247
2,female,group B,master's degree,standard,none,90,95,93,278
3,male,group A,associate's degree,free/reduced,none,47,57,44,148
4,male,group C,some college,standard,none,76,78,75,229
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,282
996,male,group C,high school,free/reduced,none,62,55,55,172
997,female,group C,high school,free/reduced,completed,59,71,65,195
998,female,group D,some college,standard,completed,68,78,77,223


In [84]:
y=df["Average_score"]

In [85]:
y

0      72.666667
1      82.333333
2      92.666667
3      49.333333
4      76.333333
         ...    
995    94.000000
996    57.333333
997    65.000000
998    74.333333
999    83.000000
Name: Average_score, Length: 1000, dtype: float64

In [86]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total_score,Average_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [87]:
#create Column Transformer with 3 types of transformers
num_features =x.select_dtypes(exclude='object').columns
cat_features = x.select_dtypes(include='object').columns

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
numeric_features=StandardScaler()
oh_transformer=OneHotEncoder()

preprocessor=ColumnTransformer(
[
    ("OneHotEncoder",oh_transformer,cat_features),
    ("StandardScaler",numeric_features,num_features)
]
)

In [88]:
x=preprocessor.fit_transform(x)

In [89]:
x

array([[ 1.        ,  0.        ,  0.        , ...,  0.19399858,
         0.39149181,  0.34357423],
       [ 1.        ,  0.        ,  0.        , ...,  1.42747598,
         1.31326868,  1.0219275 ],
       [ 1.        ,  0.        ,  0.        , ...,  1.77010859,
         1.64247471,  1.74706375],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.12547206,
        -0.20107904, -0.19443008],
       [ 1.        ,  0.        ,  0.        , ...,  0.60515772,
         0.58901542,  0.46053169],
       [ 1.        ,  0.        ,  0.        , ...,  1.15336989,
         1.18158627,  1.06871048]], shape=(1000, 21))

In [90]:
x.shape

(1000, 21)

In [91]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [92]:
x_train.shape,x_test.shape

((700, 21), (300, 21))

In [93]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_score1=r2_score(true,predicted)
    return mae,mse,rmse,r2_score1


In [97]:
models={
"linear regression": LinearRegression(),
"lasso": Lasso(),
"ridge": Ridge(),
"knn": KNeighborsRegressor(),
"dt": DecisionTreeRegressor(),
"rfr": RandomForestRegressor(),
"xg": XGBRegressor(),
"cat": CatBoostRegressor(verbose=False),
"AdaBoost": AdaBoostRegressor()
}

# re-initialize or reset lists for this run
models_list = []
r2_list = []

for model_name, model in models.items():
    try:
        # train model
        model.fit(x_train, y_train)

        # make prediction
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        # Evaluate Train and Test dataset
        model_train_mae, model_train_mse, model_train_rmse, model_train_R2 = evaluate_model(y_train, y_train_pred)
        model_test_mae, model_test_mse, model_test_rmse, model_test_R2 = evaluate_model(y_test, y_test_pred)

        print(model_name)
        models_list.append(model_name)
        r2_list.append(model_test_R2)  # Append the R2 score to the list

        print("MODEL PERFORMANCE FOR TRAINING SET:")
        print(f"MAE: {model_train_mae:.4f}")
        print(f"MSE: {model_train_mse:.4f}")
        print(f"RMSE: {model_train_rmse:.4f}")
        print(f"R2: {model_train_R2:.4f}")

        print("-------------------------------------")

        print("MODEL PERFORMANCE FOR TEST SET:")
        print(f"MAE: {model_test_mae:.4f}")
        print(f"MSE: {model_test_mse:.4f}")
        print(f"RMSE: {model_test_rmse:.4f}")
        print(f"R2: {model_test_R2:.4f}")

        print("*************************************")

    except Exception as e:
        # catch and report any error but continue with other models
        print(f"Error with model '{model_name}': {e}")
        models_list.append(model_name)
        r2_list.append(np.nan)
        print("*************************************")

linear regression
MODEL PERFORMANCE FOR TRAINING SET:
MAE: 0.0000
MSE: 0.0000
RMSE: 0.0000
R2: 1.0000
-------------------------------------
MODEL PERFORMANCE FOR TEST SET:
MAE: 0.0000
MSE: 0.0000
RMSE: 0.0000
R2: 1.0000
*************************************
lasso
MODEL PERFORMANCE FOR TRAINING SET:
MAE: 0.8232
MSE: 1.0562
RMSE: 1.0277
R2: 0.9945
-------------------------------------
MODEL PERFORMANCE FOR TEST SET:
MAE: 0.8870
MSE: 1.2605
RMSE: 1.1227
R2: 0.9944
*************************************
ridge
MODEL PERFORMANCE FOR TRAINING SET:
MAE: 0.0054
MSE: 0.0000
RMSE: 0.0067
R2: 1.0000
-------------------------------------
MODEL PERFORMANCE FOR TEST SET:
MAE: 0.0058
MSE: 0.0001
RMSE: 0.0074
R2: 1.0000
*************************************
knn
MODEL PERFORMANCE FOR TRAINING SET:
MAE: 1.5362
MSE: 3.9919
RMSE: 1.9980
R2: 0.9792
-------------------------------------
MODEL PERFORMANCE FOR TEST SET:
MAE: 1.9300
MSE: 7.2573
RMSE: 2.6939
R2: 0.9678
*************************************
dt
MOD

In [98]:
R2_score_dataframe=pd.DataFrame(list(zip(models_list,r2_list)),columns=["Model Name","R2_Score"]).sort_values(by=["R2_Score"],ascending=False)

In [99]:
R2_score_dataframe


Unnamed: 0,Model Name,R2_Score
0,linear regression,1.0
2,ridge,1.0
5,rfr,0.99679
4,dt,0.996599
6,xg,0.996036
7,cat,0.994527
1,lasso,0.994408
8,AdaBoost,0.992013
3,knn,0.967808
