In [61]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import r2_score,accuracy_score,confusion_matrix,classification_report,mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

In [62]:
df=pd.read_csv("stud.csv")

In [63]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [64]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [65]:
x=df.drop(columns=['math_score'],axis=1)

In [66]:
y=df['math_score']

In [67]:
x

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [68]:
categorical_feature=[column for column in df.columns if df[column].dtype=='O']
numerical_feature=[column for column in x.columns if df[column].dtype!='O']

In [69]:
x,y

(     gender race_ethnicity parental_level_of_education         lunch  \
 0    female        group B           bachelor's degree      standard   
 1    female        group C                some college      standard   
 2    female        group B             master's degree      standard   
 3      male        group A          associate's degree  free/reduced   
 4      male        group C                some college      standard   
 ..      ...            ...                         ...           ...   
 995  female        group E             master's degree      standard   
 996    male        group C                 high school  free/reduced   
 997  female        group C                 high school  free/reduced   
 998  female        group D                some college      standard   
 999  female        group D                some college  free/reduced   
 
     test_preparation_course  reading_score  writing_score  
 0                      none             72             74  


In [70]:
def metrics_evaluation(y_predict,ytest):
    mse=mean_squared_error(ytest,y_predict)
    r2=r2_score(ytest,y_predict)
    mae=mean_absolute_error(ytest,y_predict)
    return mse,r2,mae

In [71]:
models={
    "LinearRegression":LinearRegression(),
    "LogisticRegression":LogisticRegression(),
    "Ridge":Ridge(),
    "SVR":SVR(),
    "NaiveBayes":GaussianNB(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "XGBRegressor":XGBRegressor()
}

In [72]:
ohe=OneHotEncoder()
ss=StandardScaler()
preprocessing=ColumnTransformer(
    [
    ("OneHotEncoder",ohe,categorical_feature),
    ("StandardScaler",ss,numerical_feature)
    ]
)

In [73]:
x=preprocessing.fit_transform(x)

In [74]:
x=pd.DataFrame(data=x,columns=preprocessing.get_feature_names_out())


In [75]:
x.shape

(1000, 19)

In [76]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y)

In [77]:
df.isna().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

In [78]:
x.iloc[750:]

Unnamed: 0,OneHotEncoder__gender_female,OneHotEncoder__gender_male,OneHotEncoder__race_ethnicity_group A,OneHotEncoder__race_ethnicity_group B,OneHotEncoder__race_ethnicity_group C,OneHotEncoder__race_ethnicity_group D,OneHotEncoder__race_ethnicity_group E,OneHotEncoder__parental_level_of_education_associate's degree,OneHotEncoder__parental_level_of_education_bachelor's degree,OneHotEncoder__parental_level_of_education_high school,OneHotEncoder__parental_level_of_education_master's degree,OneHotEncoder__parental_level_of_education_some college,OneHotEncoder__parental_level_of_education_some high school,OneHotEncoder__lunch_free/reduced,OneHotEncoder__lunch_standard,OneHotEncoder__test_preparation_course_completed,OneHotEncoder__test_preparation_course_none,StandardScaler__reading_score,StandardScaler__writing_score
750,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,-0.011581,-0.003555
751,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.193999,-0.201079
752,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.536631,0.457333
753,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.427476,1.115745
754,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.970952,-0.991174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.044215,1.774157
996,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.970952,-0.859491
997,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.125472,-0.201079
998,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.605158,0.589015


In [79]:
for model in models.keys():
    initialize=models[model]
    initialize.fit(xtrain,ytrain)
    print(f"For {model}")
    y_predict_train=initialize.predict(xtrain)
    y_predict_test=initialize.predict(xtest)
    print(f"for train")
    msetr,r2tr,maetr=metrics_evaluation(ytrain,y_predict_train)
    print(f"mean_squared_error for {model} train: {msetr}")
    print(f"r2_score for {model} train: {r2tr}")
    print(f"mean_absolute_error for {model} train: {maetr}")

    print("--------------------------------------------")
    print(f"for test")
    msete,r2te,maete=metrics_evaluation(ytest,y_predict_test)
    print(f"mean_squared_error for {model} test: {msete}")
    print(f"r2_score for {model} test: {r2te}")
    print(f"mean_absolute_error for {model} test: {maete}")
    print("---------------------------------------------")
    print()
    print()




For LinearRegression
for train
mean_squared_error for LinearRegression train: 27.796207356770832
r2_score for LinearRegression train: 0.8628632544728445
mean_absolute_error for LinearRegression train: 4.2145625
--------------------------------------------
for test
mean_squared_error for LinearRegression test: 30.5203505859375
r2_score for LinearRegression test: 0.840474693808369
mean_absolute_error for LinearRegression test: 4.3570625
---------------------------------------------


For LogisticRegression
for train
mean_squared_error for LogisticRegression train: 66.71466666666667
r2_score for LogisticRegression train: 0.6067765538652465
mean_absolute_error for LogisticRegression train: 5.914666666666666
--------------------------------------------
for test
mean_squared_error for LogisticRegression test: 71.708
r2_score for LogisticRegression test: 0.5208554436883404
mean_absolute_error for LogisticRegression test: 6.66
---------------------------------------------


For Ridge
for train