In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer

In [3]:

#!pip install xgboost

In [4]:
#!pip install scikit-learn


In [5]:
df=pd.read_csv('raw.csv')

In [6]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [8]:
df.describe()

Unnamed: 0,math_score,reading_score,writing_score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [9]:
df.shape

(1000, 8)

In [10]:
df['total']=df['math_score']+df['reading_score']+df['writing_score']

In [11]:
df['Average']=df['total']/3

In [12]:
x=df.iloc[:,:9]
y=df.iloc[:,9]

In [13]:
x.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total
0,female,group B,bachelor's degree,standard,none,72,72,74,218
1,female,group C,some college,standard,completed,69,90,88,247
2,female,group B,master's degree,standard,none,90,95,93,278
3,male,group A,associate's degree,free/reduced,none,47,57,44,148
4,male,group C,some college,standard,none,76,78,75,229


In [14]:
y.head()

0    72.666667
1    82.333333
2    92.666667
3    49.333333
4    76.333333
Name: Average, dtype: float64

In [15]:
numeric=x.select_dtypes('number').columns
category=x.select_dtypes('object').columns

In [16]:
Scaler=StandardScaler()
OH=OneHotEncoder()

preprocessor=ColumnTransformer(
    [
        ("OneHotEncoder",OH,category),
        ("StandardScaler",Scaler,numeric)
    ]
)

In [17]:
x=preprocessor.fit_transform(x)

In [18]:
x

array([[ 1.        ,  0.        ,  0.        , ...,  0.19399858,
         0.39149181,  0.34357423],
       [ 1.        ,  0.        ,  0.        , ...,  1.42747598,
         1.31326868,  1.0219275 ],
       [ 1.        ,  0.        ,  0.        , ...,  1.77010859,
         1.64247471,  1.74706375],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.12547206,
        -0.20107904, -0.19443008],
       [ 1.        ,  0.        ,  0.        , ...,  0.60515772,
         0.58901542,  0.46053169],
       [ 1.        ,  0.        ,  0.        , ...,  1.15336989,
         1.18158627,  1.06871048]], shape=(1000, 21))

In [19]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)

In [20]:
def evaluate_model(true,predict):
    mae=mean_absolute_error(true,predict)
    mse=mean_squared_error(true,predict)
    r2=r2_score(true,predict)
    n=x.shape[0]
    p=x.shape[1]
    ad_r2=1-(1-r2)*(n-1)/(n-p-1)
    rmse=np.sqrt(mean_squared_error(true,predict))
    
    return mae,mse,r2,ad_r2,rmse

In [21]:
lr=LinearRegression()

In [22]:
model=lr.fit(x_train,y_train)

In [23]:
y_pred_lr=model.predict(x_test)

In [24]:
mae,mse,r2,ad_r2,rmse=evaluate_model(y_test,y_pred_lr)

In [25]:
print("mse : ",mse)
print("mae : ",mae)
print("rmse : ",rmse)
print("r2 : ",r2)
print("ad_r2 : ",ad_r2)


mse :  2.7868878059647903e-28
mae :  1.2448708730516955e-14
rmse :  1.6693974379891658e-14
r2 :  1.0
ad_r2 :  1.0


In [26]:
re=Ridge()
model=re.fit(x_train,y_train)
y_pred_lr=model.predict(x_test)
y_pred_train=model.predict(x_train)
mae,mse,r2,ad_r2,rmse=evaluate_model(y_test,y_pred_lr)
t_mae,t_mse,t_r2,t_ad_r2,t_rmse=evaluate_model(y_train,y_pred_train)
print("mse : ",mse)
print("mae : ",mae)
print("rmse : ",rmse)
print("r2 : ",r2)
print("ad_r2 : ",ad_r2)

print("######################################################################")


print("mse : ",t_mse)
print("mae : ",t_mae)
print("rmse : ",t_rmse)
print("r2 : ",t_r2)
print("ad_r2 : ",t_ad_r2)

mse :  4.885230330248393e-05
mae :  0.0054028156056665465
rmse :  0.006989442274064786
r2 :  0.9999997816420086
ad_r2 :  0.9999997769533401
######################################################################
mse :  3.9666518574925646e-05
mae :  0.005094032990440259
rmse :  0.00629813611911696
r2 :  0.9999997964456028
ad_r2 :  0.9999997920748028


In [27]:
le=Lasso()
model=le.fit(x_train,y_train)
y_pred_lr=model.predict(x_test)
y_pred_train=model.predict(x_train)
mae,mse,r2,ad_r2,rmse=evaluate_model(y_test,y_pred_lr)
t_mae,t_mse,t_r2,t_ad_r2,t_rmse=evaluate_model(y_train,y_pred_train)
print("mse : ",mse)
print("mae : ",mae)
print("rmse : ",rmse)
print("r2 : ",r2)
print("ad_r2 : ",ad_r2)

print("######################################################################")


print("mse : ",t_mse)
print("mae : ",t_mae)
print("rmse : ",t_rmse)
print("r2 : ",t_r2)
print("ad_r2 : ",t_ad_r2)

mse :  1.224464653966232
mae :  0.8651700062115358
rmse :  1.1065553099444383
r2 :  0.9945269388692881
ad_r2 :  0.9944094191517575
######################################################################
mse :  1.042072777303299
mae :  0.8196979380621727
rmse :  1.0208196595399694
r2 :  0.9946524549243604
ad_r2 :  0.9945376303368466


In [28]:
le=KNeighborsRegressor(n_neighbors=11)
# paramgrid={
#     'n_neighbors':[3,5,7,9,11,15,19],
#     'weights':['uniform','distance'],
#     'n_jobs':[1]
# }
# cv=KFold(n_splits=5)
# grid=GridSearchCV(le,param_grid=paramgrid,cv=cv,scoring='neg_mean_squared_error')
model=le.fit(x_train,y_train)
#print(model.best_params_)
y_pred_lr=model.predict(x_test)
y_pred_train=model.predict(x_train)
mae,mse,r2,ad_r2,rmse=evaluate_model(y_test,y_pred_lr)
t_mae,t_mse,t_r2,t_ad_r2,t_rmse=evaluate_model(y_train,y_pred_train)
print("mse : ",mse)
print("mae : ",mae)
print("rmse : ",rmse)
print("r2 : ",r2)
print("ad_r2 : ",ad_r2)

print("######################################################################")


print("mse : ",t_mse)
print("mae : ",t_mae)
print("rmse : ",t_rmse)
print("r2 : ",t_r2)
print("ad_r2 : ",t_ad_r2)

mse :  7.789285583103766
mae :  1.884969696969697
rmse :  2.790929161247875
r2 :  0.9651837756011897
ad_r2 :  0.9644361879607245
######################################################################
mse :  3.7425344352617085
mae :  1.4465050505050507
rmse :  1.9345631122456843
r2 :  0.9807946507906227
ad_r2 :  0.9803822659916483


In [30]:
numeric

Index(['math_score', 'reading_score', 'writing_score', 'total'], dtype='object')

In [31]:
category

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')