## [1.1] LIBRARIES

In [71]:
#Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Data preprocessing
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV,train_test_split

#Model building
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from catboost import CatBoostRegressor
#Model evaluation
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

#warning library
from warnings import filterwarnings
filterwarnings('ignore')

## [1.2] UTILITY FUNCTION

In [83]:
def print_report(method,y_true,y_pred):
    """Print the MSE, MAE and R2 score of one set of predictions.

    Parameters
    ----------
    method : label identifying the model; echoed in the report header.
    y_true : ground-truth target values.
    y_pred : predicted target values, passed to the metrics together with ``y_true``.

    Returns
    -------
    None. The report is written to stdout only.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    separator = '==============================='
    print(separator)
    print('report of:', method)
    print('The mean squared error is :', mse)
    print('The mean absolute error is :', mae)
    print('The r2 score :', r2)
    print(separator)



## [2] DATA

In [14]:
# Load the student-performance data through a project-relative path instead of a
# machine-specific absolute one, so the notebook is not tied to a single computer.
# NOTE(review): assumes the kernel's working directory is the notebook folder that
# contains data/stud.csv -- adjust DATA_PATH if the notebook is launched elsewhere.
DATA_PATH='data/stud.csv'
df=pd.read_csv(DATA_PATH)

In [18]:
# Peek at five randomly drawn rows to sanity-check the loaded frame.
df.sample(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
620,female,group C,high school,free/reduced,none,35,61,54
494,female,group B,high school,standard,none,54,64,68
357,female,group C,some college,free/reduced,completed,42,66,69
664,male,group D,associate's degree,standard,none,80,63,63
533,female,group E,associate's degree,standard,completed,79,88,94


## [3] PROBLEM STATEMENT

`Given the dataset, we have to predict each student's math score from the other variables.`

## [4] TRAIN TEST SPLIT

In [22]:
# Features are every column except the target; the target is the math score.
# `columns=` already names the axis, so no `axis` argument is needed
# (the previous axis='column' was not a valid pandas axis name).
X=df.drop(columns='math_score')
y=df['math_score']

In [23]:
# First five rows of the feature frame.
X.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [24]:
# First five values of the target series.
y.head(n=5)

0    72
1    69
2    90
3    47
4    76
Name: math_score, dtype: int64

In [59]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

- Let's first identify the numerical and the categorical features in the data.

In [44]:
# Partition the feature columns by dtype, keeping the original column order.
# Selecting on 'number' (rather than comparing each dtype with 'O') keeps
# non-numeric columns stored as `category` or a string extension dtype out of
# the numeric list, where the scaler could not handle them.
num_features=X.select_dtypes(include='number').columns.tolist()
cat_features=X.select_dtypes(exclude='number').columns.tolist()

## [5] FEATURE ENGINEERING

- Let's write the transformation pipelines.


In [52]:
# One-hot encode the categorical columns. handle_unknown='ignore' makes a category
# that was absent from the training split encode as all zeros at transform time
# instead of raising an error.
oh=OneHotEncoder(handle_unknown='ignore')
# Standardise the numeric columns.
std_scaler=StandardScaler()

In [53]:
# Route each group of columns to its own transformer:
# categorical columns -> one-hot encoder, numeric columns -> standard scaler.
transformation_pipeline = ColumnTransformer(
    transformers=[
        ('oh', oh, cat_features),
        ('stdss', std_scaler, num_features),
    ]
)

In [54]:
# Fit the transformers on the training split only.
transformation_pipeline.fit(X=X_train)

In [60]:
# Apply the fitted transformer to both splits.
# NOTE: this overwrites X_train / X_test with the transformed data, so the cell is
# meant to be run once after the train/test split cell.
X_train, X_test = [
    transformation_pipeline.transform(split) for split in (X_train, X_test)
]

## [6] MODELLING

In [85]:
# Candidate regressors, keyed by the label used in the printed reports.
# The decision tree, random forest and AdaBoost estimators are seeded so that
# re-running the notebook reproduces the same metrics.
models={'Knn':KNeighborsRegressor(),
        'LR':LinearRegression(),
        'DTR':DecisionTreeRegressor(random_state=42),
        'Random_FR':RandomForestRegressor(random_state=42),
        'Ridge_reg':Ridge(),
        'Lasso_reg':Lasso(),
        'SVR':SVR(),
        'Adaboost_reg':AdaBoostRegressor(random_state=42),
        'Cat_reg':CatBoostRegressor(verbose=False)
        }

In [90]:

# Fit every candidate model and report its metrics on both splits, so that
# over-fitting shows up as a gap between the training and the test report.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    print('Report on training data')
    print_report(name, y_train, y_train_pred)
    print('Report on test data')
    print_report(name, y_test, y_pred)
    print('\n')

Reprot on training data
reprot of: Knn
The mean squared error is : 32.46085
The mean absolute error is : 4.50425
The r2 score : 0.8560159149035753
Reprot on test data
reprot of: Knn
The mean squared error is : 52.088
The mean absolute error is : 5.587999999999999
The r2 score : 0.7859441830800236


Reprot on training data
reprot of: LR
The mean squared error is : 28.467392578125
The mean absolute error is : 4.27171875
The r2 score : 0.8737293855385155
Reprot on test data
reprot of: LR
The mean squared error is : 29.1508984375
The mean absolute error is : 4.213125
The r2 score : 0.8802042816197526


Reprot on training data
reprot of: DTR
The mean squared error is : 0.078125
The mean absolute error is : 0.01875
The r2 score : 0.9996534669718089
Reprot on test data
reprot of: DTR
The mean squared error is : 64.17
The mean absolute error is : 6.29
The r2 score : 0.7362931621149806


Reprot on training data
reprot of: Random_FR
The mean squared error is : 5.2544102118055545
The mean absolut

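- We will choose a linear model (plain linear regression): in the reports above its test R² (≈0.88) is on par with its training R² (≈0.87), whereas, for example, the decision tree overfits badly (training R² ≈1.00 vs. test R² ≈0.74).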

In [91]:
# Fresh ordinary-least-squares model for the final fit.
lr = LinearRegression()

In [94]:
# Train the final model on the transformed training split.
lr.fit(X_train, y_train)

In [95]:
# Predict math scores for the held-out test split.
y_pred = lr.predict(X_test)

In [99]:
# Report the final model's metrics on the test split.
print_report(method='Linear regression', y_true=y_test, y_pred=y_pred)

reprot of: Linear regression
The mean squared error is : 29.1508984375
The mean absolute error is : 4.213125
The r2 score : 0.8802042816197526
