In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [3]:
df = pd.read_csv("data/StudentsPerformance.csv")

In [17]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/150.0 MB 1.6 MB/s eta 0:01:37
   ---------------------------------------- 0.2/150.0 MB 2.1 MB/s eta 0:01:12
   ---------------------------------------- 0.3/150.0 MB 2.2 MB/s eta 0:01:08
   ---------------------------------------- 0.4/150.0 MB 2.0 MB/s eta 0:01:14
   ---------------------------------------- 0.4/150.0 MB 1.9 MB/s eta 0:01:20
   ---------------------------------------- 0.5/150.0 MB 1.9 MB/s eta 0:01:21
   ---------------------------------------- 0.6/150.0 MB 1.9 MB/s eta 0:01:18
   ---------------------------------------- 0.7/150.0 MB 1.9 MB/s eta 0:01:21
   ---------------------------------------- 0.8/150.0 MB 1.9 MB/s eta 0:01:19
   ---------------------------------------- 0.8/150.0 MB 1.8 MB/s eta 0:01:25
 

In [23]:
#Model Learning
from sklearn.metrics import accuracy_score,mean_absolute_error, mean_squared_error , r2_score
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor 
from sklearn.svm import SVR

In [31]:
df = pd.read_csv("data/Student_Performace_EDA.csv")

In [35]:
df.drop("Unnamed: 0",axis=1,inplace=True)

In [37]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,Total_Score,Average_Score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [43]:
#Independent and Dependent Variable
X = df.iloc[:,:-2]
y = df['Average_Score']

In [45]:
num_feature = X.select_dtypes(exclude='object').columns
cat_feature = X.select_dtypes(include='object').columns

In [49]:
ohencoder = OneHotEncoder(drop='first')
standardscaler = StandardScaler()
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder" , ohencoder , cat_feature),
        ("StandardScaler", standardscaler , num_feature)
    ]
)

In [51]:
X = preprocessor.fit_transform(X)

In [53]:
X.shape

(1000, 15)

In [57]:
#Train Test Split
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [59]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((800, 15), (200, 15), (800,), (200,))

In [65]:
def model_evalution(true_value,predicated_value):
    mse= mean_squared_error(true_value,predicated_value)
    r2 = r2_score(true_value,predicated_value)
    return(mse,r2)


In [67]:
models={
    "LinearRegression" : LinearRegression(),
    "RandomForestReg" : RandomForestRegressor(),
    "XGBoostReg" : XGBRegressor(),
    "SVM Reg" : SVR(),
    "AdaBoostReg" : AdaBoostRegressor()
}

for model_name,model_object in models.items():
    model_object.fit(X_train,y_train)
    y_pred_train = model_object.predict(X_train)
    y_pred_test = model_object.predict(X_test)
    mse_test,r2_test = model_evalution(y_test,y_pred_test)
    mse_train,r2_train = model_evalution(y_train,y_pred_train)
    print(f"{model_name} Training mean squared error : {mse_train}")
    print(f"{model_name} Training r2 score : {r2_train}")
    print("-"*30)
    print(f"{model_name} Test mean squared error : {mse_test}")
    print(f"{model_name} Test r2 score : {r2_test}")
    print("*"*50)

LinearRegression Training mean squared error : 2.146963839649506e-28
LinearRegression Training r2 score : 1.0
------------------------------
LinearRegression Test mean squared error : 2.269395052139807e-28
LinearRegression Test r2 score : 1.0
**************************************************
RandomForestReg Training mean squared error : 0.08981093055555447
RandomForestReg Training r2 score : 0.9995529557656812
------------------------------
RandomForestReg Test mean squared error : 1.4488191666666632
RandomForestReg Test r2 score : 0.9931511508694965
**************************************************
XGBoostReg Training mean squared error : 0.00257149715750781
XGBoostReg Training r2 score : 0.9999872000771987
------------------------------
XGBoostReg Test mean squared error : 1.1375532072475005
XGBoostReg Test r2 score : 0.994622565414921
**************************************************
SVM Reg Training mean squared error : 10.244409535646167
SVM Reg Training r2 score : 0.9490072735