### Model Building 
- in this notebook we creating the machine learning model 


### 1. Importing the Dependencies 

In [34]:
# import required libraries 
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor , AdaBoostRegressor  
from sklearn.neighbors import KNeighborsRegressor  
from sklearn.linear_model import LinearRegression , Ridge ,Lasso 
from sklearn.metrics import r2_score , confusion_matrix ,mean_absolute_error ,mean_squared_error 
from sklearn.svm import SVR 
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV
from xgboost import XGBRegressor 
from catboost import CatBoostRegressor
import warnings 
warnings.filterwarnings('ignore')


- Importing pandas dataframe

In [19]:
data = pd.read_csv("dataset/StudentsPerformance.csv")
data.sample(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
314,female,group C,bachelor's degree,standard,completed,59,64,75
290,male,group C,associate's degree,standard,none,76,70,68
205,male,group D,some high school,standard,completed,74,71,78
770,male,group B,high school,standard,none,52,48,49
985,male,group A,high school,standard,none,57,51,54


- we are predicting the math_Score  
### Preparing X & Y from data 

In [20]:
X = data.drop('math score',axis=1)

In [21]:
y = data['math score']

In [26]:
# creating the column transformer 
num_features = X.select_dtypes(exclude="O").columns
cat_feature = X.select_dtypes(include="O").columns 


# importing column transformer 
from sklearn.preprocessing import StandardScaler , OneHotEncoder 
from sklearn.compose import ColumnTransformer 

preprocessor = ColumnTransformer(
    [
        ("OneHotencoder" , OneHotEncoder() , cat_feature) , 
        ("standardscaler" , StandardScaler() , num_features)
    ]
)

In [27]:
X = preprocessor.fit_transform(X)

In [29]:
# splitting the data into training and testing set 
x_train ,x_test ,y_train ,y_test = train_test_split(X,y , test_size=0.25 , random_state=43)

In [30]:
x_train.shape , x_test.shape,y_train.shape , y_test.shape

((750, 19), (250, 19), (750,), (250,))

In [39]:
# creating the function which help us to give complete matrix evalution of model 
def see_performance(y_true , y_prediction) : 
    mae = mean_squared_error(y_true , y_prediction)
    mse = mean_absolute_error(y_true , y_prediction) 
    rmse = np.sqrt(mse) 
    r2 = r2_score(y_true , y_prediction) 

    return r2 , mse , mae , rmse
'''
    result = f"""the performance matrix results
            mean squared error : {mse} 
            mean absolute error :{mae} 
            root mean squared error :{rmse}
            r2_score of matrix :{r2}"""'''
    

'\n    result = f"""the performance matrix results\n            mean squared error : {mse} \n            mean absolute error :{mae} \n            root mean squared error :{rmse}\n            r2_score of matrix :{r2}"""'

In [47]:
# model training 
models ={ 
    "linearRegression":LinearRegression() ,
    "Lasso" :Lasso() , 
    "Ridge" : Ridge() , 
    "k-nearest_Neighbor":KNeighborsRegressor() , 
    "Decision tree" : DecisionTreeRegressor() , 
    "Random forest" : RandomForestRegressor() , 
    "xgboost" : XGBRegressor() , 
    "catboost" : CatBoostRegressor(verbose=False) , 
    "adaboost" : AdaBoostRegressor()
}

model_list = [] 
r2_squared_list = [] 

for i in range(len(models)): 
    model = list(models.values())[i] 
    model.fit(x_train , y_train) 


    y_pred = model.predict(x_test) 
    y_pred_train = model.predict(x_train)  
    r2_train , mse_train , mae_train , rmse_train  =  see_performance(y_train , y_pred_train)    
    r2 , mse , mae , rmse =  see_performance(y_test , y_pred) 
    print("--"*30)
    print(f"{list(models.keys())[i]} Regression model ")
    result = f"""the performance matrix results of test datset 
            mean squared error : {mse} 
            mean absolute error :{mae} 
            root mean squared error :{rmse}
            r2_score of matrix :{r2}""" 
    print(result)  
    print("=="*30)
    result = f"""the performance matrix results of train dataset 
            mean squared error : {mse_train} 
            mean absolute error :{mae_train} 
            root mean squared error :{rmse_train}
            r2_score of matrix :{r2_train}""" 
    print(result)
    print("--"*30)

------------------------------------------------------------
linearRegression Regression model 
the performance matrix results of test datset 
            mean squared error : 4.419748526651196 
            mean absolute error :29.90074959115574 
            root mean squared error :2.1023197964751215
            r2_score of matrix :0.8629199185258796
the performance matrix results of train dataset 
            mean squared error : 4.201293223308026 
            mean absolute error :27.982782313871237 
            root mean squared error :2.049705643088301
            r2_score of matrix :0.8799076289062795
------------------------------------------------------------
------------------------------------------------------------
Lasso Regression model 
the performance matrix results of test datset 
            mean squared error : 5.416742320652253 
            mean absolute error :46.35239463233846 
            root mean squared error :2.3273895936547135
            r2_score of matrix :0

In [38]:
r2_squared_list

[]