### Model Training.
#### 1.1 Import Data and Required Packages
##### Importing the Pandas, NumPy, Matplotlib, Seaborn and Warnings libraries.

In [2]:
# Basic Import 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [18]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,GroupKFold,train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

In [19]:
# Load the dataset from the project-relative CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/stud.csv')

In [20]:
# Preview five randomly chosen rows; no seed is passed, so the rows shown
# differ from run to run.
df.sample(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
61,male,group A,some high school,free/reduced,none,39,39,34
733,male,group D,some high school,standard,none,55,47,44
586,female,group A,high school,standard,none,55,73,73
527,female,group C,high school,free/reduced,none,36,53,43
176,female,group B,high school,free/reduced,completed,46,54,58


In [21]:
# Split the frame into the prediction target (math_score) and the
# remaining columns, which serve as the model features.
y = df['math_score']
X = df.drop(columns=['math_score'])

In [22]:
# Partition the feature columns by dtype so each group can be routed to the
# appropriate transformer. Selecting on "numeric vs. everything else" (instead
# of comparing each dtype against 'object') keeps category- or string-typed
# columns in the categorical group rather than sending them to the scaler.
# Column order within each list follows the column order of X.
num_features = X.select_dtypes(include='number').columns.tolist()
cat_features = X.select_dtypes(exclude='number').columns.tolist()

In [23]:
# Column-wise preprocessing: one-hot encode the categorical columns and
# standardise the numeric ones. Any column not listed in either group is
# passed through unchanged.
ohe = OneHotEncoder()
num_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", ohe, cat_features),
        ("StandardScaler", num_transformer, num_features),
    ],
    remainder='passthrough',
)

In [24]:
# Encode and scale the features. The transformer is fed from `df` rather than
# from `X` itself so that this cell can be re-run: once it has executed, `X`
# holds the transformed output and no longer carries the original column
# names that `preprocessor` selects by.
# NOTE(review): the preprocessor is fitted on every row before the train/test
# split, so the fitted statistics include rows that later land in the test
# set. Consider fitting on the training portion only.
X = preprocessor.fit_transform(df.drop('math_score',axis=1))

In [25]:
# Display the dimensions of the transformed feature matrix.
X.shape

(1000, 19)

In [26]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
def evaluate_metrics(true,pred):
    """Score a set of predictions against the ground-truth values.

    Parameters
    ----------
    true : observed target values.
    pred : predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(MAE, MSE, R2, RMSE)`` in that order, where RMSE is the square
        root of the MSE.
    """
    squared_error = mean_squared_error(true, pred)
    return (
        mean_absolute_error(true, pred),
        squared_error,
        r2_score(true, pred),
        np.sqrt(squared_error),
    )
    

In [32]:
# Candidate regressors, keyed by the display name used in the printed report.
# The stochastic scikit-learn estimators are given a fixed seed so their
# scores do not drift between runs of this cell (the train/test split is
# seeded the same way).
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}
# Parallel lists filled in by the loop: model_list[i] is the name of the model
# whose test-set R2 is r2_score_list[i].
model_list = []
r2_score_list = []

for name,model in models.items():
    # Fit on the training split only; the test split is used purely for scoring.
    model.fit(X_train,y_train)

    # Make Prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate metrics (evaluate_metrics returns MAE, MSE, R2, RMSE in that order).
    train_MAE,train_MSE,train_r2,train_RMSE=evaluate_metrics(y_train,y_train_pred)
    test_MAE,test_MSE,test_r2,test_RMSE=evaluate_metrics(y_test,y_test_pred)

    print(name)
    print('Model performance for Training set')
    print("- Mean Squared Error: {:.4f}".format(train_MSE))
    print("- Root Mean Squared Error: {:.4f}".format(train_RMSE))
    print("- Mean Absolute Error: {:.4f}".format(train_MAE))
    print("- R2 Score: {:.4f}".format(train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Mean Squared Error: {:.4f}".format(test_MSE))
    print("- Root Mean Squared Error: {:.4f}".format(test_RMSE))
    print("- Mean Absolute Error: {:.4f}".format(test_MAE))
    print("- R2 Score: {:.4f}".format(test_r2))

    # Record the name alongside its score so the two lists stay aligned.
    model_list.append(name)
    r2_score_list.append(test_r2)

    print('='*35)
    print('\n')
    

Linear Regression
Model performance for Training set
- Mean Squared Error: 28.3808
- Root Mean Squared Error: 5.3274
- Mean Absolute Error: 4.2788
- R2 Score: 0.8741
----------------------------------
Model performance for Test set
- Mean Squared Error: 29.2638
- Root Mean Squared Error: 5.4096
- Mean Absolute Error: 4.2259
- R2 Score: 0.8797


Lasso
Model performance for Training set
- Mean Squared Error: 43.4784
- Root Mean Squared Error: 6.5938
- Mean Absolute Error: 5.2063
- R2 Score: 0.8071
----------------------------------
Model performance for Test set
- Mean Squared Error: 42.5064
- Root Mean Squared Error: 6.5197
- Mean Absolute Error: 5.1579
- R2 Score: 0.8253


Ridge
Model performance for Training set
- Mean Squared Error: 28.3378
- Root Mean Squared Error: 5.3233
- Mean Absolute Error: 4.2650
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Mean Squared Error: 29.0563
- Root Mean Squared Error: 5.3904
- Mean Absolute Error: 4.2111
- R2

In [35]:
# Rank the models by test-set R2, best first. Names come from the `models`
# dict, whose iteration order matches the order the scores were appended in.
(
    pd.DataFrame(
        list(zip(models.keys(), r2_score_list)),
        columns=['Model Name', 'R2_Score'],
    )
    .sort_values(by='R2_Score', ascending=False)
)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.880593
0,Linear Regression,0.87974
8,AdaBoost Regressor,0.854368
5,Random Forest Regressor,0.854141
7,CatBoosting Regressor,0.851632
6,XGBRegressor,0.827797
1,Lasso,0.82532
3,K-Neighbors Regressor,0.783898
4,Decision Tree,0.733622
