In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
%matplotlib inline
pd.options.display.float_format = '{:,}'.format

#reading raw data
data_raw = pd.read_csv('/kaggle/input/student-performance-data-set/student-por.csv')
display(data_raw.head())

# Column-name groups for the dataset.
# Columns treated as categorical (these are the ones encoded later on).
cat_columns = (
    'school sex address famsize Pstatus Mjob Fjob reason guardian '
    'schoolsup famsup paid activities nursery higher internet romantic'
).split()
# Columns grouped as numeric.
num_columns = (
    'Medu Fedu traveltime studytime famrel freetime goout Dalc Walc health'
).split()
# Columns grouped as continuous; includes the grade columns G1, G2 and G3.
cont_columns = 'age failures absences G1 G2 G3'.split()

#### In this kernel I'll fit a regression model on the student-performance-data-set only. To understand the data and its columns, refer to [Student Performance Data Visualization](https://www.kaggle.com/mostafafathy4869/student-performance-data-visualization)

# Data Preprocessing

##### Most machine learning algorithms cannot handle categorical variables unless they are converted to numerical values, and the performance of many algorithms varies depending on how the categorical variables are encoded. [All about Categorical Variable Encoding](https://towardsdatascience.com/all-about-categorical-variable-encoding-305f3361fd02)

##### For this reason I'll be using 3 methods for encoding categorical variables:
* **One Hot Encoding**
* **Label Encoder**
* **Leave One Out Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder,StandardScaler
from category_encoders import LeaveOneOutEncoder

# Separate the target column (G3) from the feature columns.
features = data_raw.drop(columns=['G3'])
target = data_raw['G3'].copy()

# One independent copy of the features per per-column encoding scheme.
loo_df = features.copy()   # leave-one-out encoding
le_df = features.copy()    # label encoding

# Encoders are re-fitted for every categorical column inside the loop.
loo = LeaveOneOutEncoder()
le = LabelEncoder()

# NOTE(review): the leave-one-out encoder is fitted with the target on the
# full dataset, before any train/test split, so target information from rows
# later used for evaluation feeds into these features -- confirm intended.
for col in cat_columns:
    loo_df[col] = loo.fit_transform(loo_df[col], target)
    le_df[col] = le.fit_transform(le_df[col])

# One-hot encoding: expand every categorical column in a single call,
# dropping the first level of each to avoid redundant indicator columns.
dummy_df = pd.get_dummies(features, columns=cat_columns, drop_first=True)

# Model Selection

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from lightgbm import LGBMRegressor

from catboost import CatBoostRegressor

from xgboost import XGBRegressor

In [None]:
def ml(data,target,model,pr=False):
    """Fit a regressor on a train split and score it on the held-out split.

    The data is split with ``train_test_split(..., random_state=0)`` so every
    call with the same ``data``/``target`` uses the same partition.

    Parameters
    ----------
    data : feature table passed to ``train_test_split``.
    target : target values aligned with ``data``.
    model : unfitted scikit-learn compatible regressor. It is cloned before
        fitting, so the caller's instance is left untouched and repeated
        calls with the same instance return independent fitted models.
    pr : when truthy, print the metrics after evaluation.

    Returns
    -------
    tuple
        ``(fitted_model, mae, mse, rmse, r2_square)`` computed on the test split.
    """
    # Local import: the notebook's import cell does not bring `clone` into scope.
    from sklearn.base import clone

    X_train,X_test,y_train,y_test = train_test_split(data, target, random_state=0)

    # Fit a fresh copy; fitting `model` in place would make every result
    # returned for the same instance alias one object (the last fit wins).
    fitted = clone(model)
    fitted.fit(X_train,y_train)
    y_pred = fitted.predict(X_test)

    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time
    r2_square = metrics.r2_score(y_test, y_pred)

    if pr:
        print('MAE:', mae)
        print('MSE:', mse)
        print('RMSE:', rmse)
        print('R2 Square', r2_square)
        print('__________________________________')
    return fitted,mae,mse,rmse,r2_square

# Candidate regressors to compare. Everything uses constructor defaults except
# the two SVR kernels and CatBoost, which is created with verbose=False.
MLA = [
    LinearRegression(),
    *(SVR(kernel=kernel_name) for kernel_name in ('rbf', 'poly')),
    LGBMRegressor(),
    CatBoostRegressor(verbose=False),
    XGBRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
]

# Fitted models keyed by algorithm name, one registry per encoded dataset.
loo_models, le_models, dummy_models = {}, {}, {}

In [None]:
%%time
# Evaluate every candidate algorithm on each of the three encoded datasets and
# collect the metrics into one performance table per dataset.
evaluation_metrics = ['MAE','MSE','RMSE','R2_Square']

# Metric rows keyed by algorithm name; the tables are built once after the
# loop (DataFrame.append was removed in pandas 2.0 and was quadratic anyway).
loo_rows, le_rows, dummy_rows = {}, {}, {}

for alg in MLA:
    _name = alg.__class__.__name__
    if _name == 'SVR':
        # Two SVR variants are compared; tell them apart by kernel.
        _name = f'{_name}_{alg.kernel}'

    for frame, registry, rows in (
        (loo_df, loo_models, loo_rows),
        (le_df, le_models, le_rows),
        (dummy_df, dummy_models, dummy_rows),
    ):
        fitted, mae, mse, rmse, r2_square = ml(frame, target, alg)
        registry[_name] = fitted
        # Pair every metric with its own column; the previous code stored the
        # R2 value in the RMSE column of the leave-one-out table.
        rows[_name] = dict(zip(evaluation_metrics, (mae, mse, rmse, r2_square)))

loo_df_performance = pd.DataFrame.from_dict(loo_rows, orient='index', columns=evaluation_metrics)
le_df_performance = pd.DataFrame.from_dict(le_rows, orient='index', columns=evaluation_metrics)
dummy_df_performance = pd.DataFrame.from_dict(dummy_rows, orient='index', columns=evaluation_metrics)

In [None]:
# Metric used to rank the models in each table (ascending).
sortting = 'MAE'

# Show one ranked performance table per encoded dataset.
for caption, scores in (
    ('Dummies Dataset', dummy_df_performance),
    ('LabelEncoder Dataset', le_df_performance),
    ('LeaveOneOut Encoder Dataset', loo_df_performance),
):
    print(caption)
    display(scores.sort_values(sortting))

In [None]:
#rfr,_,_,_,_ = ml(loo_df,target,RandomForestRegressor())
#plot_data = zip(loo_df.columns,rfr.feature_importances_)
#plot_data = sorted(plot_data, key=lambda x:x[1], reverse=True)
#x = [col for col,val in plot_data[:5]]
#y = [val for col,val in plot_data[:5]]
#plt.figure(figsize=(6,4))
#plt.bar(x,y)

##### RandomForestRegressor comes out on top for each dataset, while XGBRegressor performs best on the LeaveOneOut Encoder dataset. I will choose both algorithms for hyperparameter tuning with Optuna.

# Hyperparameter tuning

In [None]:
import optuna as opt

# XGBRegressor

In [None]:
def objective_xgbr(trial):
    """Optuna objective: test-split RMSE of an XGBRegressor on ``loo_df``.

    Samples ``eta``, ``max_depth`` and ``subsample`` from the trial, fits the
    model on the training part of a fixed split (``random_state=0``) and
    returns the RMSE on the remaining part, which the study minimises.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error on the held-out split.
    """
    # `suggest_float` replaces `suggest_uniform`, deprecated in Optuna 3.
    xgb_params = {
        'eval_metric': 'rmse',
        'eta': trial.suggest_float('eta', 0.05, 0.6),
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        'subsample': trial.suggest_float('subsample', 0.2, 1),
    }

    X_train,X_test,y_train,y_test = train_test_split(loo_df, target, random_state=0)
    model = XGBRegressor(**xgb_params)

    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = float(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    return rmse

In [None]:
%%time
# Minimise the XGBRegressor objective. The sampler is seeded so that re-running
# the notebook explores the same sequence of trials.
study = opt.create_study(direction='minimize', sampler=opt.samplers.TPESampler(seed=0))
study.optimize(objective_xgbr, n_trials=1000)

In [None]:
# Visualise the optimisation history of the XGBRegressor study run above.
opt.visualization.plot_optimization_history(study)

In [None]:
# Hard-coded hyperparameters for the tuned model
# (presumably the best trial of an earlier tuning run -- TODO confirm).
tuned_xgb_params = {
    'eta': 0.27565560438172737,
    'max_depth': 4,
    'subsample': 0.2631374852738718,
}

# Score the tuned and the default model on the same split and print metrics.
for heading, estimator in (
    ('Tuned Model', XGBRegressor(**tuned_xgb_params)),
    ('Defaul Model', XGBRegressor()),
):
    print(heading)
    _ = ml(loo_df, target, estimator, pr=True)

##### We can see a large improvement of about 0.043 for the tuned model, which gives an R2 score of 0.995

# RandomForestRegressor

In [None]:
def objective_rfr(trial):
    """Optuna objective: test-split RMSE of a RandomForestRegressor on ``loo_df``.

    Samples the forest hyperparameters from the trial, fits the model on the
    training part of a fixed split (``random_state=0``) and returns the RMSE
    on the remaining part, which the study minimises.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error on the held-out split.
    """
    rfr_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 800),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'max_features': trial.suggest_int('max_features', 1, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),
    }

    X_train,X_test,y_train,y_test = train_test_split(loo_df, target, random_state=0)
    # Fix the forest's seed so the score depends only on the sampled
    # hyperparameters, not on run-to-run bootstrap noise.
    model = RandomForestRegressor(random_state=0, **rfr_params)

    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = float(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    return rmse

In [None]:
%%time
# Minimise the RandomForestRegressor objective. The sampler is seeded so that
# re-running the notebook explores the same sequence of trials.
study = opt.create_study(direction='minimize', sampler=opt.samplers.TPESampler(seed=0))
study.optimize(objective_rfr, n_trials=1000)

In [None]:
# Visualise the optimisation history of the RandomForestRegressor study run above.
opt.visualization.plot_optimization_history(study)

In [None]:
# Hard-coded RandomForestRegressor hyperparameters used for the tuned model
# (presumably the best trial of an earlier tuning run -- TODO confirm).
best_params = dict(
    n_estimators=68,
    max_depth=55,
    max_features=8,
    min_samples_leaf=3,
    min_samples_split=5,
)

In [None]:
# Score the tuned and the default random forest on the same split and print
# their metrics one after the other.
for heading, estimator in (
    ('Tuned Model', RandomForestRegressor(**best_params)),
    ('Defaul Model', RandomForestRegressor()),
):
    print(heading)
    _ = ml(loo_df, target, estimator, pr=True)

##### We can see a 0.01 increase after tuning the random forest regressor

# Final performance test

##### I'll hold out 200 entries from the dataset to act as a test set and see how the model performs on it

In [None]:
# Hold out 200 rows as the final test set; the remaining rows are for training.
train,test,train_y,test_y = train_test_split(loo_df,target,random_state=0,test_size=200)

xgbr = XGBRegressor(eta=0.27565560438172737,
                     max_depth=4,
                     subsample=0.2631374852738718)

# Fit on the whole training partition. The earlier version split `train` a
# second time and never used the resulting validation part, so the model was
# trained on only a subset of the available training rows.
xgbr.fit(train,train_y)
y_pred = xgbr.predict(test)

# Score the predictions on the 200 held-out rows.
mae = metrics.mean_absolute_error(test_y, y_pred)
mse = metrics.mean_squared_error(test_y, y_pred)
rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time
r2_square = metrics.r2_score(test_y, y_pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print('R2 Square', r2_square)
print('__________________________________')

##### The r2 score drops to 0.97 but it is not that bad, and the model is pretty good for this dataset.

##### Feel free to play around with the dataset and tell me what you think