In [245]:
# Loasklearn.linear_modelimport pandas as pd
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# Cross validation
from sklearn.model_selection import cross_validate

# Bayes opt
from bayes_opt import BayesianOptimization

# Linear Regression with Lasso
from sklearn.linear_model import Lasso
# Linear Regression with L2
from sklearn.linear_model import Ridge
# Random Forest
from sklearn.ensemble import RandomForestRegressor
# XGBoost
import xgboost as xgb
# LightGBM
import lightgbm as lgbm

# Scoring
from sklearn.metrics import mean_squared_error

In [2]:
train_data  = pd.read_csv("../data/processed/train.csv")

In [3]:
train_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,4,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,2,11
2,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,0,14
3,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,0,13
4,GP,M,16,U,LE3,T,4,3,services,other,...,yes,no,5,4,2,1,2,5,6,13


In [4]:
X_train = train_data.drop(["G3"], axis = 1)
y_train = train_data["G3"]

In [5]:
X_train.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences'],
      dtype='object')

In [7]:
# Defining transformation steps
numeric_transformer = Pipeline(steps=[
    # For standard scaling of data
    ('scaler', StandardScaler())
    ])
categorical_transformer = Pipeline(steps=[
    # OHE for categorical data
    ('onehot', OneHotEncoder(drop = "first"))])

In [41]:
# Identify numerical vs categorical features
categorical_features = X_train.loc[:,("school","sex", "address", "famsize", "Pstatus", "Mjob", "Fjob", "reason", 
                                  "guardian","schoolsup", "famsup", "paid","activities","nursery", "higher", 
                                  "internet","romantic")].columns

numeric_features = X_train.loc[:,("age", "Medu", "Fedu", "traveltime", "studytime", "failures", "famrel", 
                                    "freetime", "goout", "Dalc", "Walc", "health", "absences")].columns


In [42]:
categorical_features

Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
       'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities',
       'nursery', 'higher', 'internet', 'romantic'],
      dtype='object')

In [43]:
numeric_features

Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences'],
      dtype='object')

In [67]:
# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('ohe', OneHotEncoder(drop = "first"), categorical_features)
    ])

In [68]:
# Convert to dataframe
X_train_trans = pd.DataFrame(preprocessor.fit_transform(X_train),
                            index = X_train.index,
                             columns = (list(numeric_features) +
                                       list(preprocessor.named_transformers_['ohe'].get_feature_names(categorical_features))))

# Linear Model Ridge

In [129]:
lmridge_params = {'alpha':(0,100)}

def cv_mse_lmridge(alpha):
    """ 
    Performs cross validation for LM regressor with Ridge regression. To be used for Bayesian optimiser maximizer function.
    
    Parameters
    ----------
    alpha : float
        L2 regularisation constant

    Returns
    -------
    float
        Cross validation score based on negative mean squared error.
        
    """
    estimator = Ridge(alpha)

    # Note that neg_mean_squared_error is opposite, thus a negative sign is added for Minimisation Optimisation via BayesOpt
    return cross_validate(estimator, X_train_trans, y_train, cv = 10, scoring = "neg_root_mean_squared_error")["test_score"].mean()

In [130]:
optimizer_lmridge = BayesianOptimization(cv_mse_lmridge, lmridge_params)
optimizer_lmridge.maximize(n_iter = 20)

|   iter    |  target   |   alpha   |
-------------------------------------
| [0m 1       [0m | [0m-2.735   [0m | [0m 72.45   [0m |
| [0m 2       [0m | [0m-2.746   [0m | [0m 13.06   [0m |
| [0m 3       [0m | [0m-2.737   [0m | [0m 94.96   [0m |
| [95m 4       [0m | [95m-2.735   [0m | [95m 55.57   [0m |
| [0m 5       [0m | [0m-2.739   [0m | [0m 28.27   [0m |
| [95m 6       [0m | [95m-2.735   [0m | [95m 63.32   [0m |
| [0m 7       [0m | [0m-2.735   [0m | [0m 63.32   [0m |
| [95m 8       [0m | [95m-2.735   [0m | [95m 61.72   [0m |
| [0m 9       [0m | [0m-2.735   [0m | [0m 68.61   [0m |
| [0m 10      [0m | [0m-2.735   [0m | [0m 55.83   [0m |
| [0m 11      [0m | [0m-2.735   [0m | [0m 70.74   [0m |
| [0m 12      [0m | [0m-2.735   [0m | [0m 56.32   [0m |
| [0m 13      [0m | [0m-2.735   [0m | [0m 70.37   [0m |
| [0m 14      [0m | [0m-2.735   [0m | [0m 56.45   [0m |
| [0m 15      [0m | [0m-2.735   [0m | [0m 

In [131]:
optimizer_lmridge.max

{'target': -2.73504227729081, 'params': {'alpha': 61.716008089193764}}

# Linear Model Lasso

In [132]:
lmlasso_params = {'alpha':(0,100)}

def cv_mse_lmlasso(alpha):
    """ 
    Performs cross validation for LM regressor with Lasso regression. To be used for Bayesian optimiser maximizer function.
    
    Parameters
    ----------
    alpha : float
        L1 regularisation constant

    Returns
    -------
    float
        Cross validation score based on negative mean squared error.
        
    """
    estimator = Lasso(alpha)

    # Note that neg_mean_squared_error is opposite, thus a negative sign is added for Minimisation Optimisation via BayesOpt
    return cross_validate(estimator, X_train_trans, y_train, cv = 10, scoring = "neg_root_mean_squared_error")["test_score"].mean()

In [133]:
optimizer_lmlasso = BayesianOptimization(cv_mse_lmlasso, lmlasso_params, random_state = 1)
optimizer_lmlasso.maximize(n_iter = 20)

|   iter    |  target   |   alpha   |
-------------------------------------
| [0m 1       [0m | [0m-3.2     [0m | [0m 70.12   [0m |
| [0m 2       [0m | [0m-3.2     [0m | [0m 85.44   [0m |
| [0m 3       [0m | [0m-3.2     [0m | [0m 31.35   [0m |
| [0m 4       [0m | [0m-3.2     [0m | [0m 46.36   [0m |
| [0m 5       [0m | [0m-3.2     [0m | [0m 62.58   [0m |
| [95m 6       [0m | [95m-2.757   [0m | [95m 0.003969[0m |


  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)


| [0m 7       [0m | [0m-2.77    [0m | [0m 0.0     [0m |
| [0m 8       [0m | [0m-2.98    [0m | [0m 0.638   [0m |
| [0m 9       [0m | [0m-3.2     [0m | [0m 19.45   [0m |
| [0m 10      [0m | [0m-3.2     [0m | [0m 93.82   [0m |
| [0m 11      [0m | [0m-3.2     [0m | [0m 54.56   [0m |
| [0m 12      [0m | [0m-3.2     [0m | [0m 38.55   [0m |
| [0m 13      [0m | [0m-3.2     [0m | [0m 9.394   [0m |
| [0m 14      [0m | [0m-3.2     [0m | [0m 78.23   [0m |
| [0m 15      [0m | [0m-3.2     [0m | [0m 100.0   [0m |
| [0m 16      [0m | [0m-3.2     [0m | [0m 25.4    [0m |
| [0m 17      [0m | [0m-3.2     [0m | [0m 14.42   [0m |
| [0m 18      [0m | [0m-3.2     [0m | [0m 5.097   [0m |
| [0m 19      [0m | [0m-3.2     [0m | [0m 89.63   [0m |
| [0m 20      [0m | [0m-3.2     [0m | [0m 50.46   [0m |
| [0m 21      [0m | [0m-3.2     [0m | [0m 74.17   [0m |
| [0m 22      [0m | [0m-3.2     [0m | [0m 58.57   [0m |
| [0m 2

# Random Forest

In [134]:
# SKLearn Random Forest
rf_params = {'n_estimators':(10,150), 'max_depth':(10,200), 'max_features':(2, 30)}

def cv_mse_rf(n_estimators,max_depth, max_features):
    """ 
    Performs cross validation for Random Forest Regressor. To be used for Bayesian optimiser maximizer function.
    
    Parameters
    ----------
    n_estimators : float
        Number of estimators for random forest
    max_depth : float
        Max depth of trees in random forest
    max_features : float
        Max number of features in random forest

    Returns
    -------
    float
        Cross validation score based on negative mean squared error.
        
    """
    # Convert chosen hyperparams to discrete integer
    max_depth = int(max_depth)
    max_features = int(max_features)
    n_estimators = int(n_estimators)
    
    estimator = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features)

    # Note that neg_mean_squared_error is opposite, thus a negative sign is added for Minimisation Optimisation via BayesOpt
    return cross_validate(estimator, X_train_trans, y_train, cv = 10, scoring = "neg_root_mean_squared_error")["test_score"].mean()

In [135]:
optimizer_rf = BayesianOptimization(cv_mse_rf, rf_params)
optimizer_rf.maximize(n_iter = 60)

|   iter    |  target   | max_depth | max_fe... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m-2.714   [0m | [0m 14.78   [0m | [0m 21.4    [0m | [0m 41.23   [0m |
| [0m 2       [0m | [0m-2.717   [0m | [0m 15.92   [0m | [0m 20.51   [0m | [0m 36.09   [0m |
| [95m 3       [0m | [95m-2.688   [0m | [95m 59.09   [0m | [95m 11.31   [0m | [95m 65.41   [0m |
| [0m 4       [0m | [0m-2.711   [0m | [0m 72.22   [0m | [0m 24.64   [0m | [0m 80.11   [0m |
| [0m 5       [0m | [0m-2.693   [0m | [0m 88.84   [0m | [0m 19.55   [0m | [0m 62.83   [0m |
| [0m 6       [0m | [0m-2.729   [0m | [0m 11.14   [0m | [0m 2.375   [0m | [0m 149.3   [0m |
| [0m 7       [0m | [0m-2.704   [0m | [0m 99.85   [0m | [0m 2.073   [0m | [0m 148.1   [0m |
| [0m 8       [0m | [0m-2.837   [0m | [0m 97.81   [0m | [0m 2.188   [0m | [0m 12.58   [0m |
| [0m 9       [0m | [0m-2.748   [0m | [0m 100.0   

# XGBoost

In [136]:
# SKLearn Random Forest
xgb_params = {'n_estimators':(10, 150), 'max_depth':(10, 200), 'learning_rate':(0, 1),
              'subsample':(0, 1), 'gamma':(0, 50), 'reg_alpha':(0, 100), 'reg_lambda':(0, 100)}

def cv_mse_xgb(n_estimators, max_depth, learning_rate, subsample, gamma, reg_alpha, reg_lambda):
    """ 
    Performs cross validation for Random Forest Regressor. To be used for Bayesian optimiser maximizer function.
    
    Parameters
    ----------
    n_estimators : float
        Number of estimators
    max_depth : float
        Max depth of trees
    learning_rate : float
        Learning rate
    subsample : float
        Subsample ratio of training instances 
    gamma : float
        Min loss reduction to make further partition on leaf node   
    reg_alpha : float
        L1 regularisation
    reg_lambda : float
        L2 regularisation

    Returns
    -------
    float
        Cross validation score based on negative mean squared error.
        
    """
    # Convert chosen hyperparams to discrete integer
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)
    
    estimator = xgb.XGBRegressor(objective='reg:squarederror',
                                 n_estimators = n_estimators, 
                                 max_depth = max_depth, 
                                 learning_rate = learning_rate, 
                                 subsample = subsample,
                                 gamma = gamma, 
                                 reg_alpha = reg_alpha, 
                                 reg_lambda = reg_lambda)

    # Note that neg_mean_squared_error is opposite, thus a negative sign is added for Minimisation Optimisation via BayesOpt
    return cross_validate(estimator, X_train_trans, y_train, cv = 10, scoring = "neg_root_mean_squared_error")["test_score"].mean()

In [141]:
# Warnings due to some current issue with xgboost incompatibility with pandas deprecation
# Fix will be for upcoming xgboost version 1.0.0, but latest version is only 0.90
# See https://github.com/dmlc/xgboost/issues/4300
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Fixed random state due to xgboost hyper param combination throwing a result for NaN
optimizer_xgb = BayesianOptimization(cv_mse_xgb, xgb_params, random_state = 1)
optimizer_xgb.maximize(n_iter = 100)

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.909   [0m | [0m 20.85   [0m | [0m 0.7203  [0m | [0m 10.01   [0m | [0m 52.33   [0m | [0m 14.68   [0m | [0m 9.234   [0m | [0m 0.1863  [0m |
| [95m 2       [0m | [95m-2.729   [0m | [95m 17.28   [0m | [95m 0.3968  [0m | [95m 58.49   [0m | [95m 68.69   [0m | [95m 68.52   [0m | [95m 20.45   [0m | [95m 0.8781  [0m |
| [0m 3       [0m | [0m-2.961   [0m | [0m 1.369   [0m | [0m 0.6705  [0m | [0m 47.56   [0m | [0m 88.22   [0m | [0m 14.04   [0m | [0m 19.81   [0m | [0m 0.8007  [0m |
| [0m 4       [0m | [0m-4.359   [0m | [0m 48.41   [0m | [0m 0.3134  [0m | [0m 72.31   [0m | [0m 132.7   [0m | [0m 89.46   [0m | [0m 8.504   [0m | [0m 0.03905 [0m |
| [95m 5       [0m | [95m-2.728   [0m |

# LightGBM


In [217]:
round(1.23, 3)

1.23

In [226]:
# SKLearn Random Forest
lgbm_params = {'n_estimators':(10, 150), 'max_depth':(10, 200), 'learning_rate':(0.001, 1),
               'reg_alpha':(0, 100), 'reg_lambda':(0, 100)}

def cv_mse_lgbm(n_estimators, max_depth, learning_rate, reg_alpha, reg_lambda):
    """ 
    Performs cross validation for Random Forest Regressor. To be used for Bayesian optimiser maximizer function.
    
    Parameters
    ----------
    n_estimators : float
        Number of estimators
    max_depth : float
        Max depth of trees
    learning_rate : float
        Learning rate
    reg_alpha : float
        L1 regularisation
    reg_lambda : float
        L2 regularisation

    Returns
    -------
    float
        Cross validation score based on negative mean squared error.
        
    """
    # Convert chosen hyperparams to discrete integer
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)
    
    estimator = lgbm.LGBMRegressor(n_estimators = n_estimators, 
                                 max_depth = max_depth, 
                                 learning_rate = learning_rate, 
                                 reg_alpha = reg_alpha, 
                                 reg_lambda = reg_lambda)
    
    # Note that neg_mean_squared_error is opposite, thus a negative sign is added for Minimisation Optimisation via BayesOpt
    return cross_validate(estimator, X_train_trans, y_train, cv = 10, scoring = "neg_root_mean_squared_error")["test_score"].mean()

In [227]:
# Fixed random state due to xgboost hyper param combination throwing a result for NaN
optimizer_lgbm = BayesianOptimization(cv_mse_lgbm, lgbm_params, random_state = 1)
optimizer_lgbm.maximize(n_iter = 100)

|   iter    |  target   | learni... | max_depth | n_esti... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.727   [0m | [0m 0.4176  [0m | [0m 74.83   [0m | [0m 10.02   [0m | [0m 30.23   [0m | [0m 14.68   [0m |
| [0m 2       [0m | [0m-2.749   [0m | [0m 0.09325 [0m | [0m 26.76   [0m | [0m 58.38   [0m | [0m 39.68   [0m | [0m 53.88   [0m |
| [0m 3       [0m | [0m-2.809   [0m | [0m 0.4198  [0m | [0m 71.67   [0m | [0m 38.62   [0m | [0m 87.81   [0m | [0m 2.739   [0m |
| [0m 4       [0m | [0m-2.802   [0m | [0m 0.6708  [0m | [0m 47.56   [0m | [0m 88.22   [0m | [0m 14.04   [0m | [0m 19.81   [0m |
| [0m 5       [0m | [0m-2.79    [0m | [0m 0.8009  [0m | [0m 97.14   [0m | [0m 53.88   [0m | [0m 69.23   [0m | [0m 87.64   [0m |
| [0m 6       [0m | [0m-2.769   [0m | [0m 1.0     [0m | [0m 10.0    [0m | [0m 10.0    [0m | [0m 4.7e-11 [0

In [229]:
# Compare the best models
cv_rmse = [-optimizer_lmlasso.max['target'], 
             -optimizer_lmridge.max['target'], 
             -optimizer_rf.max['target'],
             -optimizer_xgb.max['target'],
             -optimizer_lgbm.max['target']]

models = ["lm_lasso", "lm_ridge", "randomforest", "xgb", "lgbm"]

cv_df = pd.DataFrame(cv_rmse, index = models, columns = ["cv_score"])

cv_df

Unnamed: 0,cv_score
lm_lasso,2.756544
lm_ridge,2.735042
randomforest,2.627075
xgb,2.727759
lgbm,2.726576


In [265]:
# Adjusting discrete hyperparam for certain models
rf_hyperparam = optimizer_rf.max['params']
rf_hyperparam['max_depth'] = int(rf_hyperparam['max_depth'])
rf_hyperparam['max_features'] = int(rf_hyperparam['max_features'])
rf_hyperparam['n_estimators'] = int(rf_hyperparam['n_estimators'])

xgb_hyperparam = optimizer_xgb.max['params']
xgb_hyperparam['max_depth'] = int(xgb_hyperparam['max_depth'])
xgb_hyperparam['n_estimators'] = int(xgb_hyperparam['n_estimators'])

lgbm_hyperparam = optimizer_lgbm.max['params']
lgbm_hyperparam['max_depth'] = int(lgbm_hyperparam['max_depth'])
lgbm_hyperparam['n_estimators'] = int(lgbm_hyperparam['n_estimators'])

In [236]:
# Store as Series for writing to csv
lmlasso_hyperparam = pd.Series(optimizer_lmlasso.max['params'])

lmridge_hyperparam = pd.Series(optimizer_lmridge.max['params'])

rf_hyperparam = pd.Series(rf_hyperparam)

xgb_hyperparam = pd.Series(xgb_hyperparam)

lgbm_hyperparam = pd.Series(lgbm_hyperparam)

In [237]:
# Output to csv
cv_df.to_csv("../data/output/cv_results.csv")

lmlasso_hyperparam.to_csv("../data/output/lmlasso_hyperparam.csv")
lmridge_hyperparam.to_csv("../data/output/lmridge_hyperparam.csv")
rf_hyperparam.to_csv("../data/output/rf_hyperparam.csv")
xgb_hyperparam.to_csv("../data/output/xgb_hyperparam.csv")
lgbm_hyperparam.to_csv("../data/output/lgbm_hyperparam.csv")

# Test Data

In [238]:
# Test set
test_data  = pd.read_csv("../data/processed/test.csv")

X_test = test_data.drop(["G3"], axis = 1)
y_test = test_data["G3"]

In [252]:
# Convert to dataframe
X_test_trans = pd.DataFrame(preprocessor.fit_transform(X_test),
                            index = X_test.index,
                             columns = (list(numeric_features) +
                                       list(preprocessor.named_transformers_['ohe'].get_feature_names(categorical_features))))

In [260]:
best_lasso = Lasso().set_params(**lmlasso_hyperparam)
best_lasso.fit(X_train_trans, y_train)

Lasso(alpha=0.0039689085736815954, copy_X=True, fit_intercept=True,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [261]:
best_ridge = Ridge().set_params(**lmridge_hyperparam)
best_ridge.fit(X_train_trans, y_train)

Ridge(alpha=61.716008089193764, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [262]:
best_rf = RandomForestRegressor().set_params(**rf_hyperparam)
best_rf.fit(X_train_trans, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=63, max_features=7, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=114, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [269]:
best_xgb = XGBRegressor().set_params(**xgb_hyperparam)
best_xgb.fit(X_train_trans, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=8.491520978228445,
             importance_type='gain', learning_rate=0.8781425034294131,
             max_delta_step=0, max_depth=18, min_child_weight=1, missing=None,
             n_estimators=68, n_jobs=1, nthread=None, objective='reg:linear',
             random_state=0, reg_alpha=95.7889530150502,
             reg_lambda=53.316528497301704, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.6918771139504734, verbosity=1)

In [270]:
best_lgbm = lgbm.LGBMRegressor().set_params(**lgbm_hyperparam)
best_lgbm.fit(X_train_trans, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.41760498269787144,
              max_depth=74, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=10, n_jobs=-1, num_leaves=31,
              objective=None, random_state=None, reg_alpha=30.233257263183976,
              reg_lambda=14.675589081711305, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

# Test Data Prediction

In [280]:
test_rmse = []
test_rmse.append(np.sqrt(mean_squared_error(y_test, best_lasso.predict(X_test_trans))))
test_rmse.append(np.sqrt(mean_squared_error(y_test, best_ridge.predict(X_test_trans))))
test_rmse.append(np.sqrt(mean_squared_error(y_test, best_rf.predict(X_test_trans))))
test_rmse.append(np.sqrt(mean_squared_error(y_test, best_xgb.predict(X_test_trans))))
test_rmse.append(np.sqrt(mean_squared_error(y_test, best_lgbm.predict(X_test_trans))))

# Convert to Dataframe
test_rmse = pd.DataFrame(test_rmse, index= models, columns = ["test_rmse"])

In [281]:
test_rmse

Unnamed: 0,test_rmse
lm_lasso,2.483822
lm_ridge,2.481183
randomforest,2.458307
xgb,2.510746
lgbm,2.57754


In [282]:
np.mean(y_test)

12.062015503875969

In [296]:
# Best model
print(test_rmse.idxmin())

test_rmse    randomforest
dtype: object


In [306]:
feat_importance = pd.DataFrame(best_rf.feature_importances_, index = X_train_trans.columns, columns = ["Importance"])
feat_importance = feat_importance.sort_values(by = "Importance", ascending = False).reset_index()

In [307]:
feat_importance

Unnamed: 0,index,Importance
0,failures,0.097314
1,absences,0.055374
2,school_MS,0.051027
3,age,0.049253
4,goout,0.047025
5,Walc,0.046958
6,Medu,0.046369
7,higher_yes,0.045723
8,freetime,0.0386
9,studytime,0.038511


In [314]:
# https://github.com/nipunbatra/50-ggplot-python/blob/master/Altair/DivergingLollipop.ipynb
import altair as alt
from altair import *
c1 = alt.Chart(feat_importance).mark_bar(color='black').encode(
    y=alt.Y('index', sort=SortField(op='mean', order='descending', field='Importance')),
    x=alt.X('Importance' )
)

SchemaValidationError: Invalid specification

        altair.vegalite.v3.schema.core.SortField, validating 'additionalProperties'

        Additional properties are not allowed ('op' was unexpected)
        

In [None]:
c2 = Chart(mtcars).mark_circle(color='black', size=400).encode(y=Y('model', sort=SortField(op='mean', order='descending', field='mpg_z')),
                                x=X('mpg_z' ), text='mpg_z').transform_data(
   calculate=[Formula('Performance', expr.where(expr.df.mpg_z < mean_mpg_z,'Below average','Above average'))],
   
).configure_cell(height=100, width=400).configure_scale(bandSize=22)