# Choosing model hyperparameters

In [33]:
# import data reader
import pandas as pd

# import models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import    RandomForestRegressor,\
                                AdaBoostRegressor,\
                                GradientBoostingRegressor

# generating random numbers
from scipy import stats

# hyperparameter tuning functions
from sklearn.model_selection import RandomizedSearchCV, \
                                    cross_val_score, \
                                    cross_val_predict

# serialization
import joblib

Linear regression

In [9]:
help(LinearRegression())

Help on LinearRegression in module sklearn.linear_model._base object:

class LinearRegression(sklearn.base.MultiOutputMixin, sklearn.base.RegressorMixin, LinearModel)
 |  LinearRegression(*, fit_intercept=True, copy_X=True, n_jobs=None, positive=False)
 |  
 |  Ordinary least squares Linear Regression.
 |  
 |  LinearRegression fits a linear model with coefficients w = (w1, ..., wp)
 |  to minimize the residual sum of squares between the observed targets in
 |  the dataset, and the targets predicted by the linear approximation.
 |  
 |  Parameters
 |  ----------
 |  fit_intercept : bool, default=True
 |      Whether to calculate the intercept for this model. If set
 |      to False, no intercept will be used in calculations
 |      (i.e. data is expected to be centered).
 |  
 |  copy_X : bool, default=True
 |      If True, X will be copied; else, it may be overwritten.
 |  
 |  n_jobs : int, default=None
 |      The number of jobs to use for the computation. This will only provide
 | 

Decision Tree

In [10]:
help(DecisionTreeRegressor())

Help on DecisionTreeRegressor in module sklearn.tree._classes object:

class DecisionTreeRegressor(sklearn.base.RegressorMixin, BaseDecisionTree)
 |  DecisionTreeRegressor(*, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, monotonic_cst=None)
 |  
 |  A decision tree regressor.
 |  
 |  Read more in the :ref:`User Guide <tree>`.
 |  
 |  Parameters
 |  ----------
 |  criterion : {"squared_error", "friedman_mse", "absolute_error",             "poisson"}, default="squared_error"
 |      The function to measure the quality of a split. Supported criteria
 |      are "squared_error" for the mean squared error, which is equal to
 |      variance reduction as feature selection criterion and minimizes the L2
 |      loss using the mean of each terminal node, "friedman_mse", which uses
 |      mean squared error w

Random Forest

In [11]:
help(RandomForestRegressor())

Help on RandomForestRegressor in module sklearn.ensemble._forest object:

class RandomForestRegressor(ForestRegressor)
 |  RandomForestRegressor(n_estimators=100, *, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None, monotonic_cst=None)
 |  
 |  A random forest regressor.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  regressors on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  Trees in the forest use the best split strategy, i.e. equivalent to passing
 |  `splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeRegressor`.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstra

AdaBoost Regressor

In [12]:
help(AdaBoostRegressor())

Help on AdaBoostRegressor in module sklearn.ensemble._weight_boosting object:

class AdaBoostRegressor(sklearn.utils._metadata_requests._RoutingNotSupportedMixin, sklearn.base.RegressorMixin, BaseWeightBoosting)
 |  AdaBoostRegressor(estimator=None, *, n_estimators=50, learning_rate=1.0, loss='linear', random_state=None)
 |  
 |  An AdaBoost regressor.
 |  
 |  An AdaBoost [1] regressor is a meta-estimator that begins by fitting a
 |  regressor on the original dataset and then fits additional copies of the
 |  regressor on the same dataset but where the weights of instances are
 |  adjusted according to the error of the current prediction. As such,
 |  subsequent regressors focus more on difficult cases.
 |  
 |  This class implements the algorithm known as AdaBoost.R2 [2].
 |  
 |  Read more in the :ref:`User Guide <adaboost>`.
 |  
 |  .. versionadded:: 0.14
 |  
 |  Parameters
 |  ----------
 |  estimator : object, default=None
 |      The base estimator from which the boosted ensem

GradientBoost Regressor

In [13]:
help(GradientBoostingRegressor)

Help on class GradientBoostingRegressor in module sklearn.ensemble._gb:

class GradientBoostingRegressor(sklearn.base.RegressorMixin, BaseGradientBoosting)
 |  GradientBoostingRegressor(*, loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)
 |  
 |  Gradient Boosting for regression.
 |  
 |  This estimator builds an additive model in a forward stage-wise fashion; it
 |  allows for the optimization of arbitrary differentiable loss functions. In
 |  each stage a regression tree is fit on the negative gradient of the given
 |  loss function.
 |  
 |  :class:`~sklearn.ensemble.HistGradientBoostingRegressor` is a much faster variant
 |  of this algorith

Hyperparameter collections

In [14]:
estimators_configurations = {
    "Linear Regression": {
        "estimator": LinearRegression(),
        "params": {
            "fit_intercept": [True, False]
            }
        },

    "Decision Tree": {
        'estimator': DecisionTreeRegressor(),
        'params': {
            'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            'splitter': ['best','random'],
            'max_features': ['sqrt','log2', None],
            'max_depth': [None, 1, 3, 5, 7]
            }
    },
    
    "Random Forest": {
        'estimator': RandomForestRegressor(),
        'params': {
            'n_estimators': stats.randint(10, 100,),
            'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            'max_features': ['sqrt', 'log2', None],
            'max_depth': [None, 1, 3, 5, 7],
            "bootstrap": [True, False],
            "max_samples": [0.7, 0.8, 0.9, 1.0,]
        }
    },

    "Gradient Boosting": {
        'estimator': GradientBoostingRegressor(),
        'params': {
            'n_estimators': stats.randint(10, 100,),
            'loss': ['squared_error', 'huber', 'absolute_error', 'quantile'],
            'learning_rate': [0.01, 0.05, 0.1, 0.5, 1., 10],
            'subsample': [0.7, 0.8, 0.9, 1.0,],
            'criterion': ['squared_error', 'friedman_mse'],
            'max_features': ['auto','sqrt','log2'],
            'max_depth': [None, 1, 3, 5, 7]
        }
    },

    "AdaBoost Regressor": {
        'estimator': AdaBoostRegressor(),
        'params': {
            'learning_rate': [0.01, 0.05, 0.1, 0.5, 1., 10],
            'loss': ['linear', 'square', 'exponential'],
            'n_estimators': stats.randint(10, 100,)
        }
    },
    }

Iterate over the configurations

In [15]:
for model_name, estimator_config in estimators_configurations.items():
    print("Model name:", model_name)
    print("Model object:", estimator_config["estimator"])
    print("Hyperparameter space:", estimator_config["params"], end="\n\n")

Model name: Linear Regression
Model object: LinearRegression()
Hyperparameter space: {'fit_intercept': [True, False]}

Model name: Decision Tree
Model object: DecisionTreeRegressor()
Hyperparameter space: {'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'], 'splitter': ['best', 'random'], 'max_features': ['sqrt', 'log2', None], 'max_depth': [None, 1, 3, 5, 7]}

Model name: Random Forest
Model object: RandomForestRegressor()
Hyperparameter space: {'n_estimators': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x0000017623B602B0>, 'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'], 'max_features': ['sqrt', 'log2', None], 'max_depth': [None, 1, 3, 5, 7], 'bootstrap': [True, False], 'max_samples': [0.7, 0.8, 0.9, 1.0]}

Model name: Gradient Boosting
Model object: GradientBoostingRegressor()
Hyperparameter space: {'n_estimators': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x00000176239EFA90>, 'loss': ['s

Tune models

In [30]:
# load data
df = pd.read_csv('data/stud.csv')

X = df.drop(columns=['math_score'],axis=1)

y = df['math_score']

X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [31]:
# Data preprocessing
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

X = preprocessor.fit_transform(X)
X.shape

(1000, 19)

In [32]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

In [None]:
help(RandomizedSearchCV)

Help on class RandomizedSearchCV in module sklearn.model_selection._search:

class RandomizedSearchCV(BaseSearchCV)
 |  RandomizedSearchCV(estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=nan, return_train_score=False)
 |  
 |  Randomized search on hyper parameters.
 |  
 |  RandomizedSearchCV implements a "fit" and a "score" method.
 |  It also implements "score_samples", "predict", "predict_proba",
 |  "decision_function", "transform" and "inverse_transform" if they are
 |  implemented in the estimator used.
 |  
 |  The parameters of the estimator used to apply these methods are optimized
 |  by cross-validated search over parameter settings.
 |  
 |  In contrast to GridSearchCV, not all parameter values are tried out, but
 |  rather a fixed number of parameter settings is sampled from the specified
 |  distributions. The number of parameter settings that are tried is
 |  

In [34]:
# perform hyperparameter search
# store results
model_names = []
validation_scores = []
test_scores = []

# tuning model
import warnings
warnings.filterwarnings("ignore")

for i, (estimator_name, estimator_config ) in enumerate(estimators_configurations.items()):
    random_search = RandomizedSearchCV(
        estimator=estimator_config["estimator"],
        param_distributions=estimator_config["params"],
        n_iter=30,
        cv=5,
        random_state=42
        )
    
    random_search.fit(Xtrain_transformed, ytrain)
    
    val_score = random_search.best_score_

    test_score = random_search.score(Xtest_transformed, ytest)

    # append necessary details
    model_names.append(estimator_name)
    validation_scores.append(val_score)
    test_scores.append(test_score)
    

# collect results as dataframe
results = pd.DataFrame(data={"validation_scores": validation_scores, "test_scores": test_scores},
                        index=model_names)


In [35]:
results.head()

Unnamed: 0,validation_scores,test_scores
Linear Regression,0.868589,0.880433
Decision Tree,0.781548,0.79077
Random Forest,0.83837,0.860027
Gradient Boosting,0.845395,0.857815
AdaBoost Regressor,0.828028,0.850557
