In [5]:
# required libraries
# import scikit-learn for machine learning
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet

# GridSearchCV: to find the best hyperparameters
from sklearn.model_selection import GridSearchCV

# Support Vector Regression
from sklearn.svm import SVR

# packages for data standardization
from sklearn.preprocessing import StandardScaler

# suppress ALL warnings (note: this also hides scikit-learn deprecation and convergence warnings)
import warnings

warnings.filterwarnings('ignore')

import time

In [None]:
# Linear regression
# Help: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

def linear_regressor(x, y, fit_intercept, normalize, accuracy_metric, cv=5):
    """ This function tunes a Linear regression model with a cross-validated grid search
    Args:
        x (DataFrame): training feature variables
        y (DataFrame): training target variable
        fit_intercept (list of bool): candidate values for `fit_intercept`
            (whether to calculate the intercept for this model)
        normalize (list of bool): candidate values for `normalize`. It is only added to the
            search grid when the installed LinearRegression still exposes that parameter
            (scikit-learn removed it in version 1.2); otherwise only False values are accepted
        accuracy_metric: scoring passed to GridSearchCV to compare the cross-validation splits
        cv (int): value of k in k-fold cross-validation
    Returns:
        grid_result: result of GridSearchCV.fit after hyper-parameter tuning and cross-validation
    Raises:
        ValueError: if normalization is requested but the estimator has no `normalize` parameter
    """

    start = time.time()

    # instantiate the Linear Regression model (it has no random_state to fix)
    model = LinearRegression()

    # Hyperparameters
    parameters = {'fit_intercept': fit_intercept}

    if 'normalize' in model.get_params():
        # older scikit-learn: the estimator still understands `normalize`
        parameters['normalize'] = normalize
    else:
        # newer scikit-learn: putting `normalize` in the grid would make the search fail with
        # an "invalid parameter" error, so it is dropped when nothing but False is requested
        try:
            requested = list(normalize)
        except TypeError:
            requested = [normalize]
        if any(requested):
            raise ValueError(
                "The installed scikit-learn LinearRegression has no 'normalize' parameter "
                "(removed in scikit-learn 1.2); scale the features beforehand, e.g. with "
                "StandardScaler, and pass normalize=[False]."
            )

    # GridSearchCV: to find the best hyperparameters based on the scoring method
    grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring=accuracy_metric, cv=cv)

    # fit the model with the best hyperparameters
    grid_result = grid_search.fit(x, y)

    end = time.time()
    # elapsed time is reported in minutes
    print("time = ", (end-start)/60)

    # return the model to proceed into prediction
    return grid_result

In [None]:
# ElasticNet
# Help: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html

def elasticnet_regressor(x, y, alpha, l1_ratio, fit_intercept, normalize, accuracy_metric, cv=5):
    """ This function tunes an ElasticNet regression model with a cross-validated grid search
    Args:
        x (DataFrame): training feature variables
        y (DataFrame): training target variable
        alpha (list of float): candidate values for ElasticNet's `alpha` parameter
        l1_ratio (list of float): candidate values for ElasticNet's `l1_ratio` parameter
        fit_intercept (list of bool): candidate values for `fit_intercept`
            (whether to calculate the intercept for this model)
        normalize (list of bool): candidate values for `normalize`. It is only added to the
            search grid when the installed ElasticNet still exposes that parameter
            (scikit-learn removed it in version 1.2); otherwise only False values are accepted
        accuracy_metric: scoring passed to GridSearchCV to compare the cross-validation splits
        cv (int): value of k in k-fold cross-validation
    Returns:
        grid_result: result of GridSearchCV.fit after hyper-parameter tuning and cross-validation
    Raises:
        ValueError: if normalization is requested but the estimator has no `normalize` parameter
    """

    start = time.time()

    # instantiate the ElasticNet model
    # random state (int): Controls the randomness of the estimator for reproducibility
    model = ElasticNet(random_state=42)

    # Hyperparameters
    parameters = {'alpha': alpha,
                  'l1_ratio': l1_ratio,
                  'fit_intercept': fit_intercept}

    if 'normalize' in model.get_params():
        # older scikit-learn: the estimator still understands `normalize`
        parameters['normalize'] = normalize
    else:
        # newer scikit-learn: putting `normalize` in the grid would make the search fail with
        # an "invalid parameter" error, so it is dropped when nothing but False is requested
        try:
            requested = list(normalize)
        except TypeError:
            requested = [normalize]
        if any(requested):
            raise ValueError(
                "The installed scikit-learn ElasticNet has no 'normalize' parameter "
                "(removed in scikit-learn 1.2); scale the features beforehand, e.g. with "
                "StandardScaler, and pass normalize=[False]."
            )

    # GridSearchCV: to find the best hyperparameters based on the scoring method
    grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring=accuracy_metric, cv=cv)

    # fit the model with the best hyperparameters
    grid_result = grid_search.fit(x, y)

    end = time.time()
    # elapsed time is reported in minutes
    print("time = ", (end-start)/60)

    # return the model to proceed into prediction
    return grid_result

In [None]:
# AdaBoost Regressor
# Help: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html

def adaboost_regressor(x, y, n_estimators, learning_rate, loss, accuracy_metric, cv=5):
    """ Grid-search an AdaBoost regressor with k-fold cross-validation

    Each hyperparameter argument is placed as-is into GridSearchCV's param_grid,
    so it should hold the candidate values to try.

    Args:
        x (DataFrame): training feature variables
        y (DataFrame): training target variable
        n_estimators (list of int): candidates for the maximum number of estimators
            at which boosting is terminated
        learning_rate (list of float): candidates for the learning rate, which shrinks
            the contribution of each regressor
        loss (list of str): candidates for the loss function used when updating the
            weights after each boosting iteration
        accuracy_metric: scoring used to compare the cross-validation splits
        cv (int): value of k in k-fold cross-validation
    Returns:
        The value returned by GridSearchCV.fit(x, y), i.e. the tuned search result
    """

    tic = time.time()

    # Search space handed to the grid search
    search_space = dict(n_estimators=n_estimators,
                        learning_rate=learning_rate,
                        loss=loss)

    # AdaBoost estimator with a fixed random_state for reproducibility
    tuner = GridSearchCV(
        estimator=AdaBoostRegressor(random_state=42),
        param_grid=search_space,
        scoring=accuracy_metric,
        cv=cv,
    )

    # Run the search over the full grid
    fitted_search = tuner.fit(x, y)

    # Elapsed time, converted from seconds to minutes
    print("time = ", (time.time() - tic)/60)

    return fitted_search

In [2]:
# Decision Tree Regressor
# Help: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

def decision_tree_regressor(x, y, criterion, max_features, max_depth, min_samples_leaf, accuracy_metric, cv=5):
    """ Grid-search a Decision Tree regressor with k-fold cross-validation

    Each hyperparameter argument is placed as-is into GridSearchCV's param_grid,
    so it should hold the candidate values to try.

    Args:
        x (DataFrame): training feature variables
        y (DataFrame): training target variable
        criterion (list of str): candidates for the function measuring the quality of a split
        max_features (list): candidates for the number of features considered when
            looking for the best split
        max_depth (list of int): candidates for the maximum depth of the tree
        min_samples_leaf (list of int): candidates for the minimum number of samples
            required to be at a leaf node
        accuracy_metric: scoring used to compare the cross-validation splits
        cv (int): value of k in k-fold cross-validation
    Returns:
        The value returned by GridSearchCV.fit(x, y), i.e. the tuned search result
    """

    tic = time.time()

    # Search space handed to the grid search
    search_space = dict(criterion=criterion,
                        max_features=max_features,
                        max_depth=max_depth,
                        min_samples_leaf=min_samples_leaf)

    # Decision tree estimator with a fixed random_state for reproducibility
    tuner = GridSearchCV(
        estimator=DecisionTreeRegressor(random_state=42),
        param_grid=search_space,
        scoring=accuracy_metric,
        cv=cv,
    )

    # Run the search over the full grid
    fitted_search = tuner.fit(x, y)

    # Elapsed time, converted from seconds to minutes
    print("time = ", (time.time() - tic)/60)

    return fitted_search

In [3]:
def random_forest_regressor(x, y, criterion, n_estimators, bootstrap, max_features,
                            max_depth, min_samples_leaf, accuracy_metric, cv=5):
    """ Grid-search a Random Forest regressor with k-fold cross-validation

    Each hyperparameter argument is placed as-is into GridSearchCV's param_grid,
    so it should hold the candidate values to try.

    Args:
        x (DataFrame): training feature variables
        y (DataFrame): training target variable
        criterion (list): candidates for the function measuring the quality of a split
        n_estimators (list of int): candidates for the number of trees in the forest
        bootstrap (list of bool): candidates for whether bootstrap samples are used
            when building trees
        max_features (list): candidates for the number of features considered when
            looking for the best split
        max_depth (list of int): candidates for the maximum depth of the tree
        min_samples_leaf (list of int): candidates for the minimum number of samples
            required to be at a leaf node
        accuracy_metric: scoring used to compare the cross-validation splits
            (for example 'neg_root_mean_squared_error')
        cv (int): value of k in k-fold cross-validation
    Returns:
        The value returned by GridSearchCV.fit(x, y), i.e. the tuned search result
    """

    tic = time.time()

    # Search space handed to the grid search
    search_space = dict(criterion=criterion,
                        n_estimators=n_estimators,
                        bootstrap=bootstrap,
                        max_features=max_features,
                        max_depth=max_depth,
                        min_samples_leaf=min_samples_leaf)

    # Random forest estimator with a fixed random_state for reproducibility
    tuner = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=search_space,
        scoring=accuracy_metric,
        cv=cv,
    )

    # Run the search over the full grid
    fitted_search = tuner.fit(x, y)

    # Elapsed time, converted from seconds to minutes
    print("time = ", (time.time() - tic)/60)

    return fitted_search

In [4]:
def gradient_boosting_regressor(x, y, criterion, max_depth, n_estimators, learning_rate, accuracy_metric, cv=5):
    """ Grid-search a Gradient Boosting regressor with k-fold cross-validation

    Each hyperparameter argument is placed as-is into GridSearchCV's param_grid,
    so it should hold the candidate values to try.

    Args:
        x (DataFrame): training feature variables
        y (DataFrame): training target variable
        criterion (list of str): candidates for the function measuring the quality of a
            split, e.g. 'friedman_mse' (accepted names depend on the installed
            scikit-learn version)
        max_depth (list of int): candidates for the maximum depth of the individual
            regression estimators
        n_estimators (list of int): candidates for the number of boosting stages to perform
        learning_rate (list of float): candidates for the learning rate, which shrinks
            the contribution of each tree
        accuracy_metric: scoring used to compare the cross-validation splits
        cv (int): value of k in k-fold cross-validation
    Returns:
        The value returned by GridSearchCV.fit(x, y), i.e. the tuned search result
    """

    tic = time.time()

    # Search space handed to the grid search
    search_space = dict(criterion=criterion,
                        max_depth=max_depth,
                        n_estimators=n_estimators,
                        learning_rate=learning_rate)

    # Gradient boosting estimator with a fixed random_state for reproducibility
    tuner = GridSearchCV(
        estimator=GradientBoostingRegressor(random_state=42),
        param_grid=search_space,
        scoring=accuracy_metric,
        cv=cv,
    )

    # Run the search over the full grid
    fitted_search = tuner.fit(x, y)

    # Elapsed time, converted from seconds to minutes
    print("time = ", (time.time() - tic)/60)

    return fitted_search