In [1]:
import pandas as pd
import numpy as np
import time

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# import scikit-learn for machine learning
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

# K-fold cross validation
from sklearn.model_selection import GridSearchCV

# target variable transformation
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer

# accuracy metrics for the regression problem
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# to save the models
import pickle

# to remove unnecessary warnings
import warnings

warnings.filterwarnings('ignore')

In [2]:
def cv_result(model):
    """
    Summarise the cross-validation results of a fitted search object as a DataFrame
    Args:
        model: fitted search object exposing a ``cv_results_`` attribute
    Returns:
        a DataFrame built from ``model.cv_results_`` with the timing,
        ``params`` and ``rank_test_score`` columns removed
    """

    # timing and bookkeeping columns that are stripped from the summary
    unwanted_columns = [
        'mean_fit_time',
        'std_fit_time',
        'mean_score_time',
        'std_score_time',
        'params',
        'rank_test_score',
    ]

    # build the frame and remove the unwanted columns in one chained expression
    return pd.DataFrame(model.cv_results_).drop(columns=unwanted_columns)

In [3]:
def GBRegressor_with_target_transformation(x, y, n_estimators, max_depth, learning_rate, kpi, transformer):
    """
    Grid-search a gradient boosting regressor whose target variable is transformed

    The regressor is wrapped in a TransformedTargetRegressor, placed in a
    single-step pipeline, and tuned with a cross-validated grid search (cv=5).
    Args:
        x (dataFrame): X_train
        y (dataFrame): y_train
        n_estimators (List[int]): candidate numbers of boosting stages
        max_depth (List[int]): candidate maximum depths of the individual regression estimators
        learning_rate (List[float]): candidate learning rates (shrinkage applied to each tree)
        kpi (str): scoring name passed to GridSearchCV,
            e.g. 'neg_mean_absolute_error', 'neg_root_mean_squared_error'
        transformer: target transformer handed to TransformedTargetRegressor,
            e.g. PowerTransformer(method='box-cox')
    Returns:
        the fitted GridSearchCV object
    """
    # the step name 'TargetTransformed' is the prefix of every grid key below
    steps = [
        ('TargetTransformed',
         TransformedTargetRegressor(regressor=GradientBoostingRegressor(random_state=42),
                                    transformer=transformer)),
    ]

    # candidate values, addressed as <step>__regressor__<hyperparameter>
    search_space = {
        'TargetTransformed__regressor__n_estimators': n_estimators,
        'TargetTransformed__regressor__max_depth': max_depth,
        'TargetTransformed__regressor__learning_rate': learning_rate,
    }

    searcher = GridSearchCV(estimator=Pipeline(steps), param_grid=search_space,
                            scoring=kpi, cv=5, n_jobs=-1)

    # fit() runs the search and returns the fitted search object itself
    return searcher.fit(x, y)

In [4]:
def error_comparison_with_target_transformation(X_train, y_train, X_test, y_test, max_depth, kpi, transformer):
    """
    Compare cross-validated and test errors of the target-transformed model per max_depth

    For every value in ``max_depth`` a grid search is run with
    n_estimators=[100] and learning_rate=[0.1], and the resulting model is
    scored on the test set with the metric matching ``kpi``.
    Args:
        X_train (dataFrame): training features
        y_train (dataFrame): target train variable
        X_test (dataFrame): test features
        y_test (dataFrame): target test variable
        max_depth (List[int]): Maximum depth values to evaluate, one search per value
        kpi (str): 'neg_mean_absolute_error' or 'neg_root_mean_squared_error'
        transformer: target transformer, e.g. PowerTransformer(method='box-cox')
    Returns:
        train_kpi (List[float]): absolute value of the search's ``best_score_``
            (the cross-validated score, not an error measured on the full
            training set), rounded to 2 decimals, one entry per max_depth
        test_kpi (List[float]): error on the test set, rounded to 2 decimals
        cv_df (dataFrame): cross-validation results of all searches stacked together
    Raises:
        ValueError: if ``kpi`` is not one of the two supported scoring names
    """
    # Reject unsupported KPIs before any model is fitted; otherwise the test
    # error would be undefined only after an expensive grid search has run.
    supported_kpis = ('neg_mean_absolute_error', 'neg_root_mean_squared_error')
    if kpi not in supported_kpis:
        raise ValueError(
            "Unsupported kpi {!r}; expected one of {}".format(kpi, supported_kpis)
        )

    start = time.time()

    train_kpi = []
    test_kpi = []
    cv_frames = []

    for max_depth_ in max_depth:
        model = GBRegressor_with_target_transformation(X_train, y_train, [100], [max_depth_], [0.1], kpi, transformer)

        # scoring names are negated errors, so abs() recovers the error magnitude
        train_error = np.round(np.abs(model.best_score_), 2)
        train_kpi.append(train_error)

        cv_frames.append(cv_result(model))

        y_pred = model.predict(X_test)
        if kpi == 'neg_mean_absolute_error':
            test_error = np.round(mean_absolute_error(y_test, y_pred), 2)
        else:
            # only 'neg_root_mean_squared_error' can reach this branch
            test_error = np.round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)
        test_kpi.append(test_error)

    # Concatenate once instead of re-copying the accumulated frame on every
    # iteration; an empty max_depth still yields an empty DataFrame.
    cv_df = pd.concat(cv_frames) if cv_frames else pd.DataFrame()

    end = time.time()
    print("time = ", (end-start)/60)

    return train_kpi, test_kpi, cv_df

In [5]:
def GBRegressor_without_target_transformation(x, y, n_estimators, max_depth, learning_rate, kpi):
    """
    Grid-search a gradient boosting regressor on the untransformed target

    The regressor is placed in a single-step pipeline and tuned with a
    cross-validated grid search (cv=5).
    Args:
        x (dataFrame): X_train
        y (dataFrame): y_train
        n_estimators (List[int]): candidate numbers of boosting stages
        max_depth (List[int]): candidate maximum depths of the individual regression estimators
        learning_rate (List[float]): candidate learning rates (shrinkage applied to each tree)
        kpi (str): scoring name passed to GridSearchCV,
            e.g. 'neg_mean_absolute_error', 'neg_root_mean_squared_error'
    Returns:
        the fitted GridSearchCV object
    """
    # the step name 'regressor' is the prefix of every grid key below
    pipeline = Pipeline([('regressor', GradientBoostingRegressor(random_state=42))])

    # candidate values, addressed as <step>__<hyperparameter>
    search_space = {
        'regressor__n_estimators': n_estimators,
        'regressor__max_depth': max_depth,
        'regressor__learning_rate': learning_rate,
    }

    searcher = GridSearchCV(estimator=pipeline, param_grid=search_space,
                            scoring=kpi, cv=5, n_jobs=-1)

    # fit() runs the search and returns the fitted search object itself
    return searcher.fit(x, y)

In [6]:
def error_comparison_without_target_transformation(X_train, y_train, X_test, y_test, max_depth, kpi):
    """
    Compare cross-validated and test errors of the untransformed-target model per max_depth

    For every value in ``max_depth`` a grid search is run with
    n_estimators=[100] and learning_rate=[0.1], and the resulting model is
    scored on the test set.
    Args:
        X_train (dataFrame): training features
        y_train (dataFrame): target train variable
        X_test (dataFrame): test features
        y_test (dataFrame): target test variable
        max_depth (List[int]): Maximum depth values to evaluate, one search per value
        kpi (str): 'neg_mean_absolute_error' or 'neg_root_mean_squared_error';
            any value other than 'neg_mean_absolute_error' is scored as RMSE
            on the test set
    Returns:
        train_kpi (List[float]): absolute value of the search's ``best_score_``
            (the cross-validated score, not an error measured on the full
            training set), rounded to 2 decimals, one entry per max_depth
        test_kpi (List[float]): error on the test set, rounded to 2 decimals
        cv_df (dataFrame): cross-validation results of all searches stacked together
    """
    start = time.time()

    train_kpi = []
    test_kpi = []
    cv_frames = []

    for max_depth_ in max_depth:
        model = GBRegressor_without_target_transformation(X_train, y_train, [100], [max_depth_], [0.1], kpi)

        # scoring names are negated errors, so abs() recovers the error magnitude
        train_error = np.round(np.abs(model.best_score_), 2)
        train_kpi.append(train_error)

        cv_frames.append(cv_result(model))

        y_pred = model.predict(X_test)
        if kpi == 'neg_mean_absolute_error':
            test_error = np.round(mean_absolute_error(y_test, y_pred), 2)
        else:
            # NOTE: every other kpi falls back to RMSE (kept for backward
            # compatibility); it is only comparable with train_kpi when
            # kpi is 'neg_root_mean_squared_error'.
            test_error = np.round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)
        test_kpi.append(test_error)

    # Concatenate once instead of re-copying the accumulated frame on every
    # iteration; an empty max_depth still yields an empty DataFrame.
    cv_df = pd.concat(cv_frames) if cv_frames else pd.DataFrame()

    end = time.time()
    print("time = ", (end-start)/60)

    return train_kpi, test_kpi, cv_df