In [2]:
import pandas as pd

In [3]:
# Load the VR demo table through an explicitly UTF-8-decoded text handle;
# skipinitialspace tells pandas to skip spaces that follow a delimiter.
with open('vr_demo.csv', encoding='utf-8') as csv_handle:
    df = pd.read_csv(
        csv_handle,
        encoding='utf-8',
        skipinitialspace=True,
    )

# Bare trailing expression so the notebook renders the loaded frame.
df

Unnamed: 0,Base MD.Sample,Lat.Any,Long.Any,VR Mean.Any
0,1740.0,47.122103,-47.957358,0.33
1,1920.0,47.122103,-47.957358,0.39
2,2730.0,47.122103,-47.957358,0.42
3,3990.0,47.122103,-47.957358,0.49
4,4410.0,47.122103,-47.957358,0.59
...,...,...,...,...
283,2460.0,46.458936,-48.282383,0.45
284,2560.0,46.458936,-48.282383,0.47
285,2740.0,46.458936,-48.282383,0.49
286,3580.0,46.458936,-48.282383,0.67


In [12]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
import numpy as np

# One seed shared by the split and the model so the cell is reproducible.
seed = 42

# Column layout used here: every column except the last is a predictor,
# and the last column is the regression target.
features = df.columns.values[:-1]
targets = df.columns.values[-1]

X = df[features]
y = df[targets]

# 70 % of the rows go to training; the remainder is held out for testing.
X_trn, X_test, y_trn, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=seed,
)

# Baseline model: gradient boosting with library defaults apart from the seed.
regressor = GradientBoostingRegressor(random_state=seed)

def build_and_eval_pipeline(regressor):
    """Fit a scale -> impute -> regress pipeline and report hold-out metrics.

    The pipeline is trained on the notebook-level ``X_trn`` / ``y_trn`` split
    and evaluated on ``X_test`` / ``y_test``. R² and RMSE for the test split
    are printed.

    Parameters
    ----------
    regressor : estimator
        Object used as the final ``'Regression'`` step. It is placed in the
        pipeline as-is, so it is fitted in place and the caller can read its
        fitted attributes afterwards.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline, so it can be reused for further predictions or
        inspection instead of being discarded.
    """
    # Scaling precedes imputation; presumably so that the KNN imputer's
    # neighbour distances are computed on comparable feature scales.
    pipeline = Pipeline(steps=[
        ('Scale', StandardScaler()),
        ('Impute', KNNImputer()),
        ('Regression', regressor)])

    pipeline.fit(X_trn, y_trn)
    y_pred = pipeline.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"R²: {r2:.3f}, RMSE: {rmse:.2f}")
    return pipeline


# Keep the fitted baseline pipeline; assigning the result also keeps the cell
# output limited to the printed metrics.
baseline_pipeline = build_and_eval_pipeline(regressor)

R²: 0.665, RMSE: 0.17


In [53]:
# try with grid search
from sklearn.model_selection import GridSearchCV

def _int_logspace(start_exp, stop_exp, num):
    """Return ``num`` log-spaced values from 10**start_exp to 10**stop_exp, truncated to ints."""
    return [int(value) for value in np.logspace(start_exp, stop_exp, num)]


# Candidate ensemble sizes: 15 log-spaced points between 10**0.5 and 10**3.
n_estimators = _int_logspace(0.5, 3, 15)
print(f"n_estimators param space: {n_estimators}")

# Candidate tree depths: 6 log-spaced points between 10**0.32 and 10**1.6.
max_depth = _int_logspace(0.32, 1.6, 6)
print(f"max_depth param space: {max_depth}")

# Hyper-parameter candidates; the grid search evaluates every combination.
param_grid = dict(
    # loss=['squared_error', 'absolute_error', 'huber', 'quantile'],
    learning_rate=[1e-1, 0.25],
    n_estimators=n_estimators,
    max_depth=max_depth,
)

# The search object itself becomes the final step of the evaluation pipeline.
# NOTE(review): build_and_eval_pipeline fits the scaler and imputer on the
# whole training split before the search makes its internal CV folds, so the
# fold scores are not fully independent of the preprocessing - confirm this
# is acceptable.
regressorSearch = GridSearchCV(estimator=regressor, param_grid=param_grid)
build_and_eval_pipeline(regressorSearch)

# Trailing expression: display the winning parameter combination.
regressorSearch.best_params_

n_esimators param space: [3, 4, 7, 10, 16, 24, 37, 56, 84, 127, 193, 291, 439, 662, 1000]
max_depth param space: [2, 3, 6, 12, 22, 39]
R²: 0.677, RMSE: 0.16


{'learning_rate': 0.25, 'max_depth': 3, 'n_estimators': 16}

In [54]:
# Show the five best-ranked parameter combinations from the grid search.
(
    pd.DataFrame(regressorSearch.cv_results_)
    .sort_values(by="rank_test_score")
    .head(5)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
109,0.003594,0.000486,0.000205,0.00041,0.25,3,16,"{'learning_rate': 0.25, 'max_depth': 3, 'n_est...",0.766402,0.790987,0.72486,0.784164,0.838418,0.780966,0.0368,1
110,0.0046,0.000494,0.0004,0.00049,0.25,3,24,"{'learning_rate': 0.25, 'max_depth': 3, 'n_est...",0.752218,0.798391,0.725335,0.782121,0.79678,0.770969,0.028201,2
108,0.002398,0.000492,0.000603,0.000492,0.25,3,10,"{'learning_rate': 0.25, 'max_depth': 3, 'n_est...",0.755015,0.787811,0.682877,0.824573,0.795074,0.76907,0.048445,3
9,0.021392,0.000481,0.000601,0.00049,0.1,2,127,"{'learning_rate': 0.1, 'max_depth': 2, 'n_esti...",0.803968,0.810382,0.659277,0.714651,0.784492,0.754554,0.058523,4
111,0.007399,0.00049,0.000399,0.000489,0.25,3,37,"{'learning_rate': 0.25, 'max_depth': 3, 'n_est...",0.734152,0.798851,0.735129,0.745759,0.749871,0.752752,0.023828,5


In [31]:
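# Optional export of the full grid-search results to Excel; left disabled so
# that re-running the notebook does not write a file.
#pd.DataFrame(regressorSearch.cv_results_).to_excel('gbr_grid_search.xlsx')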