In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('data/concrete_data.csv')
df.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
from sklearn.model_selection import train_test_split
X = df.drop(columns='csMPa')
y = df.csMPa
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((824, 8), (206, 8), (824,), (206,))

In [4]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

ada_reg = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=4),
    n_estimators=100,
    learning_rate=1.0,
)
ada_reg.fit(X_train, y_train)
print(f'Train Score: {ada_reg.score(X_train, y_train):.2f}')
print(f'Test Score: {ada_reg.score(X_test, y_test):.2f}')

Train Score: 0.87
Test Score: 0.83


In [5]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

ada_reg = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=2),
    n_estimators=200,
    learning_rate=0.5,
)
ada_reg.fit(X_train, y_train)
print(f'Train Score: {ada_reg.score(X_train, y_train):.2f}')
print(f'Test Score: {ada_reg.score(X_test, y_test):.2f}')

Train Score: 0.74
Test Score: 0.72


In [7]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(
    max_depth=4,
    n_estimators=100,
    learning_rate=0.1,
)
gbr.fit(X_train, y_train)
print(f'Train Score: {gbr.score(X_train, y_train):.2f}')
print(f'Test Score: {gbr.score(X_test, y_test):.2f}')

Train Score: 0.97
Test Score: 0.92


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

gbr = GradientBoostingRegressor(max_depth=3)
params = dict(n_estimators=[50, 100, 200, 500, 1000], learning_rate=[0.1, 0.5, 1.0])
grid = GridSearchCV(gbr, params, cv=3, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print(f'Train Score: {r2_score(y_train, grid.predict(X_train)):.2f}')
print(f'Test Score: {r2_score(y_test, y_pred):.2f}')

Train Score: 0.99
Test Score: 0.94


In [11]:
print(grid.best_params_)

{'learning_rate': 0.1, 'n_estimators': 1000}


In [12]:
# Warm Start and Early Stopping
gbr = GradientBoostingRegressor(max_depth=3, warm_start=True)

min_val_error = float('inf')
error_going_up = 0

for n_estimators in range(1, 1000):
    gbr.n_estimators = n_estimators
    gbr.fit(X_train, y_train)
    
    y_pred = gbr.predict(X_test)
    val_error = mean_squared_error(y_test, y_pred)

    print(f'n_estimators: {n_estimators}, val_error: {val_error:.4f}')

    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 10: # early stopping
            break

n_estimators: 1, val_error: 232.4961
n_estimators: 2, val_error: 206.6208
n_estimators: 3, val_error: 182.4964
n_estimators: 4, val_error: 162.1774
n_estimators: 5, val_error: 146.6245
n_estimators: 6, val_error: 132.8427
n_estimators: 7, val_error: 121.5462
n_estimators: 8, val_error: 113.6059
n_estimators: 9, val_error: 105.3829
n_estimators: 10, val_error: 98.9242
n_estimators: 11, val_error: 91.7530
n_estimators: 12, val_error: 85.1065
n_estimators: 13, val_error: 79.9572
n_estimators: 14, val_error: 75.8880
n_estimators: 15, val_error: 71.8108
n_estimators: 16, val_error: 68.6884
n_estimators: 17, val_error: 65.0228
n_estimators: 18, val_error: 62.4262
n_estimators: 19, val_error: 60.0161
n_estimators: 20, val_error: 57.3311
n_estimators: 21, val_error: 55.1957
n_estimators: 22, val_error: 53.2125
n_estimators: 23, val_error: 51.6198
n_estimators: 24, val_error: 49.7197
n_estimators: 25, val_error: 48.3067
n_estimators: 26, val_error: 46.9188
n_estimators: 27, val_error: 45.5018
n

In [13]:
gbr = GradientBoostingRegressor(max_depth=3, n_estimators=n_estimators)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
print(f'Train Score: {gbr.score(X_train, y_train):.2f}')
print(f'Test Score: {gbr.score(X_test, y_test):.2f}')

Train Score: 0.98
Test Score: 0.93
