In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor

# Load the modelling table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('combined v2.pkl')

In [2]:
# Columns of df used as model inputs. The order matters: it fixes the column
# order of the design matrices and of any per-feature output of the model.
features = [
    'next_order',
    'next_park_factor_SLG',
    'next_park_factor_OBP',
    'S_avg',
    'SB_avg',
    'BB_avg',
    'HR_avg',
    'CS_avg',
    'pts_1000',
    'ratio_100',
    'ratio_250',
    'ratio_500',
    'park_ratio_SLG',
    'park_ratio_OBP',
    'pts_500_p',
    'whip_SO_p',
    'ratio_2000',
    'park_ratio_SLG_p',
    'park_ratio_OBP_p',
]

In [3]:
# Split by season: rows with 2008 <= year < 2017 are used for fitting,
# rows with year >= 2017 are held out for evaluation.
train_rows = (df['year'] >= 2008) & (df['year'] < 2017)
test_rows = df['year'] >= 2017

# Feature matrices
X_train = df.loc[train_rows, features]
X_test = df.loc[test_rows, features]

# Targets ('pts' column), selected with the same row masks
y_train = df.loc[train_rows, 'pts']
y_test = df.loc[test_rows, 'pts']

In [4]:
# Gradient-boosted regressor with scikit-learn's default hyperparameters.
# random_state is pinned so that a fresh-kernel "Restart & Run All" rebuilds
# the same model; without it the fit is not guaranteed to be reproducible.
GBR = GradientBoostingRegressor(random_state=42)

# Fit on the training split only
GBR.fit(X_train, y_train)

# Predictions for the held-out split
y_pred = GBR.predict(X_test)

In [5]:
# Put the held-out predictions next to the observed target and the season.
is_test_year = df['year'] >= 2017
pred = pd.DataFrame({
    'pred': y_pred,
    'actual': y_test,
    'year': df.loc[is_test_year, 'year'],
})

# Per-row error terms. Note that the 'MAE' and 'RMSE' columns of `pred` hold
# the per-row absolute and squared residuals; they only become MAE / RMSE
# once they are averaged (and square-rooted) further down.
pred = pred.assign(
    residuals=lambda d: d['actual'] - d['pred'],
    MAE=lambda d: d['residuals'].abs(),
    RMSE=lambda d: d['residuals'] ** 2,
)

# Correlation between actual and predicted within each year: keep the
# 'actual' row / 'pred' column of every per-year correlation matrix.
per_year_corr = pred[['actual', 'pred', 'year']].groupby('year').corr()
cor = (
    per_year_corr
    .xs('actual', level=1)[['pred']]
    .rename(columns={'pred': 'correlation'})
)

# Correlation over all held-out rows pooled together goes in a 'Total' row
cor.loc['Total', 'correlation'] = pred[['actual', 'pred']].corr().loc['actual', 'pred']

# Yearly error summary: mean absolute error, and root of the mean squared error
err = pred.groupby('year')[['MAE', 'RMSE']].mean()
err['RMSE'] = err['RMSE'].pow(0.5)
err.loc['Total', 'MAE'] = pred['MAE'].mean()
err.loc['Total', 'RMSE'] = pred['RMSE'].mean() ** 0.5

# One table: correlation, MAE and RMSE per year plus the 'Total' row
cor = cor.join(err)

# Print result
print(cor)

       correlation       MAE       RMSE
year                                   
2017      0.162100  7.526502   9.888406
2018      0.174781  7.318687   9.515556
2019      0.180639  7.699305  10.088049
Total     0.172858  7.514210   9.832668


In [6]:
# Pair every feature name with the fitted model's importance score and
# list them from most to least important.
feat = (
    pd.DataFrame({'feat': features, 'imp': GBR.feature_importances_})
    .sort_values('imp', ascending=False)
)
print('\n\n', feat)



                     feat       imp
0             next_order  0.507007
8               pts_1000  0.249873
1   next_park_factor_SLG  0.044487
12        park_ratio_SLG  0.036567
2   next_park_factor_OBP  0.026496
6                 HR_avg  0.024911
4                 SB_avg  0.018828
10             ratio_250  0.014129
3                  S_avg  0.012908
9              ratio_100  0.012462
13        park_ratio_OBP  0.009856
5                 BB_avg  0.007936
14             pts_500_p  0.007694
15             whip_SO_p  0.007293
18      park_ratio_OBP_p  0.005305
16            ratio_2000  0.004227
11             ratio_500  0.003679
17      park_ratio_SLG_p  0.003322
7                 CS_avg  0.003020


In [7]:
# Predict on the same rows the model was fitted on, so in-sample error can be
# compared with the held-out error.
y_pred_train = GBR.predict(X_train)

In [9]:
# Put the in-sample predictions next to the observed target and the season.
is_train_year = (df['year'] < 2017) & (df['year'] >= 2008)
pred_train = pd.DataFrame({
    'pred_train': y_pred_train,
    'actual': y_train,
    'year': df.loc[is_train_year, 'year'],
})

# Per-row error terms. Note that the 'MAE' and 'RMSE' columns of `pred_train`
# hold the per-row absolute and squared residuals; they only become MAE / RMSE
# once they are averaged (and square-rooted) further down.
pred_train = pred_train.assign(
    residuals=lambda d: d['actual'] - d['pred_train'],
    MAE=lambda d: d['residuals'].abs(),
    RMSE=lambda d: d['residuals'] ** 2,
)

# Correlation between actual and predicted within each year: keep the
# 'actual' row / 'pred_train' column of every per-year correlation matrix.
per_year_corr_train = pred_train[['actual', 'pred_train', 'year']].groupby('year').corr()
cor = (
    per_year_corr_train
    .xs('actual', level=1)[['pred_train']]
    .rename(columns={'pred_train': 'correlation'})
)

# Correlation over all training rows pooled together goes in a 'Total' row
cor.loc['Total', 'correlation'] = (
    pred_train[['actual', 'pred_train']].corr().loc['actual', 'pred_train']
)

# Yearly error summary: mean absolute error, and root of the mean squared error
err = pred_train.groupby('year')[['MAE', 'RMSE']].mean()
err['RMSE'] = err['RMSE'].pow(0.5)
err.loc['Total', 'MAE'] = pred_train['MAE'].mean()
err.loc['Total', 'RMSE'] = pred_train['RMSE'].mean() ** 0.5

# One table: correlation, MAE and RMSE per year plus the 'Total' row
cor = cor.join(err)

# Print result
print(cor)

       correlation       MAE      RMSE
year                                  
2008      0.189356  7.207799  9.458398
2009      0.183573  7.233854  9.483906
2010      0.191357  7.061317  9.231533
2011      0.197084  7.004174  9.097591
2012      0.194300  7.110968  9.253560
2013      0.180551  6.980828  9.061074
2014      0.188184  6.850441  8.829321
2015      0.190091  7.039045  9.187790
2016      0.186370  7.296233  9.540896
Total     0.190549  7.087088  9.240598
