In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from sklearn import ensemble
from sklearn.inspection import permutation_importance

In [2]:
df = pd.read_csv('WC-stats.csv')


In [3]:
df.head()

Unnamed: 0,goals_z,xg_z,crosses_z,boxtouches_z,passes_z,progpasses_z,takeons_z,progruns_z,tackles_z,interceptions_z,clearances_z,blocks_z,aerials_z,fouls_z,fouled_z,nsxg_z,results
0,0.423077,0.146923,-0.136154,-0.03,0.429231,0.037692,0.244615,-0.22,0.216154,0.27,-0.076923,-0.097692,-0.02,-0.224615,0.100769,-0.124615,2
1,0.216923,0.348462,0.031538,0.158462,0.835385,0.626923,0.27,0.266923,1.143846,0.834615,-0.059231,-0.134615,0.196923,-0.016923,0.033846,0.146923,1
2,0.113846,0.392308,0.599231,0.513846,0.833077,0.405385,0.521538,1.139231,1.109231,0.859231,0.084615,0.103077,0.683846,0.213846,0.353846,0.786923,9
3,0.479231,0.609231,0.227692,0.450769,0.770769,0.042308,0.337692,0.927692,0.506923,1.015385,0.020769,0.381538,0.038462,0.039231,0.022308,0.692308,5
4,0.877692,0.773846,0.428462,0.659231,0.754615,0.335385,0.023077,0.638462,0.493846,0.637692,-0.117692,-0.033846,0.572308,-0.016154,-0.096923,0.890769,5


In [4]:
df.columns

Index(['goals_z', 'xg_z', 'crosses_z', 'boxtouches_z', 'passes_z',
       'progpasses_z', 'takeons_z', 'progruns_z', 'tackles_z',
       'interceptions_z', 'clearances_z', 'blocks_z', 'aerials_z', 'fouls_z',
       'fouled_z', 'nsxg_z', 'results'],
      dtype='object')

In [5]:
y = df['results']
X = df.drop(columns=['results'])

X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.10, random_state=2)

In [6]:
params = {
    "n_estimators": 3990,
    "max_depth": 400,
    "min_samples_split":4,
    "learning_rate": 0.0159,
    "loss": "squared_error",
    "max_features": 16,
    "min_weight_fraction_leaf": 0.11,
#     "subsample": 0.8
    "min_impurity_decrease": 5,
    
}

In [7]:
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

print('GRADIENT BOOSTING REGRESSION')

test_pred = reg.predict(X_test)

print('RMSE: '+str(np.sqrt(mean_squared_error(y_test, test_pred))))
print('R Squared: '+str(r2_score(y_test, test_pred)))
print('MSE TEST: '+str(mean_squared_error(y_test, test_pred)))
print('MAE TEST: '+str(mean_absolute_error(y_test, test_pred)))
print('MAPE: '+str(mean_absolute_percentage_error(y_test, test_pred)))


GRADIENT BOOSTING REGRESSION
RMSE: 5.406510252679109
R Squared: 0.6078736995531187
MSE TEST: 29.23035311232432
MAE TEST: 3.86815038340844
MAPE: 0.2777979228251335


In [8]:
# mape = 1.0765
# mape = 1.0647916900630887 max_feat = 16 
# mape 1.059828673697475 / estimators 5000
# mape 1.0597337473020356 / estimators 4000
# mape 1.0597326684323638 / estimators 4990
# mape 1.0432022275261308 / min fraction leaf .12
#mape 1.031433617032885 / '' ''     ''   '' .11
# ** Results vary -mape .9879763523310932 / "subsample": 0.8
# mpae  1.0241404892305128 / min_impurity_decrease 10
# mape 1.0097789777918718 / min decrease 5

In [9]:
data = {'y_test': y_test, 'y_pred': test_pred}

In [10]:
new_df = pd.DataFrame(data)

In [11]:
diff = (new_df['y_test'] - new_df['y_pred']).abs()

In [12]:
new_df['diff'] = diff
new_df = new_df.sort_values(by='diff', ascending=True)
new_df['y_pred'] = round(new_df['y_pred'], 1)
new_df['rounded'] = round(new_df['y_pred'])
new_df['new_diff'] = (new_df['y_test'] - new_df['rounded']).abs()

In [13]:
new_df

Unnamed: 0,y_test,y_pred,diff,rounded,new_diff
147,23,23.1,0.067273,23.0,0.0
247,20,20.3,0.256652,20.0,0.0
35,6,6.3,0.269581,6.0,0.0
84,23,23.4,0.442826,23.0,0.0
3,5,4.2,0.816553,4.0,1.0
184,21,21.9,0.920504,22.0,1.0
257,24,25.0,1.039204,25.0,1.0
175,19,17.6,1.361546,18.0,1.0
187,13,14.8,1.807655,15.0,2.0
71,12,10.1,1.915614,10.0,2.0


In [14]:
new_df['diff'].mean()

3.86815038340844

In [15]:
new_df['diff'].median()

2.7078281820908217

In [16]:
new_df['new_diff'].mean()


3.8518518518518516

In [17]:
new_df['new_diff'].median()

3.0

In [25]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_split=3)

sfs = SequentialFeatureSelector(rr, features_to_select=17, 
                                direction="forward", cv=split, 
                                n_jobs=4)


TypeError: __init__() got an unexpected keyword argument 'n_split'

In [23]:
df.shape

(267, 17)