https://scikit-learn.org/stable/auto_examples/linear_model/plot_poisson_regression_non_normal_loss.html#sphx-glr-auto-examples-linear-model-plot-poisson-regression-non-normal-loss-py

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df = pd.read_csv('WC-stats.csv')

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In order to fit linear models with those predictors it is therefore necessary to perform standard feature transformations as follows:

In [39]:
df.columns

Index(['goals_z', 'xg_z', 'crosses_z', 'boxtouches_z', 'passes_z',
       'progpasses_z', 'takeons_z', 'progruns_z', 'tackles_z',
       'interceptions_z', 'clearances_z', 'blocks_z', 'aerials_z', 'fouls_z',
       'fouled_z', 'nsxg_z', 'results'],
      dtype='object')

In [40]:
log_scale_transformer = make_pipeline(
    FunctionTransformer(np.log, validate=False), StandardScaler()
)

linear_model_preprocessor = ColumnTransformer(
    [
        ("passthrough_numeric", "passthrough", ["goals_z"]),
        ("binned_numeric", KBinsDiscretizer(n_bins=10), ['xg_z', 'crosses_z', 'boxtouches_z', 'passes_z',
       'progpasses_z', 'takeons_z', 'progruns_z', 'tackles_z',
       'interceptions_z', 'clearances_z', 'blocks_z', 'aerials_z', 'fouls_z',
       'fouled_z', 'nsxg_z']),
        ("log_scaled_numeric", log_scale_transformer, ["results"]),
    ],
    remainder="drop",
)

To evaluate the pertinence of the used metrics, we will consider as a baseline a “dummy” estimator that constantly predicts the mean frequency of the training sample.



In [41]:
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [45]:
df.head()

Unnamed: 0,goals_z,xg_z,crosses_z,boxtouches_z,passes_z,progpasses_z,takeons_z,progruns_z,tackles_z,interceptions_z,clearances_z,blocks_z,aerials_z,fouls_z,fouled_z,nsxg_z,results
0,0.423077,0.146923,-0.136154,-0.03,0.429231,0.037692,0.244615,-0.22,0.216154,0.27,-0.076923,-0.097692,-0.02,-0.224615,0.100769,-0.124615,2.0
1,0.216923,0.348462,0.031538,0.158462,0.835385,0.626923,0.27,0.266923,1.143846,0.834615,-0.059231,-0.134615,0.196923,-0.016923,0.033846,0.146923,1.0
2,0.113846,0.392308,0.599231,0.513846,0.833077,0.405385,0.521538,1.139231,1.109231,0.859231,0.084615,0.103077,0.683846,0.213846,0.353846,0.786923,9.0
3,0.479231,0.609231,0.227692,0.450769,0.770769,0.042308,0.337692,0.927692,0.506923,1.015385,0.020769,0.381538,0.038462,0.039231,0.022308,0.692308,5.0
4,0.877692,0.773846,0.428462,0.659231,0.754615,0.335385,0.023077,0.638462,0.493846,0.637692,-0.117692,-0.033846,0.572308,-0.016154,-0.096923,0.890769,5.0


In [46]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 134 to 47
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   goals_z          124 non-null    float64
 1   xg_z             124 non-null    float64
 2   crosses_z        124 non-null    float64
 3   boxtouches_z     124 non-null    float64
 4   passes_z         124 non-null    float64
 5   progpasses_z     124 non-null    float64
 6   takeons_z        124 non-null    float64
 7   progruns_z       124 non-null    float64
 8   tackles_z        124 non-null    float64
 9   interceptions_z  124 non-null    float64
 10  clearances_z     124 non-null    float64
 11  blocks_z         124 non-null    float64
 12  aerials_z        124 non-null    float64
 13  fouls_z          124 non-null    float64
 14  fouled_z         124 non-null    float64
 15  nsxg_z           124 non-null    float64
 16  results          124 non-null    float64
dtypes: float64(17)


In [47]:
df['results'] = df['results'].astype(float)


In [48]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 134 to 47
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   goals_z          124 non-null    float64
 1   xg_z             124 non-null    float64
 2   crosses_z        124 non-null    float64
 3   boxtouches_z     124 non-null    float64
 4   passes_z         124 non-null    float64
 5   progpasses_z     124 non-null    float64
 6   takeons_z        124 non-null    float64
 7   progruns_z       124 non-null    float64
 8   tackles_z        124 non-null    float64
 9   interceptions_z  124 non-null    float64
 10  clearances_z     124 non-null    float64
 11  blocks_z         124 non-null    float64
 12  aerials_z        124 non-null    float64
 13  fouls_z          124 non-null    float64
 14  fouled_z         124 non-null    float64
 15  nsxg_z           124 non-null    float64
 16  results          124 non-null    float64
dtypes: float64(17)


In [49]:
df_train, df_test = train_test_split(df, test_size=0.21, random_state=0)


In [50]:

dummy = Pipeline(
    [
        ("preprocessor", linear_model_preprocessor),
        ("regressor", DummyRegressor(strategy="mean")),
    ]
).fit(df_train, df_train["goals_z"], regressor__sample_weight=df_train["results"])



In [51]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_poisson_deviance


In [53]:
def score_estimator(estimator, df_test):
    """Score an estimator on the test set."""
    y_pred = estimator.predict(df_test)

    print(
        "MSE: %.3f"
        % mean_squared_error(
            df_test["results"], y_pred, sample_weight=df_test["goals_z"]
        )
    )
    print(
        "MAE: %.3f"
        % mean_absolute_error(
            df_test["results"], y_pred, sample_weight=df_test["goals_z"]
        )
    )

    # Ignore non-positive predictions, as they are invalid for
    # the Poisson deviance.
    mask = y_pred > 0
    if (~mask).any():
        n_masked, n_samples = (~mask).sum(), mask.shape[0]
        print(
            "WARNING: Estimator yields invalid, non-positive predictions "
            f" for {n_masked} samples out of {n_samples}. These predictions "
            "are ignored when computing the Poisson deviance."
        )

    print(
        "mean Poisson deviance: %.3f"
        % mean_poisson_deviance(
            df_test["results"][mask],
            y_pred[mask],
            sample_weight=df_test["results"][mask],
        )
    )


print("Constant mean frequency evaluation:")
score_estimator(dummy, df_test)

Constant mean frequency evaluation:
MSE: -185.182
MAE: -1.506
mean Poisson deviance: 191.107


In [58]:
df_train['results'].value_counts()

7.0     8
11.0    7
3.0     7
2.0     7
4.0     6
1.0     6
5.0     6
10.0    6
9.0     5
24.0    5
12.0    5
23.0    5
28.0    5
18.0    4
6.0     4
14.0    4
29.0    3
32.0    3
13.0    3
21.0    3
15.0    3
26.0    3
30.0    2
8.0     2
25.0    2
31.0    2
16.0    2
17.0    2
22.0    1
27.0    1
19.0    1
20.0    1
Name: results, dtype: int64

In [60]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder


tree_preprocessor = ColumnTransformer(
    [
        ("numeric", "passthrough", ['xg_z', 'crosses_z', 'boxtouches_z', 'passes_z',
       'progpasses_z', 'takeons_z', 'progruns_z', 'tackles_z',
       'interceptions_z', 'clearances_z', 'blocks_z', 'aerials_z', 'fouls_z',
       'fouled_z', 'nsxg_z', 'results']),
    ],
    remainder="drop",
)



poisson_gbrt = Pipeline(
    [
        ("preprocessor", tree_preprocessor),
        (
            "regressor",
            HistGradientBoostingRegressor(loss="poisson", max_leaf_nodes=128),
        ),
    ]
)
poisson_gbrt.fit(
    df_train, df_train["results"]
)

print("Poisson Gradient Boosted Trees evaluation:")
score_estimator(poisson_gbrt, df_test)

Poisson Gradient Boosted Trees evaluation:
MSE: -1.557
MAE: -0.102
mean Poisson deviance: 0.126
