In [1]:
import os
import sys
import random
import warnings
import tqdm

import numpy as np
import pandas as pd
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
%load_ext autoreload
%autoreload 2

# 現在のファイルのディレクトリを取得
current_directory = os.path.dirname(os.path.abspath("__file__"))

# 親ディレクトリを取得
parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))

# 親ディレクトリをPythonのモジュール検索パスに追加
sys.path.append(parent_directory)

import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab

In [3]:
random.seed(0)
np.random.seed(0)

## Select data

In [4]:
experiment_date = 202406141613

In [5]:
# Configure file and folder names
datafolder = f"../data-euro2020/{experiment_date}"
spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")
features_h5 = os.path.join(datafolder, "features.h5")
labels_h5 = os.path.join(datafolder, "labels.h5")
predictions_h5 = os.path.join(datafolder, "predictions.h5")

In [6]:
games = pd.read_hdf(spadl_h5, "games")
print("nb of games:", len(games))

# note: only for the purpose of this example and due to the small dataset,
# we use the same data for training and evaluation
traingames = games
testgames = games

nb of games: 51


In [7]:
# 1. Select feature set X
xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    fs.bodypart_onehot,
    fs.startlocation,
    fs.endlocation,
    fs.movement,
    fs.space_delta,
    fs.team,
    fs.time_delta,
    fs.player_loc_dist,
]
nb_prev_actions = 1

Xcols = fs.feature_column_names(xfns, nb_prev_actions)

Xcols_vaep = Xcols

Xcols_vdep = [col for col in Xcols_vaep if "type" not in col]

def getXY(games, features_h5, labels_h5, Xcols, vaep=False):
    """
    Parameters
    ----------
    games: pd.DataFrame, 
        the games that you want to select.
    features_h5: str, 
        the path of the h5 file that contains the features.
    labels_h5: str, 
        the path of the h5 file that contains the labels.
    Xcols: list, 
        the columns of the features that you want to select.


    Returns
    -------
    X: pd.DataFrame, 
        the selected features.
    Y: pd.DataFrame, 
        the selected labels.
    drop_index: list, 
        the index of the rows that are dropped.
    """
    # generate the columns of the selected features and labels
    X = []
    for game_id in tqdm.tqdm(
        games.game_id, desc="Selecting features"
        ):
        Xi = pd.read_hdf(features_h5, f"game_{game_id}")
        X.append(Xi[Xcols])
    X = pd.concat(X).reset_index(drop=True)

    if vaep:
        Ycols = ["scores", "concedes"]
    else:
        Ycols = ["gains", "effective_attack"]
    Y = []
    for game_id in tqdm.tqdm(
        games.game_id, desc="Selecting label"
        ):
        Yi = pd.read_hdf(labels_h5, f"game_{game_id}")
        Y.append(Yi[Ycols])
    Y = pd.concat(Y).reset_index(drop=True)

    return X, Y

trainX_vaep, trainY_vaep = getXY(
    traingames, features_h5, labels_h5, Xcols_vaep, vaep=True
    )
trainX_vdep, trainY_vdep = getXY(
    traingames, features_h5, labels_h5, Xcols_vdep, vaep=False
    )
testX_vaep, testY_vaep = getXY(
    testgames, features_h5, labels_h5, Xcols_vaep, vaep=True
    )
testX_vdep, testY_vdep = getXY(
    testgames, features_h5, labels_h5, Xcols_vdep, vaep=False
    )

Selecting features: 100%|██████████| 51/51 [00:10<00:00,  4.85it/s]
Selecting label: 100%|██████████| 51/51 [00:00<00:00, 74.94it/s]
Selecting features: 100%|██████████| 51/51 [00:09<00:00,  5.38it/s]
Selecting label: 100%|██████████| 51/51 [00:00<00:00, 95.79it/s]
Selecting features: 100%|██████████| 51/51 [00:09<00:00,  5.49it/s]
Selecting label: 100%|██████████| 51/51 [00:00<00:00, 88.47it/s]
Selecting features: 100%|██████████| 51/51 [00:08<00:00,  5.75it/s]
Selecting label: 100%|██████████| 51/51 [00:00<00:00, 89.49it/s]


In [8]:
print(trainX_vaep.shape, trainY_vaep.shape)
print(trainX_vaep.columns)
print(trainY_vaep.columns)

(113010, 124) (113010, 2)
Index(['actiontype_a0', 'actiontype_pass_a0', 'actiontype_cross_a0',
       'actiontype_throw_in_a0', 'actiontype_freekick_crossed_a0',
       'actiontype_freekick_short_a0', 'actiontype_corner_crossed_a0',
       'actiontype_corner_short_a0', 'actiontype_take_on_a0',
       'actiontype_offensive_foul_a0',
       ...
       'dist_dfd9_a0', 'angle_dfd9_a0', 'atk10_x_a0', 'atk10_y_a0',
       'dist_atk10_a0', 'angle_atk10_a0', 'dfd10_x_a0', 'dfd10_y_a0',
       'dist_dfd10_a0', 'angle_dfd10_a0'],
      dtype='object', length=124)


In [9]:
print(trainX_vdep.shape, trainY_vdep.shape)
print(trainX_vdep.columns)
print(trainY_vdep.columns)

(113010, 99) (113010, 2)
Index(['bodypart_foot_a0', 'bodypart_head_a0', 'bodypart_other_a0',
       'bodypart_head/other_a0', 'start_x_a0', 'start_y_a0', 'end_x_a0',
       'end_y_a0', 'dx_a0', 'dy_a0', 'movement_a0', 'atk0_x_a0', 'atk0_y_a0',
       'dist_atk0_a0', 'angle_atk0_a0', 'dfd0_x_a0', 'dfd0_y_a0',
       'dist_dfd0_a0', 'angle_dfd0_a0', 'atk1_x_a0', 'atk1_y_a0',
       'dist_atk1_a0', 'angle_atk1_a0', 'dfd1_x_a0', 'dfd1_y_a0',
       'dist_dfd1_a0', 'angle_dfd1_a0', 'atk2_x_a0', 'atk2_y_a0',
       'dist_atk2_a0', 'angle_atk2_a0', 'dfd2_x_a0', 'dfd2_y_a0',
       'dist_dfd2_a0', 'angle_dfd2_a0', 'atk3_x_a0', 'atk3_y_a0',
       'dist_atk3_a0', 'angle_atk3_a0', 'dfd3_x_a0', 'dfd3_y_a0',
       'dist_dfd3_a0', 'angle_dfd3_a0', 'atk4_x_a0', 'atk4_y_a0',
       'dist_atk4_a0', 'angle_atk4_a0', 'dfd4_x_a0', 'dfd4_y_a0',
       'dist_dfd4_a0', 'angle_dfd4_a0', 'atk5_x_a0', 'atk5_y_a0',
       'dist_atk5_a0', 'angle_atk5_a0', 'dfd5_x_a0', 'dfd5_y_a0',
       'dist_dfd5_a0', 'angle_

## Train a model and Evaluate the model

In [10]:
from sklearn.metrics import brier_score_loss, confusion_matrix, f1_score, log_loss, roc_auc_score

def evaluate(y, y_hat):
    p = sum(y) / len(y)
    base = [p] * len(y)
    brier = brier_score_loss(y, y_hat)
    print(f"  Brier score: %.5f (%.5f)" % (brier, brier / brier_score_loss(y, base)))
    ll = log_loss(y, y_hat)
    print(f"  log loss score: %.5f (%.5f)" % (ll, ll / log_loss(y, base)))
    print(f"  ROC AUC: %.5f" % roc_auc_score(y, y_hat))
    y_hat_bi = y_hat.round()
    print(f"F1 score:{f1_score(y, y_hat_bi)}")
    print(confusion_matrix(y, y_hat_bi))

In [11]:
# 3. train classifiers F(X) = Y
import xgboost

Y_hat = pd.DataFrame()
models = {}
for col in list(trainY_vaep.columns):
    model = xgboost.XGBClassifier(
        n_estimators=50,
        max_depth=3,
        n_jobs=-3,
        verbosity=1,
        random_state=0,
        enable_categorical=True
        )
    model.fit(trainX_vaep, trainY_vaep[col])
    models[col] = model
    Y_hat[col] = model.predict(trainX_vaep)
    print(f"### Y: {col} ###")
    evaluate(testY_vaep[col], Y_hat[col])

for col in list(trainY_vdep.columns):
    model = xgboost.XGBClassifier(
        n_estimators=50,
        max_depth=3, 
        n_jobs=-3, 
        verbosity=1, 
        random_state=0,
        enable_categorical=True
        )
    model.fit(trainX_vdep, trainY_vdep[col])
    models[col] = model
    Y_hat[col] = model.predict(trainX_vdep)
    print(f"### Y: {col} ###")
    evaluate(testY_vdep[col], Y_hat[col])

### Y: scores ###
  Brier score: 0.01014 (0.97085)
  log loss score: 0.36551 (6.24333)
  ROC AUC: 0.52343
F1 score:0.0890302066772655
[[111808      9]
 [  1137     56]]
### Y: concedes ###
  Brier score: 0.00184 (0.89455)
  log loss score: 0.06634 (4.47941)
  ROC AUC: 0.56864
F1 score:0.23529411764705882
[[112770      7]
 [   201     32]]
### Y: gains ###


ValueError: y_proba contains values greater than 1.

## Save predictions

In [None]:
# get rows with game id per action
A = []
for game_id in tqdm.tqdm(games.game_id, "Loading game ids"):
    Ai = pd.read_hdf(spadl_h5, f"actions/game_{game_id}")
    A.append(Ai[["game_id"]])
A = pd.concat(A)
A = A.reset_index(drop=True)

# concatenate action game id rows with predictions and save per game
grouped_predictions = pd.concat([A, Y_hat], axis=1).groupby("game_id")
with pd.HDFStore(predictions_h5) as predictionstore:
    for k, df in tqdm.tqdm(grouped_predictions, desc="Saving predictions per game"):
        df = df.reset_index(drop=True)
        predictionstore.put(f"game_{int(k)}", df[Y_hat.columns])

Loading game ids: 100%|██████████| 51/51 [00:03<00:00, 14.95it/s]
Saving predictions per game: 100%|██████████| 51/51 [00:00<00:00, 131.16it/s]


In [None]:
print(112869+1+105+106)
print(112208+6+204+29)

113081
112447
