In [1]:
import os
import sys
import random
import warnings
import tqdm
import numpy as np
import pandas as pd
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell is executed, so edits to locally imported packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Directory the notebook is executed from. `__file__` is not defined inside a
# notebook kernel, so the current working directory is used explicitly.
current_directory = os.path.abspath(os.curdir)

# Parent of the working directory.
parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))

# Add the parent directory to Python's module search path. The membership
# check keeps the cell idempotent: re-running it does not append duplicates.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

import socceraction.spadl as spadl
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab

In [3]:
# Seed both global random number generators (stdlib and NumPy) with one value.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

## Select data

In [4]:
# Identifier of the experiment run; it is interpolated into the data folder path.
# Digits are grouped for readability only (presumably YYYY_MM_DD_HHMM — confirm);
# the integer value is unchanged.
experiment_date = 2024_06_14_1613

In [5]:
# Folder holding every HDF5 file of the selected experiment.
datafolder = f"../data-euro2020/{experiment_date}"

# Paths of the SPADL input store and of the feature/label output stores.
spadl_h5, features_h5, labels_h5 = (
    os.path.join(datafolder, filename)
    for filename in ("spadl-statsbomb.h5", "features.h5", "labels.h5")
)

In [6]:
# Load the table stored under the "games" key of the SPADL file and report its size.
games = pd.read_hdf(spadl_h5, key="games")
nb_games = len(games)
print("nb of games:", nb_games)

nb of games: 51


## Compute features

In [7]:
# Feature transformers. Their outputs are concatenated column-wise in list
# order, so the order below determines the column layout of the feature table.
base_xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    fs.bodypart,
    fs.bodypart_onehot,
    fs.result,
    fs.result_onehot,
    fs.goalscore,
    fs.startlocation,
    fs.endlocation,
    fs.movement,
    fs.space_delta,
    fs.startpolar,
    fs.endpolar,
    fs.team,
    fs.time,
    fs.time_delta,
]

# Transformers flagged as additions by the notebook author.
added_xfns = [
    fs.away_team,
    fs.player_loc_dist,
    fs.gain,
    fs.penetration,
]

xfns = base_xfns + added_xfns

# Second argument passed to fs.gamestates (named nb_prev_actions in upstream
# socceraction — confirm the meaning for this fork).
nb_prev_actions = 3

# The SPADL store is only read in this loop, so it is opened read-only
# (mode="r"); the default append mode would open the source file writable.
# The feature store keeps the default mode so per-game tables can be written.
with pd.HDFStore(spadl_h5, mode="r") as spadlstore, pd.HDFStore(features_h5) as featurestore:
    for game in tqdm.tqdm(list(games.itertuples()), desc=f"Generating and storing features in {features_h5}"):
        actions = spadlstore[f"actions/game_{game.game_id}"]
        gamestates = fs.gamestates(spadl.add_names(actions), nb_prev_actions)
        # NOTE(review): I believe the upstream socceraction example applies
        # fs.play_left_to_right(gamestates, game.home_team_id) at this point;
        # it is not applied here — confirm that is intentional.
        # One feature table per game: transformer outputs joined column-wise.
        X = pd.concat([fn(gamestates) for fn in xfns], axis=1)
        featurestore.put(f"game_{game.game_id}", X, format='table')

Generating and storing features in ../data-euro2020/202406141613/features.h5: 100%|██████████| 51/51 [01:13<00:00,  1.45s/it]


## Compute labels

In [8]:
# Label functions. Their outputs are concatenated column-wise in list order,
# so the order below determines the column layout of the label table.
yfns = [
    lab.gains,
    lab.effective_attack,
    lab.scores,
    lab.concedes,
]

# The SPADL store is only read in this loop, so it is opened read-only
# (mode="r"); the default append mode would open the source file writable.
# The label store keeps the default mode so per-game tables can be written.
with pd.HDFStore(spadl_h5, mode="r") as spadlstore, pd.HDFStore(labels_h5) as labelstore:
    for game in tqdm.tqdm(list(games.itertuples()), desc=f"Computing and storing labels in {labels_h5}"):
        actions = spadlstore[f"actions/game_{game.game_id}"]
        # spadl.add_names(actions) is evaluated once per label function, so
        # every function receives its own result rather than a shared frame.
        Y = pd.concat([fn(spadl.add_names(actions)) for fn in yfns], axis=1)
        labelstore.put(f"game_{game.game_id}", Y, format='table')

Computing and storing labels in ../data-euro2020/202406141613/labels.h5: 100%|██████████| 51/51 [00:17<00:00,  2.96it/s]
