# Compute VAEP Features for Players (Section 5.3)
Uses code from the socceraction library: https://socceraction.readthedocs.io/en/latest/

In [None]:
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab
import socceraction.vaep.formula as vaepformula
import pandas as pd
import numpy as np
import xgboost
from sklearn.metrics import brier_score_loss, roc_auc_score, log_loss

In [None]:
# Load the games, teams and players overview tables.
games = pd.read_csv('overview_data/games_data.csv')
teams = pd.read_csv('overview_data/teams.csv')
players_df = pd.read_csv('overview_data/all_players.csv')
games['game_date'] = pd.to_datetime(games['game_date'])

# Attach the team table to every game twice: once keyed on the home team id
# and once on the away team id. The id column is temporarily renamed to
# 'team_id' so it can be used as the join key, then renamed back.
games_with_team_names = games
for _id_col, _name_col in (
    ('home_team_id', 'home_team_name'),
    ('away_team_id', 'away_team_name'),
):
    games_with_team_names = (
        games_with_team_names
        .rename(columns={_id_col: 'team_id'})
        .merge(teams, how='left', on='team_id')
        .rename(columns={'team_id': _id_col, 'team_name': _name_col})
    )

## Set up the VAEP model to compute the VAEP values of players

In [None]:
# Feature transformers applied to each game's game states; their outputs are
# concatenated column-wise into the feature matrix X.
xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    fs.bodypart,
    fs.bodypart_onehot,
    fs.result,
    fs.result_onehot,
    fs.goalscore,
    fs.startlocation,
    fs.endlocation,
    fs.movement,
    fs.space_delta,
    fs.startpolar,
    fs.endpolar,
    fs.team,
    fs.time,
    fs.time_delta,
]

# Columns removed from the events table before it is handed to socceraction.
_non_action_cols = [
    'player_name',
    'is_starter',
    'starting_position_id',
    'starting_position_name',
    'minutes_played',
    'team_name',
]

for i, game in games_with_team_names.iterrows():
    game_prefix = 'game_data/' + str(game.game_id) + '/' + str(game.game_id)
    events = pd.read_csv(game_prefix + '_events.csv')
    named_actions = spadl.add_names(events.drop(_non_action_cols, axis=1))
    # The second argument (3) is the number of actions per game state.
    gamestates = fs.gamestates(named_actions, 3)
    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)
    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)
    # NOTE: saving is disabled, so X is discarded on every iteration. A later
    # cell reads '<game_id>vaepX.csv'; re-enable the line below to regenerate it.
    #X.to_csv('game_data/'+str(game.game_id)+'/'+str(game.game_id)+'vaepX.csv', index=False)

In [None]:
# Label functions applied to each game's actions; their outputs are
# concatenated column-wise into the label matrix Y.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

# Columns removed from the events table before it is handed to socceraction.
_non_action_cols = [
    'player_name',
    'is_starter',
    'starting_position_id',
    'starting_position_name',
    'minutes_played',
    'team_name',
]

for i, game in games_with_team_names.iterrows():
    events = pd.read_csv('game_data/'+str(game.game_id)+'/'+str(game.game_id)+'_events.csv')
    # Build the named actions once per game rather than once per label
    # function: the drop + add_names result is the same for every fn.
    named_actions = spadl.add_names(events.drop(_non_action_cols, axis=1))
    Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
    # NOTE: saving is disabled, so Y is discarded on every iteration. A later
    # cell reads '<game_id>vaepY.csv'; re-enable the line below to regenerate it.
    #Y.to_csv('game_data/'+str(game.game_id)+'/'+str(game.game_id)+'vaepY.csv', index=False)

In [None]:
# Number of actions per game state whose feature columns are selected for
# the model; Xcols holds the matching column names for the transformers in xfns.
nb_prev_actions = 1
Xcols = fs.feature_column_names(xfns, nb_prev_actions=nb_prev_actions)

def getXY(games, Xcols):
    """Load the stored VAEP features and labels for the given games.

    For every game id in ``games`` this reads
    ``game_data/<id>/<id>vaepX.csv`` and ``game_data/<id>/<id>vaepY.csv``
    (paths relative to the working directory) and stacks the per-game
    frames in the order the games appear.

    Parameters
    ----------
    games : pandas.DataFrame
        Must contain a ``game_id`` column. Only these games are loaded.
    Xcols : list of str
        Feature columns to keep from each ``vaepX.csv`` file.

    Returns
    -------
    X : pandas.DataFrame
        The selected feature columns of all games, with a fresh 0..n-1 index.
    Y : pandas.DataFrame
        The ``scores`` and ``concedes`` label columns of all games, with a
        fresh 0..n-1 index.
    """
    # Read ids straight from the column: no per-row Series is built, and the
    # id keeps its own dtype regardless of the other columns in ``games``.
    game_ids = [str(game_id) for game_id in games["game_id"]]

    def _load(suffix, cols):
        # Stack the selected columns of one per-game CSV kind across all games.
        frames = [
            pd.read_csv('game_data/' + gid + '/' + gid + suffix)[cols]
            for gid in game_ids
        ]
        return pd.concat(frames).reset_index(drop=True)

    # 1. Select the feature columns X.
    X = _load('vaepX.csv', Xcols)

    # 2. Select the label columns Y.
    Ycols = ["scores", "concedes"]
    Y = _load('vaepY.csv', Ycols)
    return X, Y

# Load features and labels for all games and show which columns were selected.
X, Y = getXY(games_with_team_names, Xcols)
for _label, _frame in (("X:", X), ("Y:", Y)):
    print(_label, list(_frame.columns))

In [None]:
# Empty frame that will receive one column of predicted probabilities per label.
Y_hat = pd.DataFrame()

# One classifier per label column of Y, keyed by the label name.
models = {}
_xgb_params = dict(n_estimators=50, max_depth=3, n_jobs=-3, verbosity=1)
for col in Y.columns:
    model = xgboost.XGBClassifier(**_xgb_params)
    model.fit(X, Y[col])
    models[col] = model

In [None]:
# NOTE(review): the models are evaluated on the same data they were fitted
# on, so the scores printed below are in-sample scores.
testX, testY = X, Y


def evaluate(y, y_hat):
    """Print the Brier score, log loss and ROC AUC of ``y_hat`` against ``y``.

    For the Brier score and the log loss, the value in parentheses is the
    score divided by the score of a baseline that predicts the mean of ``y``
    for every sample.

    Parameters
    ----------
    y : sequence of 0/1 labels
    y_hat : sequence of predicted probabilities for the positive class
    """
    p = sum(y) / len(y)
    base = [p] * len(y)
    brier = brier_score_loss(y, y_hat)
    print(f"  Brier score: {brier:.5f} ({brier / brier_score_loss(y, base):.5f})")
    ll = log_loss(y, y_hat)
    print(f"  log loss score: {ll:.5f} ({ll / log_loss(y, base):.5f})")
    print(f"  ROC AUC: {roc_auc_score(y, y_hat):.5f}")


for col in testY.columns:
    # Column 1 of predict_proba is the probability of the positive class.
    Y_hat[col] = models[col].predict_proba(testX)[:, 1]
    print(f"### Y: {col} ###")
    evaluate(testY[col], Y_hat[col])

In [None]:
# Collect the game_id of every action, reading the events files in the same
# game order as games_with_team_names so the rows line up with Y_hat.
# Only the game_id column is parsed; nothing else is needed here.
A = pd.concat(
    [
        pd.read_csv(
            'game_data/' + str(game_id) + '/' + str(game_id) + '_events.csv',
            usecols=["game_id"],
        )
        for game_id in games_with_team_names["game_id"]
    ]
).reset_index(drop=True)

# concatenate action game id rows with predictions and save per game
grouped_predictions = pd.concat([A, Y_hat], axis=1).groupby("game_id")
# Iterate the groupby directly: the previous tqdm wrapper raised NameError
# because tqdm is not imported. The group key is the game id.
for gid, df in grouped_predictions:
    df = df.reset_index(drop=True)
    # NOTE: saving is disabled. The next cell reads '<game_id>vaep_preds.csv';
    # re-enable the line below to regenerate those files.
    #df[Y_hat.columns].to_csv('game_data/'+str(gid)+'/'+str(gid)+'vaep_preds.csv',index=False)

In [None]:
# Build one table with every action of every game, its predicted
# scoring/conceding probabilities and the resulting VAEP values.
_per_game = []
for i, game in games_with_team_names.iterrows():
    _prefix = 'game_data/' + str(game.game_id) + '/' + str(game.game_id)
    actions = pd.read_csv(_prefix + '_events.csv')
    preds = pd.read_csv(_prefix + 'vaep_preds.csv')
    # NOTE(review): every non-zero result_id is labelled 'success' here;
    # confirm that result_id only takes the values 0 and 1 in these files.
    is_fail = actions['result_id'] == 0
    actions['result_name'] = np.where(is_fail, 'fail', 'success')
    values = vaepformula.value(actions, preds.scores, preds.concedes)
    _per_game.append(pd.concat([actions, preds, values], axis=1))

A = pd.concat(_per_game)
A = A.sort_values(["game_id", "period_id", "time_seconds"])
A = A.reset_index(drop=True)
A.columns

## Compute and store VAEP values for each player in each real-world game

In [None]:
# Helper column so that summing per player also yields the number of actions.
A["count"] = 1

_value_cols = ["vaep_value", "offensive_value", "defensive_value"]
# (output column, source column) pairs for the per-minute rates.
_per_minute = (
    ('vaep_pmin', 'vaep_value'),
    ('off_vaep_pmin', 'offensive_value'),
    ('def_vaep_pmin', 'defensive_value'),
)

for i, game in games_with_team_names.iterrows():
    _prefix = 'game_data/' + str(game.game_id) + '/' + str(game.game_id)
    lineup = pd.read_csv(_prefix + '_lineup.csv')

    # Sum the VAEP values (and the action count) of each player in this game.
    game_actions = A[A['game_id'] == game.game_id]
    playersR = game_actions[["player_id"] + _value_cols + ["count"]]
    playersR = playersR.groupby(["player_id"]).sum().reset_index()

    # Add player names, then keep only the id, name and value columns.
    playersR = playersR.merge(players_df[["player_id", "player_name"]], how="left")
    playersR = playersR[["player_id", "player_name"] + _value_cols]

    # Join the totals onto the lineup (on the columns both frames share) and
    # normalise each total by the minutes played.
    lineup = lineup.merge(playersR, how='left')
    for _out_col, _src_col in _per_minute:
        lineup[_out_col] = lineup[_src_col] / lineup['minutes_played']

    # Written with pandas' default settings, so the index is saved as well.
    lineup.to_csv(_prefix + '_lineup_vaep.csv')