In [15]:
from socceraction.data.wyscout import PublicWyscoutLoader
from socceraction.spadl.wyscout import convert_to_actions as convert_to_actions_wyscout
from socceraction.spadl.statsbomb import convert_to_actions as convert_to_actions_statsbomb
from socceraction.data.opta import OptaLoader
from socceraction.data.statsbomb import StatsBombLoader
from socceraction.spadl.config import actiontypes, bodyparts
import socceraction.spadl as spadl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, brier_score_loss, log_loss, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, balanced_accuracy_score
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier
import math
import pickle
import os
from name_matching.name_matcher import NameMatcher
from rapidfuzz import fuzz
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import r_regression, SelectKBest, chi2, mutual_info_classif, SequentialFeatureSelector, RFECV, SelectFromModel, mutual_info_regression, f_regression
from scipy.stats import pearsonr, chisquare
from mrmr import mrmr_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import LinearSVR, SVC
from sklearn.linear_model import Lasso, LogisticRegression
import socceraction.vaep.labels as lab
import socceraction.vaep.features as fs
import socceraction.spadl as spadl
from datetime import timedelta
import socceraction.vaep.formula as vaepformula

In [16]:
api = StatsBombLoader(root="data/statsbomb", getter="local")

In [17]:
# Competition which has 360 files data (UEFA Euro 2020 (55, 43) and FIFA World Cup 2022 (43, 106))
MAPS_COMPETITION_CONTAINS_360_DATA = {
    (55, 43) : "UEFA Euro 2020",
    (43, 106) : "FIFA World Cup 2022"
}
MAPS_PLAYER_ID_WITH_VAEP_VALUE = {
    (55, 43) : {},
    (43, 106) : {}
}
POSSIBLE_SCENARIO_OPTION = ["best"]
MAPS_PROPOSED_PLAYER_RANKING_GENERATOR = {
    "xpass" : {
        "model_directory" : "data/model_xpass/",
        "type_ids_spadl" : [0,1],
        "action_name" : "pass"
    },
    "xgoal" : {
        "model_directory" : "data/model_xgoal/",
        "type_ids_spadl" : [11, 12, 13],
        "action_name" : "goal"
    },
    "xdribble" : {
        "model_directory" : "data/model_xdribble/",
        "type_ids_spadl" : [7, 21],
        "action_name" : "dribble"
    }
}
DIRECTORY_FINAL_PLAYERS_CSV_DATAS = "data/players_skill_dataset/final_players_skill_dataset.csv"
DIRECTORY_PLAYERS_DATA = "data/players_skill_dataset/"

In [18]:
# VAEP Processing Algorithm
def process_vaep_algorithm(spadl_game_df, referenced_team_id):
    # 1. convert actions to game states
    home_team_id = referenced_team_id
    gamestates = fs.gamestates(spadl_game_df, 3)
    gamestates = fs.play_left_to_right(gamestates, home_team_id)

    # 2. compute features
    xfns = [
        fs.actiontype, 
        fs.result, 
        fs.bodypart, 
        fs.time, 
        fs.startlocation, 
        fs.endlocation,
        fs.startpolar,
        fs.endpolar,
        fs.movement,
        fs.team,
        fs.time_delta,
        fs.space_delta,
        fs.goalscore
    ]
    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)

    # 3. compute labels
    yfns = [lab.scores, lab.concedes]
    Y = pd.concat([fn(spadl_game_df) for fn in yfns], axis=1)

    # 4. load or train models
    Y_hat = pd.DataFrame()
    models = {}
    for col in list(Y.columns):
        model = XGBClassifier(objective="binary:logistic")
        model.fit(X, Y[col])
        models[col] = model

    # 5. predict scoring and conceding probabilities for each game state
    for col in list(Y.columns):
        Y_hat[col] = [p[1] for p in models[col].predict_proba(X)]
    
    # 6. compute VAEP value
    values = vaepformula.value(spadl_game_df, Y_hat["scores"], Y_hat["concedes"])
    
    return values['vaep_value']

In [19]:
def initialize_maps_player_id_with_vaep_value():
    for competition_id, season_id in list(MAPS_COMPETITION_CONTAINS_360_DATA.keys()):
        games_df = api.games(competition_id, season_id)
        for _, row in games_df.iterrows():
            players_df = api.players(row['game_id'])
            for _, row_player in players_df.iterrows():
                player_ids = list(MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)].keys())
                if row_player['player_id'] not in player_ids:
                    MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)][row_player['player_id']] = {
                        "num_minutes_played" : 0,
                        "total_vaep_value" : 0,
                        "total_contribution_score" : 0
                    }
                else:
                    MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)] \
                        [row_player['player_id']]["num_minutes_played"] += row_player['minutes_played']

In [20]:
# COMMENT IT IF VAEP PLAYER RANKING ALREADY LOADED
initialize_maps_player_id_with_vaep_value()
for competition_id, season_id in list(MAPS_PLAYER_ID_WITH_VAEP_VALUE.keys()):
    print("Competition Name : ", MAPS_COMPETITION_CONTAINS_360_DATA[(competition_id, season_id)])
    for player_id in list(MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)].keys()):
        print(f'Player ID : {player_id}')
        print(f'Num Minutes Played : {MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)][player_id]["num_minutes_played"]}')
    print("=================================================================")

Competition Name :  UEFA Euro 2020
Player ID : 3477
Num Minutes Played : 413
Player ID : 3957
Num Minutes Played : 293
Player ID : 4353
Num Minutes Played : 541
Player ID : 5199
Num Minutes Played : 399
Player ID : 5203
Num Minutes Played : 289
Player ID : 5208
Num Minutes Played : 70
Player ID : 5211
Num Minutes Played : 463
Player ID : 6685
Num Minutes Played : 158
Player ID : 6720
Num Minutes Played : 227
Player ID : 6748
Num Minutes Played : 284
Player ID : 6765
Num Minutes Played : 236
Player ID : 6766
Num Minutes Played : 225
Player ID : 6840
Num Minutes Played : 230
Player ID : 6892
Num Minutes Played : 286
Player ID : 11748
Num Minutes Played : 541
Player ID : 16532
Num Minutes Played : 316
Player ID : 30486
Num Minutes Played : 541
Player ID : 3533
Num Minutes Played : 295
Player ID : 5537
Num Minutes Played : 181
Player ID : 5538
Num Minutes Played : 30
Player ID : 5543
Num Minutes Played : 308
Player ID : 5544
Num Minutes Played : 375
Player ID : 5545
Num Minutes Played : 36

In [21]:
def construct_maps_player_id_with_vaep_value():
    for competition_id, season_id in list(MAPS_COMPETITION_CONTAINS_360_DATA.keys()):
        games_df = api.games(competition_id, season_id)
        for _, row in games_df.iterrows():
            # Looking for player ids for each home team id and away team id
            players_df = api.players(row['game_id'])
            list_player_ids_home_team = pd.Series(players_df[players_df['team_id'] == row['home_team_id']]['player_id']).tolist()
            list_player_ids_away_team = pd.Series(players_df[players_df['team_id'] == row['away_team_id']]['player_id']).tolist()
            # Loads event value
            this_game_events_df = api.events(row['game_id'], load_360=True)
            # Loads vaep value for home team id
            this_game_events_df_home_team = spadl.add_names(convert_to_actions_statsbomb(this_game_events_df, row['home_team_id']))
            this_game_events_df_home_team['vaep_value'] = process_vaep_algorithm(this_game_events_df_home_team, row['home_team_id'])
            player_id_and_vaep_value_df = this_game_events_df_home_team.groupby('player_id')['vaep_value'].sum()
            for player_id in list_player_ids_home_team:
                vaep_value = player_id_and_vaep_value_df[player_id] if player_id in player_id_and_vaep_value_df.index else 0
                MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)] \
                    [player_id]["total_vaep_value"] += vaep_value
            # Loads vaep value for away team id
            this_game_events_df_away_team = spadl.add_names(convert_to_actions_statsbomb(this_game_events_df, row['away_team_id']))
            this_game_events_df_away_team['vaep_value'] = process_vaep_algorithm(this_game_events_df_away_team, row['away_team_id'])
            player_id_and_vaep_value_df = this_game_events_df_away_team.groupby('player_id')['vaep_value'].sum()
            for player_id in list_player_ids_away_team:
                vaep_value = player_id_and_vaep_value_df[player_id] if player_id in player_id_and_vaep_value_df.index else 0
                MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)] \
                    [player_id]["total_vaep_value"] += vaep_value

In [22]:
# COMMENT IT IF VAEP PLAYER RANKING ALREADY LOADED
# construct_maps_player_id_with_vaep_value()
# for competition_id, season_id in list(MAPS_PLAYER_ID_WITH_VAEP_VALUE.keys()):
#     print("Competition Name : ", MAPS_COMPETITION_CONTAINS_360_DATA[(competition_id, season_id)])
#     for player_id in list(MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)].keys()):
#         print(f'Player ID : {player_id}')
#         print(f'Total VAEP Value : {MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)][player_id]["total_vaep_value"]}')
#     print("=================================================================")

In [23]:
def export_players_vaep_final_ranking_per_competition():
    final_player_skills_dataset = pd.read_csv(DIRECTORY_FINAL_PLAYERS_CSV_DATAS)
    for competition_id, season_id in list(MAPS_PLAYER_ID_WITH_VAEP_VALUE.keys()):
        list_player_ids_this_competition = list(MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)].keys())
        player_datas_this_competition = final_player_skills_dataset[final_player_skills_dataset['player_id'].isin(list_player_ids_this_competition)]
        player_datas_this_competition['statistic_minutes_played'] = np.zeros(player_datas_this_competition.shape[0])
        player_datas_this_competition['statistic_vaep_value'] = np.zeros(player_datas_this_competition.shape[0])
        player_datas_this_competition['statistic_rating_vaep_value'] = np.zeros(player_datas_this_competition.shape[0])
        for player_id in list_player_ids_this_competition:
            num_minutes_played = MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)][player_id]['num_minutes_played']
            total_vaep_value = MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)][player_id]['total_vaep_value']
            statistic_rating_vaep_value = 0 if (num_minutes_played == 0) or (total_vaep_value == 0) else (90 / num_minutes_played) * total_vaep_value

            player_datas_this_competition.loc[player_datas_this_competition['player_id'] == player_id, ['statistic_minutes_played']] \
                = num_minutes_played
            player_datas_this_competition.loc[player_datas_this_competition['player_id'] == player_id, ['statistic_vaep_value']] \
                = total_vaep_value
            player_datas_this_competition.loc[player_datas_this_competition['player_id'] == player_id, ['statistic_rating_vaep_value']] \
                = statistic_rating_vaep_value
                
        player_datas_this_competition.sort_values(by="statistic_rating_vaep_value", ascending=False, inplace=True)
        player_datas_this_competition.reset_index(drop=True, inplace=True)
        filename_vaep_ranking_this_competition = f'vaep_players_ranking_for_competition_{MAPS_COMPETITION_CONTAINS_360_DATA[(competition_id, season_id)]}.csv'
        player_datas_this_competition.to_csv(DIRECTORY_PLAYERS_DATA + filename_vaep_ranking_this_competition)
            
            

In [24]:
# COMMENT IT IF VAEP PLAYER RANKING ALREADY LOADED
# export_players_vaep_final_ranking_per_competition()

In [25]:
# Construct dataframe containing competition name and related event id 
COLUMNS_COMPETITION_NAME_AND_EVENT_ID_RESULT = [
    "competition_id",
    "season_id",
    "competition_name",
    "event_id_related"
]

def construct_df_containing_competition_name_and_event_id():
    for competition_id, season_id in list(MAPS_COMPETITION_CONTAINS_360_DATA.keys()):
        games_df = api.games(competition_id, season_id)
        competition_name = MAPS_COMPETITION_CONTAINS_360_DATA[(competition_id, season_id)]
        list_game_events_df = []
        for _, row in games_df.iterrows():
            this_game_events_df = api.events(row['game_id'], load_360=True)
            this_game_events_df = spadl.add_names(convert_to_actions_statsbomb(this_game_events_df, row['home_team_id']))
            list_game_events_df.append(this_game_events_df)
        all_games_df_this_competition = pd.concat(list_game_events_df)
        for key in list(MAPS_PROPOSED_PLAYER_RANKING_GENERATOR.keys()):
            list_type_ids = MAPS_PROPOSED_PLAYER_RANKING_GENERATOR[key]['type_ids_spadl']
            model_directory = MAPS_PROPOSED_PLAYER_RANKING_GENERATOR[key]['model_directory']
            all_games_df_this_type = all_games_df_this_competition[all_games_df_this_competition['type_id'].isin(list_type_ids)]
            unique_original_event_ids = all_games_df_this_type['original_event_id'].unique()

            empty_df_result = pd.DataFrame(columns=COLUMNS_COMPETITION_NAME_AND_EVENT_ID_RESULT, index=[0])
            for event_id in unique_original_event_ids:
                maps_new_row = {
                    "competition_id" : competition_id,
                    "season_id" : season_id,
                    "competition_name" : competition_name,
                    "event_id_related" : event_id
                }  
                new_row = pd.DataFrame(maps_new_row, index=[0])
                empty_df_result = pd.concat([new_row, empty_df_result.loc[:]]).reset_index(drop=True)
            empty_df_result.dropna(inplace=True)
            filename_result = f'{key}_map_competition_and_event_id_for_{competition_name}.csv'
            empty_df_result.to_csv(model_directory + filename_result)

In [26]:
# COMMENT IT IF ALREADY CONSTRUCTED !!
# construct_df_containing_competition_name_and_event_id()

In [27]:
def construct_final_ranking_players_proposed_method():
    final_player_skills_dataset = pd.read_csv(DIRECTORY_FINAL_PLAYERS_CSV_DATAS)
    for scenario_opt in POSSIBLE_SCENARIO_OPTION:
        for competition_id, season_id in list(MAPS_PLAYER_ID_WITH_VAEP_VALUE.keys()):
            competition_name = MAPS_COMPETITION_CONTAINS_360_DATA[(competition_id, season_id)]
            list_player_ids_this_competition = list(MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)].keys())
            for key in list(MAPS_PROPOSED_PLAYER_RANKING_GENERATOR.keys()):
                action_name = MAPS_PROPOSED_PLAYER_RANKING_GENERATOR[key]["action_name"]
                model_directory = MAPS_PROPOSED_PLAYER_RANKING_GENERATOR[key]["model_directory"]
                filename_ranking_this_directory = f'{key}_contribution_score_player_{scenario_opt}_scenario_{competition_name}.csv'
                player_score_contributions_df = pd.read_csv(model_directory + filename_ranking_this_directory)
                for player_id in list_player_ids_this_competition:
                    this_player_score_df = player_score_contributions_df.loc[player_score_contributions_df['player_id'] == player_id, [f'statistic_{action_name}_contribution']]
                    if (not this_player_score_df.empty):
                        MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)][player_id]["total_contribution_score"] += this_player_score_df[f'statistic_{action_name}_contribution'].iloc[0]
            
            player_datas_this_competition = final_player_skills_dataset[final_player_skills_dataset['player_id'].isin(list_player_ids_this_competition)]
            player_datas_this_competition['statistic_minutes_played'] = np.zeros(player_datas_this_competition.shape[0])
            player_datas_this_competition['statistic_contribution_value'] = np.zeros(player_datas_this_competition.shape[0])
            for player_id in list_player_ids_this_competition:
                num_minutes_played = MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)][player_id]['num_minutes_played']
                total_contribution_value = MAPS_PLAYER_ID_WITH_VAEP_VALUE[(competition_id, season_id)][player_id]['total_contribution_score']
                statistic_rating_contribution_value = 0 if (num_minutes_played == 0) or (total_contribution_value == 0) else (90 / num_minutes_played) * total_contribution_value

                player_datas_this_competition.loc[player_datas_this_competition['player_id'] == player_id, ['statistic_minutes_played']] \
                    = num_minutes_played
                player_datas_this_competition.loc[player_datas_this_competition['player_id'] == player_id, ['statistic_contribution_value']] \
                    = total_contribution_value
                player_datas_this_competition.loc[player_datas_this_competition['player_id'] == player_id, ['statistic_rating_contribution_value']] \
                    = statistic_rating_contribution_value
                    
            player_datas_this_competition.sort_values(by="statistic_rating_contribution_value", ascending=False, inplace=True)
            player_datas_this_competition.reset_index(drop=True, inplace=True)
            filename_vaep_ranking_this_competition = f'proposed_players_ranking_{scenario_opt}_scenario_for_competition_{competition_name}.csv'
            player_datas_this_competition.to_csv(DIRECTORY_PLAYERS_DATA + filename_vaep_ranking_this_competition)



In [28]:
construct_final_ranking_players_proposed_method()