In [271]:
from socceraction.data.wyscout import PublicWyscoutLoader
from socceraction.spadl.wyscout import convert_to_actions
from socceraction.data.opta import OptaLoader
from socceraction.data.statsbomb import StatsBombLoader
from socceraction.spadl.config import actiontypes, bodyparts
import socceraction.spadl as spadl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error
from xgboost import XGBClassifier, XGBRegressor
import math
import pickle
import os
from name_matching.name_matcher import NameMatcher
from rapidfuzz import fuzz

In [272]:
# Data loaders used by the notebook: the public Wyscout dataset is read from
# data/wyscout and the Opta dataset from data/opta.
api_wyscout = PublicWyscoutLoader(root="data/wyscout")
api_opta = OptaLoader(root="data/opta")
# The StatsBomb loader is currently disabled.
# api_statsbomb = StatsBombLoader(root="data/statsbomb")

In [273]:
# Show every SPADL action type next to its position in `actiontypes`.
for action_type_id, action_type_name in enumerate(actiontypes):
    print(f'action_id : {action_type_id}   action_name : {action_type_name}')

action_id : 0   action_name : pass
action_id : 1   action_name : cross
action_id : 2   action_name : throw_in
action_id : 3   action_name : freekick_crossed
action_id : 4   action_name : freekick_short
action_id : 5   action_name : corner_crossed
action_id : 6   action_name : corner_short
action_id : 7   action_name : take_on
action_id : 8   action_name : foul
action_id : 9   action_name : tackle
action_id : 10   action_name : interception
action_id : 11   action_name : shot
action_id : 12   action_name : shot_penalty
action_id : 13   action_name : shot_freekick
action_id : 14   action_name : keeper_save
action_id : 15   action_name : keeper_claim
action_id : 16   action_name : keeper_punch
action_id : 17   action_name : keeper_pick_up
action_id : 18   action_name : clearance
action_id : 19   action_name : bad_touch
action_id : 20   action_name : non_action
action_id : 21   action_name : dribble
action_id : 22   action_name : goalkick


In [274]:
# Show every SPADL body part next to its position in `bodyparts`.
for bodypart_id, bodypart_label in enumerate(bodyparts):
    print(f'bodypart_id : {bodypart_id}   bodypart_name : {bodypart_label}')

bodypart_id : 0   bodypart_name : foot
bodypart_id : 1   bodypart_name : head
bodypart_id : 2   bodypart_name : other
bodypart_id : 3   bodypart_name : head/other
bodypart_id : 4   bodypart_name : foot_left
bodypart_id : 5   bodypart_name : foot_right


In [275]:
def convert_events_df_to_spadl(events_df, home_team_id):
    """Convert a Wyscout events dataframe into a SPADL actions dataframe.

    The events are converted with `convert_to_actions`, `time_seconds` is cast
    to float64, a `timestamp` column is derived from it (seconds interpreted
    as an offset from the Unix epoch), and the result is passed through
    `spadl.play_left_to_right` together with the home team id.

    Args:
        events_df: raw events of a single game, as accepted by `convert_to_actions`.
        home_team_id: id of the home team of that game.

    Returns:
        The dataframe returned by `spadl.play_left_to_right`.
    """
    actions = convert_to_actions(events_df, home_team_id)
    seconds = actions['time_seconds'].astype('float64')
    actions['time_seconds'] = seconds
    actions['timestamp'] = pd.to_datetime(seconds, unit='s')
    return spadl.play_left_to_right(actions, home_team_id)

In [276]:
# Pitch geometry constants, followed by helpers that add derived columns to a raw SPADL dataframe.
# NOTE(review): the values look like metres (105 x 68 pitch, 7.32 goal) - confirm the unit.
STANDARD_LENGTH_COURT = 105  # pitch length
STANDARD_WIDTH_COURT = 68  # pitch width
STANDARD_GOALLINE_WIDTH = 7.32  # goal width

# Helper Functions
def filter_out_is_home_team_apply_df(row, home_team_id):
    """Return 1 when the row's `team_id` equals `home_team_id`, otherwise 0."""
    if row['team_id'] == home_team_id:
        return 1
    return 0

def filter_out_take_on_or_dribble_apply_df(row, take_on_action_id):
    """Return 1 when the row is a take-on action, otherwise 0.

    The action *type* of a SPADL row is stored in `type_id` (the same column
    this notebook filters on when selecting dribbles); `action_id` is only the
    running number of the action, so comparing it with a type id flags the
    wrong rows.

    Args:
        row: mapping-like row (dict or pandas Series) of one SPADL action.
        take_on_action_id: SPADL action type id that denotes a take-on.

    Returns:
        1 if the row's action type equals `take_on_action_id`, else 0.
    """
    # Fall back to the historical `action_id` lookup only for rows that do not
    # carry a `type_id` at all, so existing callers with such rows keep working.
    type_column = 'type_id' if 'type_id' in row else 'action_id'
    return 1 if row[type_column] == take_on_action_id else 0

# Add is_home_team column (integer 0/1)
def add_is_home_team_column_to_spadl_df(spadl_df, home_team_id):
    """Add an `is_home_team` column (1 = action by the home team, 0 = otherwise).

    The column is computed with a vectorised comparison on `team_id`. Besides
    being faster than a row-wise `apply`, this also works for a frame with zero
    rows, where a row-wise `apply` does not yield a single column that can be
    assigned.

    Args:
        spadl_df: SPADL actions dataframe containing a `team_id` column.
        home_team_id: id of the home team.

    Returns:
        The same dataframe object that was passed in, with the new column set.
    """
    # The frame is modified in place (and returned) to keep the original contract.
    spadl_df['is_home_team'] = (spadl_df['team_id'] == home_team_id).astype('int64')
    return spadl_df

# Add is_take_on column (integer 0/1)
def add_is_take_on_column_to_spadl_df(spadl_df, take_on_action_id):
    """Add an `is_take_on` column (1 = take-on action, 0 = any other action).

    The action type of a SPADL row lives in `type_id`; `action_id` is only the
    running number of the action, so the flag is derived from `type_id`. The
    comparison is vectorised, which also makes it safe for a frame with zero
    rows.

    Args:
        spadl_df: SPADL actions dataframe containing a `type_id` column.
        take_on_action_id: SPADL action type id that denotes a take-on.

    Returns:
        The same dataframe object that was passed in, with the new column set.
    """
    # Frames without a `type_id` column fall back to the historical
    # `action_id` comparison so existing callers with such frames keep working.
    type_column = 'type_id' if 'type_id' in spadl_df.columns else 'action_id'
    # The frame is modified in place (and returned) to keep the original contract.
    spadl_df['is_take_on'] = (spadl_df[type_column] == take_on_action_id).astype('int64')
    return spadl_df

In [277]:
# Collect the actions of specific types from the whole dataset and export them to CSV files.
# In the action type listing printed earlier, 7 is take_on and 21 is dribble.
DRIBBLE_ACTION_ID = [7, 21]  # action type ids kept by the export; matched against the `type_id` column
TAKE_ON_ACTION_ID = 7  # action type id of a take-on

def collect_raw_pass_spadl_df(source="Wyscout", period=1):
    """Export the dribble/take-on actions of every Wyscout game to per-game CSV files.

    Despite its name, this function keeps the actions whose `type_id` is listed
    in `DRIBBLE_ACTION_ID`, restricted to one period, adds the `is_home_team`
    and `is_take_on` columns, and writes one CSV per game into
    data/training_data_xdribble_wyscout/.

    Args:
        source: currently unused; the data always comes from `api_wyscout`.
            Kept so existing calls keep working.
        period: `period_id` to keep. `None` is treated as period 1.

    Returns:
        None. The result is the set of CSV files written to disk.
    """
    api = api_wyscout
    # `None` has always meant "first period"; make that explicit in one place.
    selected_period = 1 if period is None else period
    # Make sure the export directory exists before the first write.
    os.makedirs('data/training_data_xdribble_wyscout', exist_ok=True)

    competitions_df = api.competitions()
    competition_seasons = [
        (row['competition_id'], row['season_id'])
        for _, row in competitions_df.iterrows()
    ]

    games = []
    for competition_id, season_id in competition_seasons:
        games_df = api.games(competition_id, season_id)
        games.extend(
            (row['game_id'], row['home_team_id'], row['away_team_id'])
            for _, row in games_df.iterrows()
        )

    for game_id, home_team_id, away_team_id in games:
        game_actions = convert_events_df_to_spadl(api.events(game_id), home_team_id)
        keep = (
            game_actions['type_id'].isin(DRIBBLE_ACTION_ID)
            & (game_actions['period_id'] == selected_period)
        )
        # Copy the selection: the helpers below add columns to the frame they
        # receive, which should not happen on a slice of the full action list.
        dribble_actions = game_actions.loc[keep].copy()

        # Add additional computed columns to support the xDribble model
        dribble_actions = add_is_home_team_column_to_spadl_df(dribble_actions, home_team_id)
        dribble_actions = add_is_take_on_column_to_spadl_df(dribble_actions, TAKE_ON_ACTION_ID)

        # Export each game to its own CSV file
        dribble_actions.to_csv(f'data/training_data_xdribble_wyscout/{game_id}_{home_team_id}_{away_team_id}_xdribble_data.csv')

In [278]:
# Functions that build the dataset of all players
def collect_raw_all_players_df(source="Wyscout"):
    """Export the players of every Wyscout game to per-game CSV files.

    For each game of each competition/season returned by `api_wyscout`, the
    frame returned by `api.players(game_id)` is written to
    data/training_data_players_wyscout/.

    Args:
        source: currently unused; the data always comes from `api_wyscout`.
            Kept so existing calls keep working.

    Returns:
        None. The result is the set of CSV files written to disk.
    """
    api = api_wyscout
    # Make sure the export directory exists before the first write.
    os.makedirs('data/training_data_players_wyscout', exist_ok=True)

    competitions_df = api.competitions()
    competition_seasons = [
        (row['competition_id'], row['season_id'])
        for _, row in competitions_df.iterrows()
    ]

    games = []
    for competition_id, season_id in competition_seasons:
        games_df = api.games(competition_id, season_id)
        games.extend(
            (row['game_id'], row['home_team_id'], row['away_team_id'])
            for _, row in games_df.iterrows()
        )

    for game_id, home_team_id, away_team_id in games:
        players_df = api.players(game_id)
        players_df.to_csv(f'data/training_data_players_wyscout/{game_id}_{home_team_id}_{away_team_id}_players_data.csv')

def load_and_concat_players_df_from_csv(path_to_raw_players_df):
    """Load every players CSV in a directory and return one row per `player_id`.

    Files are read in sorted filename order so that, when the same `player_id`
    occurs in several files, the row that is kept does not depend on the
    arbitrary order of `os.listdir`. Entries that are not regular `.csv` files
    (sub-directories, hidden OS files, ...) are skipped.

    Args:
        path_to_raw_players_df: directory containing the per-game players CSVs.

    Returns:
        DataFrame with the first row seen for each `player_id`. As before, the
        previous row labels are kept in an extra `index` column.

    Raises:
        ValueError: if the directory contains no CSV file.
    """
    csv_paths = [
        os.path.join(path_to_raw_players_df, filename)
        for filename in sorted(os.listdir(path_to_raw_players_df))
        if filename.lower().endswith('.csv')
    ]
    csv_paths = [path for path in csv_paths if os.path.isfile(path)]
    if not csv_paths:
        raise ValueError(f'No players CSV files found in {path_to_raw_players_df!r}')

    merged_players_df = pd.concat([pd.read_csv(path) for path in csv_paths])
    # reset_index() without drop=True is kept on purpose so the returned
    # columns stay exactly what existing callers receive.
    return merged_players_df.drop_duplicates(subset='player_id').reset_index()

def load_csv_players_data_sofifa(path_to_sofifa_file):
    """Read the SoFIFA players CSV at `path_to_sofifa_file` into a DataFrame."""
    sofifa_players = pd.read_csv(path_to_sofifa_file)
    return sofifa_players

In [279]:
# Leave this call commented out if the per-game players CSV files have already been exported
# collect_raw_all_players_df()

In [280]:
# Merge wyscout player datasets with sofifa datasets by matching string name
def create_maps_for_name_matching_scores(list_unique_names_df_1, list_unique_names_df_2):
    """Score every pair of names from the two collections with `fuzz.ratio`.

    Args:
        list_unique_names_df_1: names from the first dataset.
        list_unique_names_df_2: names from the second dataset.

    Returns:
        Dict mapping each `(name_1, name_2)` pair to its `fuzz.ratio` score.
        Every name of the first collection is paired with every name of the
        second one.
    """
    return {
        (left_name, right_name): fuzz.ratio(left_name, right_name)
        for left_name in list_unique_names_df_1
        for right_name in list_unique_names_df_2
    }

def filter_out_maps_for_name_matching_scores(maps_name_matching, threshold):
    """Keep only the name pairs whose score is at least `threshold`.

    Args:
        maps_name_matching: dict mapping `(name_1, name_2)` to a score.
        threshold: minimum score (inclusive) a pair needs to be kept.

    Returns:
        New dict with the surviving pairs, in their original order.
    """
    return {
        (left_name, right_name): score
        for (left_name, right_name), score in maps_name_matching.items()
        if score >= threshold
    }

def merge_big_dataframe_wyscout_with_sofifa(big_dataframe_players, sofifa_players_dataset, maps_name_matching_score):
    """Join Wyscout player rows with SoFIFA rows for every matched pair of names.

    For each `(wyscout_name, sofifa_name)` key of `maps_name_matching_score`,
    the first Wyscout row with that `player_name` is placed side by side with
    the first SoFIFA row with that `full_name`.

    Differences from the previous implementation, all of which were defects:
      * the input frames are no longer modified (they used to lose rows and
        keep prefixed column names, so the cell could not be re-run);
      * the result no longer starts with an all-NaN placeholder row;
      * the result no longer carries a stray `dex` column;
      * rows are gathered in one step instead of one `concat` per pair.

    Args:
        big_dataframe_players: Wyscout players, with a `player_name` column.
        sofifa_players_dataset: SoFIFA players, with a `full_name` column.
        maps_name_matching_score: mapping (or iterable) whose keys are
            `(wyscout_name, sofifa_name)` pairs; the scores are not used here.

    Returns:
        DataFrame with one row per resolvable pair, in pair order: all Wyscout
        columns followed by all SoFIFA columns under their original names. A
        column name present in both inputs therefore appears twice. Pairs
        whose name is missing from either frame are skipped.
    """
    # Rows without a name can never be matched; drop them on copies only.
    wyscout_first = (
        big_dataframe_players
        .dropna(subset=['player_name'])
        .drop_duplicates(subset='player_name')  # first row per name, like .iloc[0]
    )
    sofifa_first = (
        sofifa_players_dataset
        .dropna(subset=['full_name'])
        .drop_duplicates(subset='full_name')
    )

    matched_pairs = list(maps_name_matching_score)
    # Position of each requested name in its frame; -1 marks an unknown name.
    wyscout_positions = pd.Index(wyscout_first['player_name']).get_indexer(
        [wyscout_name for wyscout_name, _ in matched_pairs]
    )
    sofifa_positions = pd.Index(sofifa_first['full_name']).get_indexer(
        [sofifa_name for _, sofifa_name in matched_pairs]
    )
    resolvable = (wyscout_positions >= 0) & (sofifa_positions >= 0)

    wyscout_rows = wyscout_first.iloc[wyscout_positions[resolvable]].reset_index(drop=True)
    sofifa_rows = sofifa_first.iloc[sofifa_positions[resolvable]].reset_index(drop=True)
    # Both halves share the same 0..n-1 index, so this is a plain side-by-side join.
    return pd.concat([wyscout_rows, sofifa_rows], axis=1)

# Input/output locations of the player datasets. Only the first one is a
# directory; the other three are CSV file paths despite the DIRECTORY_ prefix.
DIRECTORY_PLAYERS_CSV_DATAS = "data/training_data_players_wyscout"  # per-game players CSVs (input directory)
DIRECTORY_SOFIFA_CSV_DATAS = "data/players_skill_dataset/sofifa_dataset_cleaned.csv"  # SoFIFA players (input file)
DIRECTORY_WYSCOUT_CSV_DATAS = "data/players_skill_dataset/wyscout_dataset_cleaned.csv"  # de-duplicated Wyscout players (output file)
DIRECTORY_FINAL_PLAYERS_CSV_DATAS = "data/players_skill_dataset/final_players_skill_dataset.csv"  # merged Wyscout + SoFIFA players (output file)

# COMMENT OUT THE SNIPPET BELOW IF THE FINAL PLAYER DATASET WITH SKILLS HAS ALREADY BEEN GENERATED !!
# Step 1: load both player sources; the de-duplicated Wyscout players are also saved to disk.
big_dataframe_players = load_and_concat_players_df_from_csv(DIRECTORY_PLAYERS_CSV_DATAS)
big_dataframe_players.to_csv(DIRECTORY_WYSCOUT_CSV_DATAS)
sofifa_players_dataset = load_csv_players_data_sofifa(DIRECTORY_SOFIFA_CSV_DATAS)

# Step 2: score every (Wyscout name, SoFIFA name) pair and keep the pairs scoring at least 80.
maps_name_matching_score = filter_out_maps_for_name_matching_scores(
    create_maps_for_name_matching_scores(
        big_dataframe_players['player_name'].unique(),
        sofifa_players_dataset['full_name'].unique(),
    ),
    threshold=80,
)

# Step 3: join the matched rows, keep one row per player_id and save the result.
big_dataframe_players_with_sofifa = (
    merge_big_dataframe_wyscout_with_sofifa(big_dataframe_players, sofifa_players_dataset, maps_name_matching_score)
    .reset_index()
    .drop_duplicates(subset='player_id')
)
big_dataframe_players_with_sofifa.to_csv(DIRECTORY_FINAL_PLAYERS_CSV_DATAS)
