In [16]:
from socceraction.data.wyscout import PublicWyscoutLoader
from socceraction.spadl.wyscout import convert_to_actions
from socceraction.data.opta import OptaLoader
from socceraction.data.statsbomb import StatsBombLoader
from socceraction.spadl.config import actiontypes, bodyparts
import socceraction.spadl as spadl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [17]:
api_wyscout = PublicWyscoutLoader(root="data/wyscout")
api_opta = OptaLoader(root="data/opta")
# api_statsbomb = StatsBombLoader(root="data/statsbomb")

In [18]:
for idx, action_name in enumerate(actiontypes):
    print(f'action_id : {idx}   action_name : {action_name}')

action_id : 0   action_name : pass
action_id : 1   action_name : cross
action_id : 2   action_name : throw_in
action_id : 3   action_name : freekick_crossed
action_id : 4   action_name : freekick_short
action_id : 5   action_name : corner_crossed
action_id : 6   action_name : corner_short
action_id : 7   action_name : take_on
action_id : 8   action_name : foul
action_id : 9   action_name : tackle
action_id : 10   action_name : interception
action_id : 11   action_name : shot
action_id : 12   action_name : shot_penalty
action_id : 13   action_name : shot_freekick
action_id : 14   action_name : keeper_save
action_id : 15   action_name : keeper_claim
action_id : 16   action_name : keeper_punch
action_id : 17   action_name : keeper_pick_up
action_id : 18   action_name : clearance
action_id : 19   action_name : bad_touch
action_id : 20   action_name : non_action
action_id : 21   action_name : dribble
action_id : 22   action_name : goalkick


In [19]:
for idx, bodypart_name in enumerate(bodyparts):
    print(f'bodypart_id : {idx}   bodypart_name : {bodypart_name}')

bodypart_id : 0   bodypart_name : foot
bodypart_id : 1   bodypart_name : head
bodypart_id : 2   bodypart_name : other
bodypart_id : 3   bodypart_name : head/other
bodypart_id : 4   bodypart_name : foot_left
bodypart_id : 5   bodypart_name : foot_right


In [20]:
def convert_events_df_to_spadl(events_df, home_team_id):
    spadl_events_df = convert_to_actions(events_df, home_team_id)
    spadl_events_df['time_seconds'] = spadl_events_df['time_seconds'].astype('float64')
    spadl_events_df['timestamp'] = pd.to_datetime(spadl_events_df['time_seconds'], unit='s')
    spadl_events_df = spadl.play_left_to_right(spadl_events_df, home_team_id)
    return spadl_events_df

In [21]:
# Collect all dataset action specific type, export them to csv files
# Pass (action_id = 0), Cross (action_id = 1)
PASS_ACTION_ID = [0,1] 

def collect_raw_pass_spadl_df(source="Wyscout", period=1):
    api = api_wyscout
    list_competitions_ids = []
    list_game_ids = []

    competitions_df = api.competitions()
    for _, row in competitions_df.iterrows():
        list_competitions_ids.append((row['competition_id'], row['season_id']))
        
    for competition_id, season_id in list_competitions_ids:
        games_df = api.games(competition_id, season_id)
        for _, row in games_df.iterrows():
            list_game_ids.append((row['game_id'], row['home_team_id'], row['away_team_id']))
            
    for game_id, home_team_id, away_team_id in list_game_ids:
        this_game_events_df = api.events(game_id)
        this_game_events_spadl_df = convert_events_df_to_spadl(this_game_events_df, home_team_id)
        this_game_events_spadl_df = this_game_events_spadl_df[this_game_events_spadl_df['type_id'].isin(PASS_ACTION_ID)]
        if (period != None):
            this_game_events_spadl_df = this_game_events_spadl_df[this_game_events_spadl_df['period_id'] == period]
        else:
            this_game_events_spadl_df = this_game_events_spadl_df[this_game_events_spadl_df['period_id'] == 1]
        
        # Export to external csv iteratively
        this_game_events_spadl_df.to_csv(f'data/training_data_xpass_wyscout/{game_id}_{home_team_id}_{away_team_id}_xpass_data.csv')            

In [22]:
# MAIN DRIVER (comment it if csv files already loaded)
# collect_raw_pass_spadl_df()

In [23]:
# Load csv datas already retrieved then concat them into one big dataframe
import os

DIRECTORY_XPASS_CSV_DATAS = "data/training_data_xpass_wyscout"

def load_and_concat_xpass_df_from_csv():
    list_pass_event_df = []
    for filename in os.listdir(DIRECTORY_XPASS_CSV_DATAS):
        f = os.path.join(DIRECTORY_XPASS_CSV_DATAS, filename)
        if os.path.isfile(f):
            pass_event_df = pd.read_csv(f)
            list_pass_event_df.append(pass_event_df)
    return pd.concat(list_pass_event_df)

In [24]:
big_dataframe_xpass_model = load_and_concat_xpass_df_from_csv()

In [25]:
# SELECT ONLY FEATURED COLUMN FROM BIG DATASETS
features_column_included = ["start_x", "start_y", "end_x", "end_y", "bodypart_id", "result_id"]
big_dataframe_xpass_model = big_dataframe_xpass_model[[c for c in big_dataframe_xpass_model.columns if c in features_column_included]]
big_dataframe_xpass_model.head()

Unnamed: 0,start_x,start_y,end_x,end_y,bodypart_id,result_id
0,52.5,35.36,49.35,34.0,0,1
1,49.35,34.0,43.05,35.36,0,1
2,43.05,35.36,33.6,44.2,0,1
3,33.6,44.2,93.45,63.92,0,0
4,7.35,10.88,9.45,19.72,1,0


In [26]:
# FEATURE PREPROCESSING BIG DATASETS AND CREATE XGBOOST MODEL
# 1. Change start_x, start_y, end_x, end_y with StandardScaler
scaler = preprocessing.StandardScaler()
columns = ["start_x", "start_y", "end_x", "end_y"]
big_dataframe_xpass_model[columns] = scaler.fit_transform(big_dataframe_xpass_model[columns])

# 2. Check if data is unbalanced. If it is unbalanced, then do method to oversize the sample
print(big_dataframe_xpass_model['result_id'].value_counts())

# 3. Change result_id label into float64 type
big_dataframe_xpass_model['result_id'] = big_dataframe_xpass_model['result_id'].astype('float64')

# 4. Remove dataframe instead of having result_id (0,1) --> (fail, success)
big_dataframe_xpass_model = big_dataframe_xpass_model[big_dataframe_xpass_model['result_id'].isin([0,1])]
print(big_dataframe_xpass_model['result_id'].value_counts())

# 5. Split train data and test data from Big Datasets
all_feature_columns = ["start_x", "start_y", "end_x", "end_y", "bodypart_id"]
X_train = big_dataframe_xpass_model[all_feature_columns]
Y_train = big_dataframe_xpass_model["result_id"]
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, Y_train, test_size=0.4)

# 6. Train XGBoost Model 
modelXGB = XGBClassifier()
modelXGB.fit(X_train_split, y_train_split)

# 7. Predict Testing Data
y_predict = modelXGB.predict(X_test_split)

# 8. Display classification report
print(classification_report(y_test_split, y_predict))

1    719820
0    154235
2      3379
Name: result_id, dtype: int64
1.0    719820
0.0    154235
Name: result_id, dtype: int64
              precision    recall  f1-score   support

         0.0       0.74      0.38      0.50     61738
         1.0       0.88      0.97      0.92    287884

    accuracy                           0.87    349622
   macro avg       0.81      0.68      0.71    349622
weighted avg       0.86      0.87      0.85    349622

