In [15]:
from socceraction.data.wyscout import PublicWyscoutLoader
from socceraction.spadl.wyscout import convert_to_actions
from socceraction.data.opta import OptaLoader
from socceraction.data.statsbomb import StatsBombLoader
from socceraction.spadl.config import actiontypes, bodyparts
import socceraction.spadl as spadl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, brier_score_loss, log_loss, mean_absolute_error, r2_score, mean_absolute_percentage_error
from xgboost import XGBClassifier, XGBRegressor
import math
import pickle
import os
from name_matching.name_matcher import NameMatcher
from rapidfuzz import fuzz
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import r_regression, SelectKBest, chi2, mutual_info_classif, SequentialFeatureSelector, RFECV, SelectFromModel
from scipy.stats import pearsonr, chisquare
from mrmr import mrmr_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import Lasso, LogisticRegression

In [16]:
# CONFIG FOR EXPERIMENTS SCENARIO
INCLUDE_SKILL_PLAYERS_OPTIONS = [
    False,
    True
]
SAMPLING_OPTIONS = [
    "none",
    "random_oversampled",
    "random_undersampled",
    "smote_oversampled"
]
FEATURE_SELECTION_OPTIONS = [
    "chisquare",
    "mutualinf",
    "mrmr",
    "rfembedded",
    "lasso"
]
MODEL_ALGORITHM_OPTIONS = [
    "xgbregressor",
    "rfregressor",
    "logregression"
]
CONFIG_EXPERIMENTS_SCENARIO_MAP = {}

def construct_config_experiments_scenario_map():
    index_counter = 1
    for include_skill_opt in INCLUDE_SKILL_PLAYERS_OPTIONS:
        for sampling_opt in SAMPLING_OPTIONS:
            if (include_skill_opt == False):
                for algorithm_opt in MODEL_ALGORITHM_OPTIONS:
                    CONFIG_EXPERIMENTS_SCENARIO_MAP[index_counter] = \
                        {"include_skill_opt" : 1 if include_skill_opt else 0, \
                        "sampling_opt" : sampling_opt, \
                        "feature_selection_opt" : "none", \
                        "algorithm_opt" : algorithm_opt}
                    index_counter += 1
            else:
                for feature_selection_opt in FEATURE_SELECTION_OPTIONS:
                    for algorithm_opt in MODEL_ALGORITHM_OPTIONS:
                        CONFIG_EXPERIMENTS_SCENARIO_MAP[index_counter] = \
                            {"include_skill_opt" : 1 if include_skill_opt else 0, \
                            "sampling_opt" : sampling_opt, \
                            "feature_selection_opt" : feature_selection_opt, \
                            "algorithm_opt" : algorithm_opt}
                        index_counter += 1

construct_config_experiments_scenario_map()

In [17]:
# COLUMNS FOR TEST EXPERIMENT RESULT
COLUMNS_EVALUATION_METRIC = [
    "mean_squared_error_score",
    "root_mean_squared_error_score",
    "auc_score",
    "brier_score",
    "log_loss_score",
    "mean_absolute_error_score",
    "r_squared_score",
    "mean_absolute_percentage_error_score"
]
COLUMNS_SCENARIO_NAME = [
    "include_skill_opt",
    "sampling_opt",
    "feature_selection_opt",
    "algorithm_opt"
]
COLUMNS_EXPERIMENT_RESULT = ["case_number"] + COLUMNS_SCENARIO_NAME + COLUMNS_EVALUATION_METRIC

In [18]:
api_wyscout = PublicWyscoutLoader(root="data/wyscout")
api_opta = OptaLoader(root="data/opta")
# api_statsbomb = StatsBombLoader(root="data/statsbomb")

In [19]:
for idx, action_name in enumerate(actiontypes):
    print(f'action_id : {idx}   action_name : {action_name}')

action_id : 0   action_name : pass
action_id : 1   action_name : cross
action_id : 2   action_name : throw_in
action_id : 3   action_name : freekick_crossed
action_id : 4   action_name : freekick_short
action_id : 5   action_name : corner_crossed
action_id : 6   action_name : corner_short
action_id : 7   action_name : take_on
action_id : 8   action_name : foul
action_id : 9   action_name : tackle
action_id : 10   action_name : interception
action_id : 11   action_name : shot
action_id : 12   action_name : shot_penalty
action_id : 13   action_name : shot_freekick
action_id : 14   action_name : keeper_save
action_id : 15   action_name : keeper_claim
action_id : 16   action_name : keeper_punch
action_id : 17   action_name : keeper_pick_up
action_id : 18   action_name : clearance
action_id : 19   action_name : bad_touch
action_id : 20   action_name : non_action
action_id : 21   action_name : dribble
action_id : 22   action_name : goalkick


In [20]:
for idx, bodypart_name in enumerate(bodyparts):
    print(f'bodypart_id : {idx}   bodypart_name : {bodypart_name}')

bodypart_id : 0   bodypart_name : foot
bodypart_id : 1   bodypart_name : head
bodypart_id : 2   bodypart_name : other
bodypart_id : 3   bodypart_name : head/other
bodypart_id : 4   bodypart_name : foot_left
bodypart_id : 5   bodypart_name : foot_right


In [21]:
def convert_events_df_to_spadl(events_df, home_team_id):
    spadl_events_df = convert_to_actions(events_df, home_team_id)
    spadl_events_df['time_seconds'] = spadl_events_df['time_seconds'].astype('float64')
    spadl_events_df['timestamp'] = pd.to_datetime(spadl_events_df['time_seconds'], unit='s')
    spadl_events_df = spadl.play_left_to_right(spadl_events_df, home_team_id)
    return spadl_events_df

In [22]:
# FUNCTION TO ADD ADDITIONAL INFO IN RAW SPADL DATAFRAME
STANDARD_LENGTH_COURT = 105
STANDARD_WIDTH_COURT = 68
STANDARD_GOALLINE_WIDTH = 7.32

# Helper Functions
def calculate_distance_to_goal(length_court, width_court, coordinate_x, coordinate_y, is_home_team):
    if is_home_team:
        distance_to_goal = math.sqrt((abs(length_court - coordinate_x)) ** 2 + (abs((0.5 * width_court) - coordinate_y)) ** 2)
    else:
        distance_to_goal = math.sqrt((coordinate_x) ** 2 + (abs((0.5 * width_court) - coordinate_y)) ** 2)
    return distance_to_goal

def calculate_distance_to_goal_apply_df(row):
    return calculate_distance_to_goal(STANDARD_LENGTH_COURT, STANDARD_WIDTH_COURT, row['start_x'], row['start_y'], row['is_home_team'])

# def calculate_angle_to_goal(goalline_width, length_court, width_court, coordinate_x, coordinate_y, is_home_team):
#     if is_home_team:
#         L = abs(length_court - coordinate_x)
#     else:
#         L = coordinate_x
#     W = abs((0.5 * width_court) - coordinate_y)
#     return math.atan((goalline_width * L) / (L ** 2 + W ** 2 - (goalline_width / 2) ** 2))

def calculate_angle_to_goal_v2(goalline_width, length_court, width_court, coordinate_x, coordinate_y, is_home_team):
    if is_home_team:
        coordinate_x_post_1, coordinate_x_post_2 = (length_court, length_court)
    else:
        coordinate_x_post_1, coordinate_x_post_2 = (0, 0)
    coordinate_y_post_1 = (width_court / 2) + (goalline_width / 2)
    coordinate_y_post_2 = (width_court / 2) - (goalline_width / 2)

    distance_to_post_1 = math.sqrt(abs(coordinate_x - coordinate_x_post_1) ** 2 + abs(coordinate_y - coordinate_y_post_1) ** 2)
    distance_to_post_2 = math.sqrt(abs(coordinate_x - coordinate_x_post_2) ** 2 + abs(coordinate_y - coordinate_y_post_2) ** 2)

    return math.acos((distance_to_post_1 ** 2 + distance_to_post_2 ** 2 - goalline_width ** 2) / (2 * distance_to_post_1 * distance_to_post_2))

def calculate_angle_to_goal_apply_df(row):
    return calculate_angle_to_goal_v2(STANDARD_GOALLINE_WIDTH, STANDARD_LENGTH_COURT, STANDARD_WIDTH_COURT, row['start_x'], row['start_y'], row['is_home_team'])

def filter_out_is_home_team_apply_df(row, home_team_id):
    return 1 if row['team_id'] == home_team_id else 0

# Add distance to goal column
def add_distance_to_goal_column_to_spadl_df(spadl_df):
    spadl_df['distance_to_goal'] = spadl_df.apply(calculate_distance_to_goal_apply_df, axis=1)
    return spadl_df

# Add angle to goal column 
def add_angle_to_goal_column_to_spadl_df(spadl_df):
    spadl_df['angle_to_goal'] = spadl_df.apply(calculate_angle_to_goal_apply_df, axis=1)
    return spadl_df

# Add is_home_team column (boolean 0/1)
def add_is_home_team_column_to_spadl_df(spadl_df, home_team_id):
    spadl_df['is_home_team'] = spadl_df.apply(lambda x : filter_out_is_home_team_apply_df(x, home_team_id), axis=1)
    return spadl_df

In [23]:
# Collect all dataset action specific type, export them to csv files
# Shot (action_id = 11), shot_penalty (action_id = 12), shot_freekick (action_id = 13)
SHOT_ACTION_ID = [11, 12, 13] 

def collect_raw_goal_spadl_df(source="Wyscout", period=1):
    api = api_wyscout
    list_competitions_ids = []
    list_game_ids = []

    competitions_df = api.competitions()
    for _, row in competitions_df.iterrows():
        list_competitions_ids.append((row['competition_id'], row['season_id']))
        
    for competition_id, season_id in list_competitions_ids:
        games_df = api.games(competition_id, season_id)
        for _, row in games_df.iterrows():
            list_game_ids.append((row['game_id'], row['home_team_id'], row['away_team_id']))
            
    for game_id, home_team_id, away_team_id in list_game_ids:
        this_game_events_df = api.events(game_id)
        this_game_events_spadl_df = convert_events_df_to_spadl(this_game_events_df, home_team_id)
        this_game_events_spadl_df = this_game_events_spadl_df[this_game_events_spadl_df['type_id'].isin(SHOT_ACTION_ID)]
        if (period != None):
            this_game_events_spadl_df = this_game_events_spadl_df[this_game_events_spadl_df['period_id'] == period]
        else:
            this_game_events_spadl_df = this_game_events_spadl_df[this_game_events_spadl_df['period_id'] == 1]
        # Add additional computed column to support xG model
        this_game_events_spadl_df = add_is_home_team_column_to_spadl_df(this_game_events_spadl_df, home_team_id)
        this_game_events_spadl_df = add_distance_to_goal_column_to_spadl_df(this_game_events_spadl_df)
        this_game_events_spadl_df = add_angle_to_goal_column_to_spadl_df(this_game_events_spadl_df)

        # Export to external csv iteratively
        this_game_events_spadl_df.to_csv(f'data/training_data_xgoal_wyscout/{game_id}_{home_team_id}_{away_team_id}_xgoal_data.csv')            

In [24]:
# MAIN DRIVER (comment it if csv files already loaded)
# collect_raw_goal_spadl_df()

In [25]:
# Load csv datas already retrieved then concat them into one big dataframe
import os

DIRECTORY_XGOAL_CSV_DATAS = "data/training_data_xgoal_wyscout"

def load_and_concat_xgoal_df_from_csv():
    list_pass_event_df = []
    for filename in os.listdir(DIRECTORY_XGOAL_CSV_DATAS):
        f = os.path.join(DIRECTORY_XGOAL_CSV_DATAS, filename)
        if os.path.isfile(f):
            pass_event_df = pd.read_csv(f)
            list_pass_event_df.append(pass_event_df)
    return pd.concat(list_pass_event_df)

In [26]:
# JOIN ALREADY CONSTRUCTED PLAYER SKILLS DATASET WITH ORIGIN EVENT DATASET WYSCOUT
DIRECTORY_FINAL_PLAYERS_CSV_DATAS = "data/players_skill_dataset/final_players_skill_dataset.csv"

player_skills_dataset = pd.read_csv(DIRECTORY_FINAL_PLAYERS_CSV_DATAS)
big_dataframe_xgoal_model = load_and_concat_xgoal_df_from_csv()
big_dataframe_xgoal_model = big_dataframe_xgoal_model.merge(player_skills_dataset, how='inner',on='player_id')
big_dataframe_xgoal_model.head()

Unnamed: 0,Unnamed: 0_x,game_id_x,period_id,time_seconds,team_id_x,player_id,start_x,start_y,end_x,end_y,...,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB
0,7,1694390,1,31.226217,4418,25437,95.55,48.28,105.0,34.0,...,83+3,83+3,83+3,83+3,83+3,82+3,81+3,81+3,81+3,82+3
1,567,1694437,1,2283.022773,4418,25437,87.15,30.6,105.0,27.2,...,83+3,83+3,83+3,83+3,83+3,82+3,81+3,81+3,81+3,82+3
2,297,2058014,1,1066.257878,4418,25437,76.65,40.12,105.0,34.0,...,83+3,83+3,83+3,83+3,83+3,82+3,81+3,81+3,81+3,82+3
3,602,2058014,1,2471.924828,4418,25437,94.5,48.28,105.0,40.8,...,83+3,83+3,83+3,83+3,83+3,82+3,81+3,81+3,81+3,82+3
4,522,2575985,1,2189.374798,3159,25437,84.0,46.24,105.0,27.2,...,83+3,83+3,83+3,83+3,83+3,82+3,81+3,81+3,81+3,82+3


In [27]:
# SELECT ONLY FEATURED COLUMN FROM BIG DATASETS
features_column_included = ["distance_to_goal", "angle_to_goal", "is_home_team", "result_id"]
player_skills_column_included = ["acceleration", "aggression", "agility", "balance", "ball_control",
                                 "composure", "crossing", "curve", "dribbling", "finishing",
                                 "freekick_accuracy", "heading_accuracy", "interceptions", "jumping", "long_passing",
                                 "long_shots", "marking", "penalties", "positioning", "reactions",
                                 "shot_power", "sliding_tackle", "sprint_speed", "stamina", "short_passing",
                                 "standing_tackle", "strength", "vision", "volleys"]
player_attribute_column_included = ["height_cm", "weight_kgs", "age"]

big_dataframe_xgoal_model = big_dataframe_xgoal_model[[c for c in big_dataframe_xgoal_model.columns if c in (features_column_included + player_skills_column_included + player_attribute_column_included)]]
big_dataframe_xgoal_model.head()

Unnamed: 0,result_id,is_home_team,distance_to_goal,angle_to_goal,age,height_cm,weight_kgs,crossing,finishing,heading_accuracy,...,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle
0,0,1,17.123694,0.242346,31.0,154.94,74.8,75.0,64.0,68.0,...,70.0,86.0,86.0,72.0,76.0,56.0,82.0,85.0,84.0,86.0
1,0,1,18.170925,0.391202,31.0,154.94,74.8,75.0,64.0,68.0,...,70.0,86.0,86.0,72.0,76.0,56.0,82.0,85.0,84.0,86.0
2,0,1,29.00305,0.245634,31.0,154.94,74.8,75.0,64.0,68.0,...,70.0,86.0,86.0,72.0,76.0,56.0,82.0,85.0,84.0,86.0
3,0,1,17.724796,0.250188,31.0,154.94,74.8,75.0,64.0,68.0,...,70.0,86.0,86.0,72.0,76.0,56.0,82.0,85.0,84.0,86.0
4,0,1,24.30674,0.260183,31.0,154.94,74.8,75.0,64.0,68.0,...,70.0,86.0,86.0,72.0,76.0,56.0,82.0,85.0,84.0,86.0


In [28]:
# CASE 1 : Random Oversample Function
def training_data_random_oversampled(X_train, Y_train):
    ros = RandomOverSampler(random_state=0)
    X_resampled, Y_resampled = ros.fit_resample(X_train, Y_train)
    return (X_resampled, Y_resampled)

# CASE 2 : Random Undersample Function
def training_data_random_undersampled(X_train, Y_train):
    rus = RandomUnderSampler(random_state=0)
    X_resampled, Y_resampled = rus.fit_resample(X_train, Y_train)
    return (X_resampled, Y_resampled)

# CASE 3 : Random SMOTE Oversample Function
def training_data_smote_oversampled(X_train, Y_train):
    X_resampled, Y_resampled = SMOTE().fit_resample(X_train, Y_train)
    return (X_resampled, Y_resampled)

# X CASE 1 : Feature Selection - Pearson Coefficient
def filter_columns_feature_selection_pearson(X_train, Y_train, columns_considered, threshold):
    new_columns_after_selection = []
    for _, skill in enumerate(columns_considered):
        correlation_value, _ = pearsonr(X_train[skill], Y_train)
        if correlation_value >= threshold:
            new_columns_after_selection.append(skill)
    return new_columns_after_selection

def training_data_feature_selection_pearson(X_train, Y_train, columns_considered, threshold):
    columns_filtered = filter_columns_feature_selection_pearson(X_train, Y_train, columns_considered, threshold)
    print(columns_filtered)
    return (X_train[columns_filtered], Y_train)

# V CASE 2 : Feature Selection - Chi Square
def filter_columns_feature_selection_chisquare(X_train, Y_train, columns_considered, num_of_features):
    chi2_selector = SelectKBest(chi2, k=num_of_features) 
    df_feature = X_train[columns_considered]
    chi2_selector.fit(df_feature, Y_train)
    cols = chi2_selector.get_support(indices=True)
    df_selected_features = df_feature.iloc[:,cols]
    return df_selected_features.columns

def training_data_feature_selection_chisquare(X_train, Y_train, columns_considered, num_of_features):
    columns_filtered = filter_columns_feature_selection_chisquare(X_train, Y_train, columns_considered, num_of_features)
    print(columns_filtered)
    return (X_train[columns_filtered], Y_train)

# V CASE 3 : Feature Selection - Mutual Information
def filter_columns_feature_selection_mutualinf(X_train, Y_train, columns_considered, num_of_features):
    mi_selector = SelectKBest(mutual_info_classif, k=num_of_features) 
    df_feature = X_train[columns_considered]
    mi_selector.fit(df_feature, Y_train)
    cols = mi_selector.get_support(indices=True)
    df_selected_features = df_feature.iloc[:,cols]
    return df_selected_features.columns

def training_data_feature_selection_mutualinf(X_train, Y_train, columns_considered, num_of_features):
    columns_filtered = filter_columns_feature_selection_mutualinf(X_train, Y_train, columns_considered, num_of_features)
    print(columns_filtered)
    return (X_train[columns_filtered], Y_train)

# V CASE 4 : Feature Selection - mRMR Selection
def filter_columns_feature_selection_mrmr(X_train, Y_train, columns_considered, num_of_features):
    df_feature = X_train[columns_considered]
    selected_features = mrmr_classif(X=df_feature, y=Y_train, K=num_of_features)
    return selected_features

def training_data_feature_selection_mrmr(X_train, Y_train, columns_considered, num_of_features):
    columns_filtered = filter_columns_feature_selection_mrmr(X_train, Y_train, columns_considered, num_of_features)
    print(columns_filtered)
    return (X_train[columns_filtered], Y_train)

# X CASE 5 : Feature Selection - Sequential Forward Selection (SFS)
def filter_columns_feature_selection_sfs(X_train, Y_train, columns_considered, num_of_features):
    rf = RandomForestClassifier()
    sfs = SequentialFeatureSelector(rf, n_features_to_select=num_of_features, direction='forward')
    df_feature = X_train[columns_considered]
    sfs.fit(df_feature, Y_train)
    cols = sfs.get_support(indices=True)
    df_selected_features = df_feature.iloc[:,cols]
    return df_selected_features.columns

def training_data_feature_selection_sfs(X_train, Y_train, columns_considered, num_of_features):
    columns_filtered = filter_columns_feature_selection_sfs(X_train, Y_train, columns_considered, num_of_features)
    print(columns_filtered)
    return (X_train[columns_filtered], Y_train)

# X CASE 6 : Feature Selection - Sequential Backward Elimination (SBE)
def filter_columns_feature_selection_sbe(X_train, Y_train, columns_considered, num_of_features):
    rf = RandomForestClassifier()
    sfs = SequentialFeatureSelector(rf, n_features_to_select=num_of_features, direction='backward')
    df_feature = X_train[columns_considered]
    sfs.fit(df_feature, Y_train)
    cols = sfs.get_support(indices=True)
    df_selected_features = df_feature.iloc[:,cols]
    return df_selected_features.columns

def training_data_feature_selection_sbe(X_train, Y_train, columns_considered, num_of_features):
    columns_filtered = filter_columns_feature_selection_sbe(X_train, Y_train, columns_considered, num_of_features)
    print(columns_filtered)
    return (X_train[columns_filtered], Y_train)

# X CASE 7 : Feature Selection - Recursive Feature Elimination
def filter_columns_feature_selection_rfe(X_train, Y_train, columns_considered, num_of_features):
    estimator = LinearSVR()
    selector = RFECV(estimator, step=1, cv=num_of_features)
    df_feature = X_train[columns_considered]
    selector.fit(df_feature, Y_train)
    cols = selector.get_support(indices=True)
    df_selected_features = df_feature.iloc[:,cols]
    return df_selected_features.columns

def training_data_feature_selection_rfe(X_train, Y_train, columns_considered, num_of_features):
    columns_filtered = filter_columns_feature_selection_rfe(X_train, Y_train, columns_considered, num_of_features)
    print(columns_filtered)
    return (X_train[columns_filtered], Y_train)

# V CASE 8 : Feature Selection - Random Forest Embedded (rfembedded)
def filter_columns_feature_selection_rfembedded(X_train, Y_train, columns_considered, num_of_features):
    estimator = RandomForestClassifier()
    selector = SelectFromModel(estimator=estimator, max_features=num_of_features)
    df_feature = X_train[columns_considered]
    selector.fit(df_feature, Y_train)
    cols = selector.get_support(indices=True)
    df_selected_features = df_feature.iloc[:,cols]
    return df_selected_features.columns

def training_data_feature_selection_rfembedded(X_train, Y_train, columns_considered, num_of_features):
    columns_filtered = filter_columns_feature_selection_rfembedded(X_train, Y_train, columns_considered, num_of_features)
    print(columns_filtered)
    return (X_train[columns_filtered], Y_train)

# V CASE 9 : Feature Selection - LASSO
def filter_columns_feature_selection_lasso(X_train, Y_train, columns_considered, num_of_features):
    estimator = LogisticRegression(penalty='l2', C=0.5, solver='newton-cholesky')
    selector = SelectFromModel(estimator=estimator, max_features=num_of_features)
    df_feature = X_train[columns_considered]
    selector.fit(df_feature, Y_train)
    cols = selector.get_support(indices=True)
    df_selected_features = df_feature.iloc[:,cols]
    return df_selected_features.columns

def training_data_feature_selection_lasso(X_train, Y_train, columns_considered, num_of_features):
    columns_filtered = filter_columns_feature_selection_lasso(X_train, Y_train, columns_considered, num_of_features)
    print(columns_filtered)
    return (X_train[columns_filtered], Y_train)

# CASE 1 : Train with model XGBRegressor
def fit_and_train_with_model_xgbregressor(X_train, Y_train):
    model = XGBRegressor(objective="reg:logistic")
    model.fit(X_train, Y_train)
    return model

# CASE 2 : Train with model RandomForestRegressor
def fit_and_train_with_model_rfregressor(X_train, Y_train):
    model = RandomForestRegressor()
    model.fit(X_train, Y_train)
    return model

# CASE 3 : Train with model LogisticRegression
def fit_and_train_with_model_logregression(X_train, Y_train):
    model = LogisticRegression()
    model.fit(X_train, Y_train)
    return model

In [29]:
# FEATURE PREPROCESSING BIG DATASETS AND CREATE XGBOOST MODEL
# 1. Change all numeric columns with MinMaxScaler
scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
columns_minmax_scaler = player_skills_column_included + player_attribute_column_included + ["distance_to_goal", "angle_to_goal"]
big_dataframe_xgoal_model[columns_minmax_scaler] = scaler.fit_transform(big_dataframe_xgoal_model[columns_minmax_scaler])

# 2. Check if data is unbalanced. If it is unbalanced, then do method to oversize the sample
print(big_dataframe_xgoal_model['result_id'].value_counts())

# 3. Change result_id label into float64 type
big_dataframe_xgoal_model['result_id'] = big_dataframe_xgoal_model['result_id'].astype('float64')

# 4. Remove dataframe instead of having result_id (0,1) --> (fail, success)
big_dataframe_xgoal_model = big_dataframe_xgoal_model[big_dataframe_xgoal_model['result_id'].isin([0,1])]
print(big_dataframe_xgoal_model['result_id'].value_counts())

# 5. Split train data and test data from Big Datasets
all_feature_columns = columns_minmax_scaler + ["is_home_team"]
X_train = big_dataframe_xgoal_model[all_feature_columns]
Y_train = big_dataframe_xgoal_model["result_id"]

# Empty dataframe for saving test result
empty_test_result = pd.DataFrame(columns=COLUMNS_EXPERIMENT_RESULT, index=[0])

for case_number in sorted(list(CONFIG_EXPERIMENTS_SCENARIO_MAP.keys())):
    include_skill_opt = CONFIG_EXPERIMENTS_SCENARIO_MAP[case_number]["include_skill_opt"]
    sampling_opt = CONFIG_EXPERIMENTS_SCENARIO_MAP[case_number]["sampling_opt"]
    feature_selection_opt = CONFIG_EXPERIMENTS_SCENARIO_MAP[case_number]["feature_selection_opt"]
    algorithm_opt = CONFIG_EXPERIMENTS_SCENARIO_MAP[case_number]["algorithm_opt"]

    # 6. Filter out all player skills and attributes column if not include skill option
    if include_skill_opt == 0:
        only_featured_column = [column for column in features_column_included if column != 'result_id']
        X_train_filtered = X_train[only_featured_column]
    else:
        X_train_filtered = X_train

    # 7. Do oversampling/undersampling and feature selection at same time
    if sampling_opt == "none":
        X_resampled, Y_resampled = X_train_filtered, Y_train
    else:
        X_resampled, Y_resampled = globals()["training_data_" + sampling_opt](X_train_filtered, Y_train)
    if feature_selection_opt == "none":
        X_feature_sel, Y_feature_sel = X_resampled, Y_resampled
    else:
        X_feature_sel, Y_feature_sel = globals()["training_data_feature_selection_" + feature_selection_opt](X_resampled, Y_resampled, player_skills_column_included, 10)        

    # 8. Do train_test_split on training data
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_feature_sel, Y_feature_sel, test_size=0.2, random_state=42)

    # 9. Train Model
    model = globals()["fit_and_train_with_model_" + algorithm_opt](X_train_split, y_train_split)

    # 10. Predict Testing Data
    y_predict = model.predict(X_test_split)

    # 11. Save test result experiment
    mean_squared_error_score = mean_squared_error(y_test_split, y_predict)
    root_mean_squared_error_score = mean_squared_error(y_test_split, y_predict, squared=False)
    auc_score = roc_auc_score(y_test_split, y_predict)
    brier_score = brier_score_loss(y_test_split, y_predict)
    log_loss_score = log_loss(y_test_split, y_predict)
    mean_absolute_error_score = mean_absolute_error(y_test_split, y_predict)
    r_squared_score = r2_score(y_test_split, y_predict)
    mean_absolute_percentage_error_score = mean_absolute_percentage_error(y_test_split, y_predict)

    maps_new_row = {}
    for column in COLUMNS_EXPERIMENT_RESULT:
        if column not in COLUMNS_EVALUATION_METRIC:
            if column == "case_number":
                maps_new_row["case_number"] = case_number
            elif column in COLUMNS_SCENARIO_NAME:
                maps_new_row[column] = globals()[column]
        else:
            maps_new_row[column] = globals()[column]     
    new_row = pd.DataFrame(maps_new_row, index=[0])
    empty_test_result = pd.concat([new_row, empty_test_result.loc[:]]).reset_index(drop=True)

    # 12. Save model to external file
    filename = f'xgoal_model_case_{case_number}.sav'
    directory_model = "data/model_xgoal_wyscout/"
    pickle.dump(model, open(directory_model + filename, 'wb'))

# 13. Save test result experiment to external file
filename = 'xgoal_test_model_experiment_result.csv'
directory_model = "data/model_xgoal_wyscout/"
empty_test_result.to_csv(directory_model + filename)


0    17025
1     1957
Name: result_id, dtype: int64
0.0    17025
1.0     1957
Name: result_id, dtype: int64
Index(['finishing', 'heading_accuracy', 'interceptions', 'marking',
       'penalties', 'positioning', 'reactions', 'sliding_tackle',
       'standing_tackle', 'volleys'],
      dtype='object')
Index(['finishing', 'heading_accuracy', 'interceptions', 'marking',
       'penalties', 'positioning', 'reactions', 'sliding_tackle',
       'standing_tackle', 'volleys'],
      dtype='object')
Index(['finishing', 'heading_accuracy', 'interceptions', 'marking',
       'penalties', 'positioning', 'reactions', 'sliding_tackle',
       'standing_tackle', 'volleys'],
      dtype='object')
Index(['acceleration', 'ball_control', 'composure', 'finishing',
       'heading_accuracy', 'penalties', 'reactions', 'sliding_tackle',
       'standing_tackle', 'vision'],
      dtype='object')
Index(['finishing', 'freekick_accuracy', 'heading_accuracy', 'long_shots',
       'marking', 'penalties', 'position

100%|██████████| 10/10 [00:00<00:00, 33.45it/s]


['finishing', 'jumping', 'standing_tackle', 'reactions', 'sliding_tackle', 'penalties', 'heading_accuracy', 'interceptions', 'volleys', 'marking']


100%|██████████| 10/10 [00:00<00:00, 35.02it/s]


['finishing', 'jumping', 'standing_tackle', 'reactions', 'sliding_tackle', 'penalties', 'heading_accuracy', 'interceptions', 'volleys', 'marking']


100%|██████████| 10/10 [00:00<00:00, 39.34it/s]


['finishing', 'jumping', 'standing_tackle', 'reactions', 'sliding_tackle', 'penalties', 'heading_accuracy', 'interceptions', 'volleys', 'marking']
Index(['balance', 'finishing', 'freekick_accuracy', 'heading_accuracy',
       'interceptions', 'marking', 'penalties', 'positioning',
       'sliding_tackle', 'standing_tackle'],
      dtype='object')
Index(['balance', 'finishing', 'freekick_accuracy', 'heading_accuracy',
       'interceptions', 'jumping', 'marking', 'penalties', 'positioning',
       'volleys'],
      dtype='object')
Index(['finishing', 'freekick_accuracy', 'heading_accuracy', 'interceptions',
       'jumping', 'marking', 'penalties', 'positioning', 'standing_tackle',
       'volleys'],
      dtype='object')
Index(['acceleration', 'agility', 'composure', 'crossing', 'finishing',
       'heading_accuracy', 'long_shots', 'positioning', 'reactions',
       'stamina'],
      dtype='object')
Index(['acceleration', 'agility', 'composure', 'crossing', 'finishing',
       'heading

100%|██████████| 10/10 [00:00<00:00, 31.92it/s]


['finishing', 'strength', 'standing_tackle', 'reactions', 'sliding_tackle', 'volleys', 'interceptions', 'composure', 'heading_accuracy', 'marking']


100%|██████████| 10/10 [00:00<00:00, 27.39it/s]


['finishing', 'strength', 'standing_tackle', 'reactions', 'sliding_tackle', 'volleys', 'interceptions', 'composure', 'heading_accuracy', 'marking']


100%|██████████| 10/10 [00:00<00:00, 28.75it/s]


['finishing', 'strength', 'standing_tackle', 'reactions', 'sliding_tackle', 'volleys', 'interceptions', 'composure', 'heading_accuracy', 'marking']
Index(['balance', 'finishing', 'heading_accuracy', 'interceptions', 'jumping',
       'marking', 'penalties', 'positioning', 'reactions', 'shot_power'],
      dtype='object')
Index(['balance', 'finishing', 'heading_accuracy', 'interceptions', 'jumping',
       'marking', 'penalties', 'positioning', 'reactions', 'shot_power'],
      dtype='object')
Index(['balance', 'composure', 'finishing', 'heading_accuracy', 'marking',
       'penalties', 'positioning', 'reactions', 'shot_power',
       'standing_tackle'],
      dtype='object')
Index(['acceleration', 'agility', 'balance', 'ball_control', 'composure',
       'finishing', 'heading_accuracy', 'long_shots', 'positioning',
       'reactions'],
      dtype='object')
Index(['acceleration', 'agility', 'balance', 'ball_control', 'composure',
       'finishing', 'heading_accuracy', 'long_shots', 'p

100%|██████████| 10/10 [00:00<00:00, 39.63it/s]


['finishing', 'strength', 'marking', 'reactions', 'standing_tackle', 'heading_accuracy', 'volleys', 'interceptions', 'penalties', 'sliding_tackle']


100%|██████████| 10/10 [00:00<00:00, 36.70it/s]


['finishing', 'strength', 'marking', 'reactions', 'standing_tackle', 'heading_accuracy', 'volleys', 'interceptions', 'penalties', 'sliding_tackle']


100%|██████████| 10/10 [00:00<00:00, 40.88it/s]


['finishing', 'strength', 'marking', 'reactions', 'standing_tackle', 'heading_accuracy', 'volleys', 'interceptions', 'penalties', 'sliding_tackle']
Index(['finishing', 'heading_accuracy', 'interceptions', 'jumping', 'marking',
       'penalties', 'positioning', 'sliding_tackle', 'stamina', 'strength'],
      dtype='object')
Index(['finishing', 'freekick_accuracy', 'heading_accuracy', 'interceptions',
       'jumping', 'marking', 'penalties', 'positioning', 'strength',
       'volleys'],
      dtype='object')
Index(['balance', 'finishing', 'freekick_accuracy', 'heading_accuracy',
       'jumping', 'marking', 'penalties', 'stamina', 'strength', 'volleys'],
      dtype='object')
Index(['acceleration', 'agility', 'crossing', 'finishing', 'heading_accuracy',
       'marking', 'positioning', 'reactions', 'sliding_tackle', 'stamina'],
      dtype='object')
Index(['acceleration', 'agility', 'crossing', 'finishing', 'heading_accuracy',
       'marking', 'positioning', 'reactions', 'sliding_tack

100%|██████████| 10/10 [00:00<00:00, 24.13it/s]


['finishing', 'strength', 'standing_tackle', 'reactions', 'sliding_tackle', 'volleys', 'heading_accuracy', 'interceptions', 'penalties', 'composure']


100%|██████████| 10/10 [00:00<00:00, 22.93it/s]


['finishing', 'strength', 'interceptions', 'reactions', 'penalties', 'standing_tackle', 'heading_accuracy', 'volleys', 'sliding_tackle', 'composure']


100%|██████████| 10/10 [00:00<00:00, 25.07it/s]


['finishing', 'strength', 'marking', 'reactions', 'volleys', 'standing_tackle', 'penalties', 'sliding_tackle', 'composure', 'heading_accuracy']
Index(['composure', 'finishing', 'marking', 'penalties', 'positioning',
       'reactions', 'shot_power', 'stamina', 'volleys'],
      dtype='object')
Index(['composure', 'finishing', 'heading_accuracy', 'marking', 'penalties',
       'positioning', 'reactions', 'shot_power', 'stamina', 'volleys'],
      dtype='object')
Index(['composure', 'finishing', 'jumping', 'penalties', 'positioning',
       'reactions', 'shot_power', 'stamina', 'volleys'],
      dtype='object')
Index(['acceleration', 'agility', 'ball_control', 'composure', 'finishing',
       'heading_accuracy', 'long_shots', 'positioning', 'reactions',
       'sliding_tackle'],
      dtype='object')
Index(['acceleration', 'ball_control', 'composure', 'finishing',
       'heading_accuracy', 'long_shots', 'positioning', 'reactions',
       'sliding_tackle', 'stamina'],
      dtype='object