In [11]:
import pandas as pd

from scipy import stats
import math
import os

import warnings

# Silence every FutureWarning for the rest of the session so deprecation
# notices do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)

In [12]:
def read_datasets(folder_path, dataset_types, file_suffix='325'):
    """Load one CSV per dataset type into a dict of DataFrames.

    Each file is expected at
    ``<folder_path>/<dataset_type>/<dataset_type>_<file_suffix>.csv``.

    Args:
        folder_path: Root directory holding one sub-folder per dataset type.
        dataset_types: Iterable of dataset type names; each name is used both
            as the sub-folder name and as the file name prefix.
        file_suffix: Text placed after the underscore in every file name.
            Defaults to ``'325'`` so existing two-argument calls are unchanged.

    Returns:
        dict mapping each dataset type to the DataFrame read from its CSV.

    Raises:
        FileNotFoundError: If an expected CSV does not exist (raised by
            ``pd.read_csv``).
    """
    dfs = {}

    for dataset_type in dataset_types:
        # Build the path with os.path.join so it resolves on every OS; literal
        # backslash separators are only understood on Windows.
        file_path = os.path.join(
            folder_path, dataset_type, f'{dataset_type}_{file_suffix}.csv'
        )
        dfs[dataset_type] = pd.read_csv(file_path)

    return dfs

def drop_columns(df, columns):
    """Return a new DataFrame equal to ``df`` minus the given column labels.

    Args:
        df: Source DataFrame; it is not modified.
        columns: A single column label or a list of labels to remove.

    Returns:
        The DataFrame produced by ``df.drop`` without those columns.
    """
    return df.drop(columns=columns)

In [13]:
def merge_dataframes(dfs, on_column):
    """Left-join a sequence of DataFrames onto the first one.

    Args:
        dfs: Non-empty sequence of DataFrames; ``dfs[0]`` is the base frame
            and every later frame is merged onto the running result in order.
        on_column: Column name (or list of names) to join on.

    Returns:
        The merged DataFrame. Because every join is a left join, only keys
        present in the running result are kept.
    """
    combined = dfs[0]

    for position in range(1, len(dfs)):
        combined = combined.merge(dfs[position], on=on_column, how='left')

    return combined

def get_params_list(merged_df: pd.DataFrame, cols_filter):
    """Return the column names selected by ``cols_filter`` as a plain list.

    Args:
        merged_df: DataFrame to select from.
        cols_filter: List-like of column labels; each one must exist in
            ``merged_df`` or the selection raises ``KeyError``.

    Returns:
        list of the selected column labels, in the order given.
    """
    return list(merged_df[cols_filter].columns)

def get_player(player_id, merged_df, cols_filter):
    """Return the selected stat values of one player as a list.

    Args:
        player_id: Value to look up in the ``'player.id'`` column.
        merged_df: DataFrame that contains ``'player.id'`` and the stat columns.
        cols_filter: List-like of column labels to extract.

    Returns:
        list of the values in ``cols_filter`` order, taken from the first row
        whose ``'player.id'`` equals ``player_id``.

    Raises:
        KeyError: If no row matches ``player_id`` (label 0 is then missing).
    """
    id_matches = merged_df['player.id'] == player_id
    # reset_index() renumbers the matching rows so the first is at label 0.
    matching_rows = merged_df.loc[id_matches].reset_index()
    first_row = matching_rows[cols_filter].loc[0]
    return list(first_row)

def calculate_percentiles(params, merged_df, player):
    """Rank one player's values against every row of ``merged_df``.

    Args:
        params: Sequence of column names to evaluate.
        merged_df: DataFrame providing the reference distribution per column.
        player: Sequence of the player's values, aligned by position with
            ``params``.

    Returns:
        list of ints, one per entry of ``params``: the percentile of the
        player's value within that column (``scipy.stats.percentileofscore``
        with its default settings), rounded down with ``math.floor``.
    """
    return [
        math.floor(stats.percentileofscore(merged_df[column], player[position]))
        for position, column in enumerate(params)
    ]

In [15]:

# Locations of the engineered CSV files.
BASE_DIR = 'data'
ENGINEERED_DIR = os.path.join(BASE_DIR, 'engineered')
folder_path = os.path.join(ENGINEERED_DIR, '325')

# One sub-folder / CSV is read per dataset type.
dataset_types = [
    'attack',
    'defense',
    'keepers',
    'passing',
    'others',
    'players_info',
    'teams_info',
]

dfs = read_datasets(folder_path=folder_path, dataset_types=dataset_types)

# Remove the team id and rating columns from every stats table (plus
# 'cleanSheet' from the defense table) before the tables are used further.
attack_df = drop_columns(df=dfs['attack'], columns=['team.id', 'rating'])
defense_df = drop_columns(df=dfs['defense'], columns=['team.id', 'rating', 'cleanSheet'])
keepers_df = drop_columns(df=dfs['keepers'], columns=['team.id', 'rating'])
passing_df = drop_columns(df=dfs['passing'], columns=['team.id', 'rating'])
others_df = drop_columns(df=dfs['others'], columns=['team.id', 'rating'])

# The player info table is used as loaded.
player_info_df = dfs['players_info']

In [30]:
# Start from the player info table and left-join every stats table onto it.
merged_df = merge_dataframes(
    dfs=[player_info_df, attack_df, defense_df, keepers_df, passing_df, others_df],
    on_column='player.id',
)

# One subset per position code, limited to players with at least 5 appearances.
filtered_foward = merged_df.loc[merged_df['position'].eq('F') & merged_df['appearances'].ge(5)]
filtered_mid = merged_df.loc[merged_df['position'].eq('M') & merged_df['appearances'].ge(5)]
filtered_def = merged_df.loc[merged_df['position'].eq('D') & merged_df['appearances'].ge(5)]
filtered_gk = merged_df.loc[merged_df['position'].eq('G') & merged_df['appearances'].ge(5)]

In [31]:
# Stat columns evaluated for each position. DataFrame.filter(items=...) keeps
# only the listed labels that exist in the frame, so each *_filter is the
# Index of requested columns that are actually available.
fowards_filter = filtered_foward.filter(items=[
    'accuratePasses',
    'expectedGoals',
    'goalsFromInsideTheBox',
    'headedGoals',
    'bigChancesMissed',
    'successfulDribbles',
    'shotsOnTarget',
    'aerialDuelsWon',
]).columns

mid_filter = filtered_mid.filter(items=[
    'accuratePasses',
    'totalPasses',
    'accurateFinalThirdPasses',
    'keyPasses',
    'bigChancesCreated',
    'successfulDribbles',
    'totalDuelsWon',
    'interceptions',
]).columns

def_filter = filtered_def.filter(items=[
    'accuratePasses',
    'tackles',
    'interceptions',
    'dribbledPast',
    'errorLeadToGoal',
    'headedGoals',
    'groundDuelsWon',
    'aerialDuelsWon',
]).columns

keepers_filter = filtered_gk.filter(items=[
    'accuratePasses',
    'saves',
    'savedShotsFromInsideTheBox',
    'penaltySave',
    'goalsConcededOutsideTheBox',
    'punches',
    'successfulRunsOut',
    'highClaims',
]).columns

# Column names of the forwards filter as a plain list.
params = get_params_list(merged_df=filtered_foward, cols_filter=fowards_filter)

In [32]:
def gen_percentiles(params, merged_df: pd.DataFrame, cols_filter):
    """Build a table of percentile lists, one row per distinct player.

    Args:
        params: Column names passed to ``calculate_percentiles``.
        merged_df: DataFrame holding ``'player.id'`` and the stat columns; it
            is both the source of each player's values and the reference
            population for the percentiles.
        cols_filter: Column labels passed to ``get_player`` to extract each
            player's values.

    Returns:
        DataFrame with a ``'player.id'`` column and a ``'percentiles'`` column
        whose cells are the lists returned by ``calculate_percentiles``.
    """
    def _percentile_record(pid):
        # Extract the player's values, then rank them against the whole frame.
        player_values = get_player(pid, merged_df, cols_filter)
        return {
            'player.id': pid,
            'percentiles': calculate_percentiles(params, merged_df, player_values),
        }

    unique_ids = merged_df['player.id'].unique()
    return pd.DataFrame([_percentile_record(pid) for pid in unique_ids])

In [33]:
# Forwards: compute percentile lists and write them next to the input data.
params = get_params_list(merged_df=filtered_foward, cols_filter=fowards_filter)
fowards = gen_percentiles(params=params, merged_df=filtered_foward, cols_filter=fowards_filter)
fowards.to_csv(
    path_or_buf=os.path.join(folder_path, "fowards_percentiles.csv"),
    index=False,
)

In [35]:
# Midfielders: compute percentile lists and write them next to the input data.
params = get_params_list(merged_df=filtered_mid, cols_filter=mid_filter)
mid = gen_percentiles(params=params, merged_df=filtered_mid, cols_filter=mid_filter)
mid.to_csv(
    path_or_buf=os.path.join(folder_path, "mid_percentiles.csv"),
    index=False,
)

In [38]:
# Defenders: compute percentile lists and write them next to the input data.
params = get_params_list(merged_df=filtered_def, cols_filter=def_filter)
defe = gen_percentiles(params=params, merged_df=filtered_def, cols_filter=def_filter)
defe.to_csv(
    path_or_buf=os.path.join(folder_path, "def_percentiles.csv"),
    index=False,
)

In [39]:
# Goalkeepers: compute percentile lists and write them next to the input data.
params = get_params_list(merged_df=filtered_gk, cols_filter=keepers_filter)
gk = gen_percentiles(params=params, merged_df=filtered_gk, cols_filter=keepers_filter)
gk.to_csv(
    path_or_buf=os.path.join(folder_path, "gk_percentiles.csv"),
    index=False,
)