In [2]:
import pandas as pd

from scipy import stats
import math
import os

import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
def read_datasets(folder_path, dataset_types):
    """Load one CSV per dataset type into a dictionary of DataFrames.

    Each file is looked up at
    ``<folder_path>/<dataset_type>/<dataset_type>_325.csv``.

    Parameters
    ----------
    folder_path : str
        Root folder that contains one sub-folder per dataset type.
    dataset_types : iterable of str
        Names of the datasets to load; each name is both the sub-folder
        name and the file-name prefix.

    Returns
    -------
    dict
        Maps every dataset type to the DataFrame read from its CSV file.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when an expected file is missing.
    """
    dfs = {}

    for dataset_type in dataset_types:
        # os.path.join picks the separator of the running platform; the
        # previous hard-coded backslashes only resolved on Windows.
        file_path = os.path.join(folder_path, dataset_type, f'{dataset_type}_325.csv')
        dfs[dataset_type] = pd.read_csv(file_path)

    return dfs

def drop_columns(df, columns):
    """Return a copy of ``df`` without the given column label(s).

    ``df`` itself is left untouched; ``DataFrame.drop`` raises ``KeyError``
    when a requested label is not a column of ``df``.
    """
    trimmed = df.drop(columns=columns)
    return trimmed

In [4]:
def merge_dataframes(dfs, on_column):
    """Left-join a sequence of DataFrames onto the first one.

    The first frame in ``dfs`` is the base; every following frame is merged
    into the running result with ``how='left'`` on ``on_column`` (the
    notebook uses ``'player.id'``), so only keys present in the base frame
    are kept.

    Returns the merged DataFrame. With a single-element sequence the first
    frame is returned as-is.
    """
    base, remaining = dfs[0], dfs[1:]

    result = base
    for right in remaining:
        result = result.merge(right, on=on_column, how='left')

    return result

def get_params_list(merged_df: pd.DataFrame, cols_filter):
    """Return the column labels selected by ``cols_filter`` as a plain list.

    ``cols_filter`` is used to index ``merged_df`` column-wise, so the
    labels come back in the order given by ``cols_filter``.
    """
    selected_columns = merged_df[cols_filter].columns
    return list(selected_columns)

def get_player(player_id, merged_df, cols_filter):
    """Return one player's values for the ``cols_filter`` columns as a list.

    Rows whose ``'player.id'`` equals ``player_id`` are selected and
    re-indexed from 0; the values of the first matching row are returned in
    ``cols_filter`` order. A ``KeyError`` is raised when no row matches,
    because label 0 does not exist in the empty selection.
    """
    is_requested_player = merged_df['player.id'] == player_id
    matches = merged_df[is_requested_player].reset_index()
    first_row = matches[cols_filter].loc[0]
    return list(first_row)

def calculate_percentiles(params, merged_df, player):
    """Rank a player's values against the matching columns of ``merged_df``.

    ``params[i]`` names the column that ``player[i]`` is compared with.
    Each score is passed to ``scipy.stats.percentileofscore`` (default
    settings) and the result is rounded down to an integer.

    Returns a list of integer percentiles, one per entry of ``params``.
    """
    return [
        math.floor(stats.percentileofscore(merged_df[column], player[position]))
        for position, column in enumerate(params)
    ]

In [5]:
# Root folder of the CSV exports and the dataset names to load from it.
folder_path = '325'
dataset_types = [
    'attack',
    'defense',
    'keepers',
    'passing',
    'others',
    'players_info',
    'teams_info',
]

dfs = read_datasets(folder_path, dataset_types)

# Columns removed from each stat table before merging -- presumably to avoid
# duplicated 'team.id' / 'rating' columns in the merge; confirm against the data.
columns_to_drop = {
    'attack': ['team.id', 'rating'],
    'defense': ['team.id', 'rating', 'cleanSheet'],
    'keepers': ['team.id', 'rating'],
    'passing': ['team.id', 'rating'],
    'others': ['team.id', 'rating'],
}
trimmed_tables = {
    table_name: drop_columns(dfs[table_name], unwanted)
    for table_name, unwanted in columns_to_drop.items()
}

attack_df = trimmed_tables['attack']
defense_df = trimmed_tables['defense']
keepers_df = trimmed_tables['keepers']
passing_df = trimmed_tables['passing']
others_df = trimmed_tables['others']

# The player info table is used unchanged as the base of the merge.
player_info_df = dfs['players_info']

In [6]:
# Left-join every stat table onto the player info table by 'player.id'.
stat_tables = [attack_df, defense_df, keepers_df, passing_df, others_df]
merged_df = merge_dataframes([player_info_df, *stat_tables], 'player.id')

# Keep only rows with position 'F' and at least 5 appearances.
is_forward = merged_df['position'] == 'F'
has_min_appearances = merged_df['appearances'] >= 5
merged_df = merged_df.loc[is_forward & has_min_appearances]

In [7]:
# Metrics the percentile profile is built from.
forward_metric_names = [
    'accuratePasses',
    'bigChancesCreated',
    'totalPasses',
    'bigChancesMissed',
    'successfulDribbles',
    'shotsOnTarget',
    'aerialDuelsWon',
]

# DataFrame.filter keeps only the requested labels that exist in merged_df,
# so the resulting Index may be shorter than the list above.
fowards_filter = merged_df.filter(items=forward_metric_names).columns

params = get_params_list(merged_df, fowards_filter)

In [8]:
def gen_percentiles(params, merged_df: pd.DataFrame, cols_filter):
    """Build a table of percentile lists, one row per unique player.

    For every distinct ``'player.id'`` in ``merged_df`` the player's values
    for ``cols_filter`` are fetched with ``get_player`` and ranked against
    the whole frame with ``calculate_percentiles``.

    Returns a DataFrame with the columns ``'player.id'`` and
    ``'percentiles'`` (a list of integers ordered like ``params``).
    """
    records = [
        {
            'player.id': pid,
            'percentiles': calculate_percentiles(
                params, merged_df, get_player(pid, merged_df, cols_filter)
            ),
        }
        for pid in merged_df['player.id'].unique()
    ]

    return pd.DataFrame(records)

In [9]:
# Percentile profile of every filtered player; preview the first five rows.
fowards = gen_percentiles(params, merged_df, fowards_filter)
fowards.head()

Unnamed: 0,player.id,percentiles
0,34705,"[96, 100, 96, 84, 100, 99, 92]"
1,16943,"[99, 98, 100, 99, 87, 100, 78]"
2,789100,"[93, 68, 94, 49, 93, 96, 97]"
3,1046501,"[34, 83, 31, 49, 67, 68, 34]"
4,583974,"[61, 45, 68, 97, 34, 94, 99]"


In [10]:
# Build the output path with os.path.join so the export also works outside
# Windows, and create the target folder first so a fresh run does not fail
# when '325/percentiles' does not exist yet.
percentiles_dir = os.path.join("325", "percentiles")
os.makedirs(percentiles_dir, exist_ok=True)
fowards.to_csv(os.path.join(percentiles_dir, "fowards_percentiles.csv"), index=False)