In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from ipywidgets import fixed, interact, interactive, interact_manual
import ipywidgets as widgets
from IPython.display import HTML, display

pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_columns', None)

In [2]:
def read_games_data():
    return pd.read_csv(filepath_or_buffer="../data/games.csv", sep=',')

def read_players_data():
    return pd.read_csv(filepath_or_buffer="../data/players.csv", sep=',')

In [3]:
%%time

df_games = read_games_data()
df_players = read_players_data()

Wall time: 253 ms


In [4]:
players_list = sorted(df_players['player'].unique().tolist())
len(players_list)

2601

## `players_list` accessed globally - Change this

In [5]:
def search_player(name):
    name = str(name).lower()
    df_players_series = pd.DataFrame(data={
        'player': players_list
    })
    df_players_series['player_lowercase'] = df_players_series['player'].str.lower()
    df_matches = df_players_series[df_players_series['player_lowercase'].str.contains(name)]
    df_matches.reset_index(drop=True, inplace=True)
    if df_matches.empty:
        return []
    list_matching_player_names = df_matches['player'].tolist()
    return list_matching_player_names

In [6]:
df_games.shape, df_players.shape

((3450, 127), (2732, 151))

In [7]:
# print(df_games.columns.tolist(), "\n\n", df_players.columns.tolist())

In [8]:
def convert_dtypes(dataframe, columns, dtypes):
    """
    Converts datatypes of columns in DataFrame.
    Parameters:
        - dataframe (Pandas DataFrame): DataFrame to save
        - columns (list): List of column-names
        - dtypes (list): List of datatypes (in same order as columns)
    Usage example:
        - convert_dtypes(dataframe=df, columns=['age', 'name', 'gpa'], dtypes=['int', 'str', 'float'])
    Returns DataFrame with specified columns converted to appropriate datatype.
    """
    dataframe_altered = dataframe.copy()
    columns_in_dataframe = dataframe_altered.columns.tolist()
    for column, dtype in zip(columns, dtypes):
        if column in columns_in_dataframe:
            dataframe_altered[column] = dataframe_altered[column].astype(dtype)
    return dataframe_altered


def get_timetaken_fstring(num_seconds):
    """ Returns formatted-string of time elapsed, given the number of seconds (int) elapsed """
    if num_seconds < 60:
        secs = num_seconds
        fstring_timetaken = f"{secs}s"
    elif 60 < num_seconds < 3600:
        mins, secs = divmod(num_seconds, 60)
        fstring_timetaken = f"{mins}m {secs}s"
    else:
        hrs, secs_remainder = divmod(num_seconds, 3600)
        mins, secs = divmod(secs_remainder, 60)
        fstring_timetaken = f"{hrs}h {mins}m {secs}s"
    return fstring_timetaken


def run_and_timeit(func):
    """
    Takes in function-name; then runs it, times it, and prints out the time taken.
    Parameters:
        - func (object): Object of the function you want to execute.
    """
    start = time.time()
    warnings.filterwarnings(action='ignore')
    func()
    end = time.time()
    timetaken_in_secs = int(np.ceil(end - start))
    timetaken_fstring = get_timetaken_fstring(num_seconds=timetaken_in_secs)
    print(f"\nDone! Time taken: {timetaken_fstring}")
    return None

In [9]:
def pickle_load(filename):
    """ Loads data from pickle file, via joblib module """
    data_obj = joblib.load(filename=filename)
    return data_obj


def pickle_save(data_obj, filename):
    """ Stores data as pickle file, via joblib module """
    joblib.dump(value=data_obj, filename=filename)
    return None


def get_points_from_result(result_obj):
    result_obj = str(result_obj).strip().upper()
    if result_obj == 'W':
        points = 3
    elif result_obj == 'D':
        points = 1
    elif result_obj == 'L':
        points = 0
    else:
        raise ValueError(f"Result object not in ['W', 'L', 'D']. Object is: '{result_obj}'")
    return points

## Transformation pipeline

In [10]:
columns_player_features = pickle_load(filename="../pickle/player_feature_columns.pkl")
len(columns_player_features)

101

In [11]:
df_games['points_obtained'] = df_games['result'].apply(get_points_from_result)
df_players.drop_duplicates(subset=['player'], keep='last', inplace=True)

In [12]:
# Per 90 stats
for column in columns_player_features:
    if column != 'minutes':
        df_players[column] = (df_players[column] / df_players['minutes']) * 90

In [13]:
columns_to_drop = ['goals_per90', 'cards_yellow', 'cards_red', 'assists_per90', 'goals_assists_per90',
                   'goals_pens_per90', 'goals_assists_pens_per90', 'xg_per90', 'xa_per90', 'xg_xa_per90',
                   'npxg_per90', 'npxg_xa_per90', 'minutes_90s', 'shots_total_per90', 'shots_on_target_per90',
                   'xa_net', 'sca_per90', 'gca_per90', 'passes_received', 'cards_yellow_red', 'fouls',
                   'fouled', 'offsides', 'pens_won', 'pens_conceded', 'own_goals', 'ball_recoveries',
                   'aerials_won', 'aerials_lost', 'aerials_won_pct']
df_players.drop(labels=columns_to_drop, axis=1, inplace=True)

In [14]:
df_players.sample(5)

Unnamed: 0,player,nationality,position,squad,age,birth_year,games,games_starts,minutes,goals,assists,pens_made,pens_att,xg,npxg,xa,shots_total,shots_on_target,shots_free_kicks,shots_on_target_pct,goals_per_shot,goals_per_shot_on_target,npxg_per_shot,xg_net,npxg_net,passes_completed,passes,passes_pct,passes_total_distance,passes_progressive_distance,passes_completed_short,passes_short,passes_pct_short,passes_completed_medium,passes_medium,passes_pct_medium,passes_completed_long,passes_long,passes_pct_long,assisted_shots,passes_into_final_third,passes_into_penalty_area,crosses_into_penalty_area,progressive_passes,passes_live,passes_dead,passes_free_kicks,through_balls,passes_pressure,passes_switches,crosses,corner_kicks,corner_kicks_in,corner_kicks_out,corner_kicks_straight,passes_ground,passes_low,passes_high,passes_left_foot,passes_right_foot,passes_head,throw_ins,passes_other_body,passes_offsides,passes_oob,passes_intercepted,passes_blocked,sca,sca_passes_live,sca_passes_dead,sca_dribbles,sca_shots,sca_fouled,gca,gca_passes_live,gca_passes_dead,gca_dribbles,gca_shots,gca_fouled,gca_og_for,tackles,tackles_won,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,dribble_tackles,dribbles_vs,dribble_tackles_pct,dribbled_past,pressures,pressure_regains,pressure_regain_pct,pressures_def_3rd,pressures_mid_3rd,pressures_att_3rd,blocks,blocked_shots,blocked_shots_saves,blocked_passes,interceptions,clearances,errors,touches,touches_def_pen_area,touches_def_3rd,touches_mid_3rd,touches_att_3rd,touches_att_pen_area,touches_live_ball,dribbles_completed,dribbles,dribbles_completed_pct,players_dribbled_past,nutmegs,carries,carry_distance,carry_progressive_distance,pass_targets,passes_received_pct,miscontrols,dispossessed
977,Moi Gómez,es ESP,"FW,MF",Villarreal,25,1994,37.0,26.0,2319.0,0.194049,0.155239,0.0,0.0,0.12031,0.12031,0.116429,1.435964,0.543338,0.03881,37.8,0.14,0.36,0.09,0.073739,0.073739,35.743855,44.049159,81.1,617.930142,149.417853,0.853816,2.40621,35.5,29.184994,32.445019,90.0,5.705045,9.19793,62.0,0.892626,2.173351,1.319534,0.620957,3.842173,43.428202,0.620957,0.116429,0.116429,8.460543,1.746442,3.065977,0.310479,0.116429,0.03881,0.03881,29.417853,6.636481,7.994825,15.523933,26.895213,1.047865,0.155239,0.07762,0.155239,0.543338,0.698577,1.862872,2.3674,1.824062,0.07762,0.116429,0.194049,0.155239,0.426908,0.271669,0.0,0.03881,0.03881,0.07762,0.0,1.086675,0.620957,0.504528,0.310479,0.271669,0.310479,1.009056,30.8,0.698577,16.455369,4.54075,27.6,3.919793,8.771022,3.764554,1.979301,0.116429,0.0,1.862872,0.931436,0.737387,0.03881,54.489004,1.513583,9.12031,25.653299,23.208279,2.794308,53.906856,0.815006,1.862872,43.8,1.009056,0.07762,40.362225,237.011643,117.089263,50.452781,82.3,1.435964,1.280724
2689,Amin Younes,de GER,"FW,MF",Napoli,25,1993,9.0,1.0,176.0,0.511364,0.0,0.0,0.0,0.460227,0.460227,0.153409,3.579545,0.511364,0.0,14.3,0.14,1.0,0.13,0.051136,0.051136,45.511364,56.761364,80.2,608.522727,169.261364,1.534091,3.579545,42.9,41.420455,46.534091,89.0,2.556818,6.647727,38.5,2.045455,3.068182,2.045455,0.511364,5.625,56.25,0.511364,0.0,0.0,18.920455,0.0,0.511364,0.0,0.0,0.0,0.0,46.022727,8.181818,2.556818,8.181818,47.556818,0.511364,0.511364,0.0,0.0,0.511364,2.045455,2.556818,5.113636,5.113636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.045455,1.534091,0.511364,1.022727,0.511364,0.511364,1.022727,50.0,0.511364,20.454545,6.647727,32.5,3.068182,12.784091,4.602273,1.022727,0.0,0.0,1.022727,0.0,0.511364,0.0,73.125,1.022727,8.693182,37.840909,32.727273,7.670455,72.613636,1.534091,4.090909,37.5,1.534091,0.511364,55.738636,359.488636,207.102273,67.5,89.4,2.045455,1.534091
1331,Juraj Kucka,sk SVK,"MF,FW",Parma,32,1987,26.0,22.0,1932.0,0.279503,0.139752,0.093168,0.093168,0.214286,0.14441,0.088509,2.096273,0.465839,0.046584,22.2,0.09,0.4,0.07,0.065217,0.041925,23.664596,33.959627,69.7,415.015528,135.838509,0.652174,1.723602,37.8,19.23913,25.434783,75.6,3.773292,6.801242,55.5,0.885093,1.863354,0.512422,0.093168,2.795031,33.167702,0.791925,0.232919,0.139752,8.990683,1.21118,1.21118,0.0,0.0,0.0,0.0,17.748447,7.173913,9.037267,7.965839,19.704969,2.888199,0.372671,0.139752,0.279503,0.559006,1.350932,1.257764,2.049689,1.537267,0.0,0.186335,0.139752,0.186335,0.465839,0.232919,0.0,0.093168,0.093168,0.046584,0.0,2.375776,1.630435,1.164596,0.931677,0.279503,0.838509,3.307453,25.4,2.468944,20.729814,5.403727,26.1,6.149068,10.807453,3.773292,1.677019,0.326087,0.0,1.350932,1.21118,1.863354,0.0,48.86646,3.02795,10.481366,25.434783,15.698758,2.65528,47.981366,1.956522,2.888199,67.7,2.003106,0.512422,29.767081,162.717391,84.31677,41.506211,77.1,2.65528,1.164596
1836,Marco Olivieri,it ITA,FW,Juventus,20,1999,3.0,0.0,30.0,0.0,0.0,0.0,0.0,0.3,0.3,0.0,3.0,3.0,0.0,100.0,0.0,0.0,0.06,-0.3,-0.3,33.0,45.0,73.3,534.0,69.0,0.0,3.0,0.0,30.0,33.0,90.9,3.0,9.0,33.3,0.0,0.0,0.0,0.0,0.0,39.0,6.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,36.0,6.0,3.0,9.0,30.0,0.0,6.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,3.0,6.0,50.0,3.0,39.0,6.0,15.4,3.0,18.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,0.0,6.0,15.0,45.0,15.0,60.0,3.0,3.0,100.0,3.0,0.0,54.0,348.0,183.0,63.0,90.5,6.0,6.0
593,Mickaël Cuisance,fr FRA,"MF,FW",Bayern Munich,19,1999,9.0,3.0,307.0,0.29316,0.0,0.0,0.0,0.175896,0.175896,0.058632,3.517915,0.29316,0.29316,8.3,0.08,1.0,0.05,0.117264,0.117264,45.732899,56.286645,81.3,896.188925,220.749186,0.29316,2.345277,12.5,33.127036,36.938111,89.7,12.312704,17.003257,72.4,1.172638,4.690554,1.172638,0.29316,5.863192,50.423453,5.863192,1.172638,0.0,6.742671,2.638436,4.397394,3.811075,1.465798,0.0,1.465798,40.162866,8.501629,7.62215,46.612378,8.501629,0.879479,0.29316,0.0,0.0,1.465798,2.345277,1.465798,4.104235,3.224756,0.29316,0.29316,0.0,0.29316,0.879479,0.586319,0.0,0.0,0.0,0.29316,0.0,1.465798,0.879479,0.29316,0.586319,0.586319,0.29316,2.931596,10.0,2.638436,25.504886,5.570033,21.8,4.690554,11.433225,9.381107,1.465798,0.29316,0.0,1.172638,0.879479,0.586319,0.0,67.71987,0.879479,8.501629,34.006515,30.19544,2.931596,62.149837,3.224756,5.276873,61.1,3.517915,0.29316,51.302932,295.504886,169.739414,58.631922,90.5,0.879479,1.172638


In [15]:
irrelevant_features = ['player', 'nationality', 'position', 'squad', 'age', 'birth_year', 'games', 'games_starts', 'minutes']
df_players_new = df_players.drop(labels=irrelevant_features, axis=1)
new_columns = df_players_new.columns.tolist()

In [16]:
scaler = MinMaxScaler()
for column in new_columns:
    df_players_new[column] = scaler.fit_transform(df_players_new[[column]])

In [17]:
df_games.rename(columns={'xg_for': 'xg'}, inplace=True)
df_players_new1 = df_players_new.copy()

In [18]:
def subset_shot_data(data):
    columns = ['goals', 'xg', 'npxg', 'shots_total', 'shots_on_target', 'shots_free_kicks', 'shots_on_target_pct',
               'goals_per_shot', 'goals_per_shot_on_target', 'npxg_per_shot', 'xg_net', 'npxg_net']
    return data.loc[:, columns]


def subset_creativity_data(data):
    columns = ['sca', 'sca_passes_live', 'sca_passes_dead', 'sca_dribbles', 'sca_shots', 'sca_fouled', 'assisted_shots',
               'through_balls', 'gca', 'gca_passes_live', 'gca_passes_dead', 'gca_dribbles', 'gca_shots', 'gca_fouled',
               'gca_og_for','assists','xa']
    return data.loc[:, columns]


def subset_passing_data(data):
    columns = ['passes_completed', 'passes', 'passes_pct', 'passes_total_distance', 'passes_progressive_distance',
               'passes_completed_short', 'passes_short', 'passes_pct_short', 'passes_completed_medium', 'passes_medium',
               'passes_pct_medium', 'passes_completed_long', 'passes_long', 'passes_pct_long', 'passes_into_final_third',
               'passes_into_penalty_area', 'crosses_into_penalty_area', 'progressive_passes', 'passes_live', 'passes_dead',
               'passes_free_kicks', 'passes_pressure', 'passes_switches', 'crosses', 'corner_kicks', 'corner_kicks_in',
               'corner_kicks_out', 'corner_kicks_straight', 'passes_ground', 'passes_low', 'passes_high', 'passes_left_foot',
               'passes_right_foot', 'passes_head', 'throw_ins', 'passes_other_body', 'passes_offsides', 'passes_oob',
               'passes_intercepted', 'passes_blocked']
    return data.loc[:, columns]


def subset_defending_data(data):
    columns = ['tackles', 'tackles_won', 'tackles_def_3rd', 'tackles_mid_3rd', 'tackles_att_3rd', 'dribble_tackles',
               'dribbles_vs', 'dribble_tackles_pct', 'dribbled_past', 'pressures', 'pressure_regains', 'pressure_regain_pct',
               'pressures_def_3rd', 'pressures_mid_3rd', 'pressures_att_3rd', 'blocks', 'blocked_shots', 'blocked_shots_saves',
               'blocked_passes', 'interceptions', 'clearances', 'errors']
    return data.loc[:, columns]


def subset_possession_data(data):
    columns = ['touches', 'touches_def_pen_area', 'touches_def_3rd', 'touches_mid_3rd', 'touches_att_3rd',
               'touches_att_pen_area', 'touches_live_ball', 'dribbles_completed', 'dribbles', 'dribbles_completed_pct',
               'players_dribbled_past', 'nutmegs', 'carries', 'carry_distance', 'carry_progressive_distance',
               'pass_targets', 'passes_received_pct', 'miscontrols', 'dispossessed']
    return data.loc[:, columns]

In [19]:
# players = np.array(df_players['player'])
# teams = np.array(df_players['squad'].unique())
# teams = np.append('Overall', sorted(teams))

## PCA related

In [35]:
# Inputs

player_name = 'Robert Lewandowski' # Robert Lewandowski, Joshua Kimmich, Javi Martínez, Benjamin Pavard, Alphonso Davies
team = 'Bayern Munich'
skill = 'Overall' # Options: ['Overall', 'Possession', 'Shooting', 'Passing', 'Chance creation', 'Defensive work']
number_of_results = 20

In [21]:
search_player(name='davi')

['Alphonso Davies',
 'Ben Davies',
 'David Abraham',
 'David Alaba',
 'David Brooks',
 'David Costas',
 'David García',
 'David Juncà',
 'David Luiz',
 'David López',
 'David Martin',
 'David McGoldrick',
 'David Ospina',
 'David Silva',
 'David Soria',
 'David Timor',
 'David Zurutuza',
 'David de Gea',
 'Davide Biraschi',
 'Davide Calabria',
 'Davide Faraoni',
 'Davide Riccardi',
 'Davide Santon',
 'Davide Zappacosta',
 'Davie Selke',
 'Davinson Sánchez',
 'Keinan Davis',
 'Tom Davies']

In [36]:
'player' in df_players_new1.columns

False

In [37]:
df_players_new1 = df_players_new.copy()
print("df_players_new1.shape: ", df_players_new1.shape)
df_games = df_games.loc[:, ~df_games.T.duplicated(keep='first')]
df_games = df_games.loc[:, ~df_games.columns.duplicated()]

if team == 'Overall':
    corrMatrix = df_games.corr()
else:
    corrMatrix = df_games[df_games['for'] == team].corr()


player_features = df_players_new1.columns.tolist()
for player_feature in player_features:
    df_players_new1[player_feature] = (df_players_new1[player_feature]) * (corrMatrix['points_obtained'][player_feature])

if skill == 'Overall':
    df_players_by_skill = df_players_new1.copy()
elif skill == 'Possession':
    df_players_by_skill = subset_possession_data(data=df_players_new1)
elif skill == 'Shooting':
    df_players_by_skill = subset_shot_data(data=df_players_new1)
elif skill == 'Passing':
    df_players_by_skill = subset_passing_data(data=df_players_new1)
elif skill == 'Chance creation':
    df_players_by_skill = subset_creativity_data(data=df_players_new1)
elif skill == 'Defensive work':
    df_players_by_skill = subset_defending_data(data=df_players_new1)
else:
    raise ValueError(f"Invalid skill entered: '{skill}'")

df_players_new1.shape:  (2601, 112)


In [38]:
df_players_by_skill.head()

Unnamed: 0,goals,assists,pens_made,pens_att,xg,npxg,xa,shots_total,shots_on_target,shots_free_kicks,shots_on_target_pct,goals_per_shot,goals_per_shot_on_target,npxg_per_shot,xg_net,npxg_net,passes_completed,passes,passes_pct,passes_total_distance,passes_progressive_distance,passes_completed_short,passes_short,passes_pct_short,passes_completed_medium,passes_medium,passes_pct_medium,passes_completed_long,passes_long,passes_pct_long,assisted_shots,passes_into_final_third,passes_into_penalty_area,crosses_into_penalty_area,progressive_passes,passes_live,passes_dead,passes_free_kicks,through_balls,passes_pressure,passes_switches,crosses,corner_kicks,corner_kicks_in,corner_kicks_out,corner_kicks_straight,passes_ground,passes_low,passes_high,passes_left_foot,passes_right_foot,passes_head,throw_ins,passes_other_body,passes_offsides,passes_oob,passes_intercepted,passes_blocked,sca,sca_passes_live,sca_passes_dead,sca_dribbles,sca_shots,sca_fouled,gca,gca_passes_live,gca_passes_dead,gca_dribbles,gca_shots,gca_fouled,gca_og_for,tackles,tackles_won,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,dribble_tackles,dribbles_vs,dribble_tackles_pct,dribbled_past,pressures,pressure_regains,pressure_regain_pct,pressures_def_3rd,pressures_mid_3rd,pressures_att_3rd,blocks,blocked_shots,blocked_shots_saves,blocked_passes,interceptions,clearances,errors,touches,touches_def_pen_area,touches_def_3rd,touches_mid_3rd,touches_att_3rd,touches_att_pen_area,touches_live_ball,dribbles_completed,dribbles,dribbles_completed_pct,players_dribbled_past,nutmegs,carries,carry_distance,carry_progressive_distance,pass_targets,passes_received_pct,miscontrols,dispossessed
0,0.006563,0.008857,0.001082,0.001082,0.006528,0.003543,0.001522,0.00036,0.000416,0.001892,0.05421,0.049589,0.232113,0.022727,0.095088,0.084927,-0.010559,-0.013482,-0.049818,-0.012665,-0.004935,0.000907,0.003905,-0.005367,-0.007733,-0.009619,0.007638,-0.012457,-0.022307,-0.051669,0.001265,0.00722,0.001167,0.001152,-0.003005,-0.006493,-0.084939,-0.080317,0.0,-0.038447,-0.000261,-0.002021,0.000631,0.008128,-0.003706,8.1e-05,-0.007962,0.01681,-0.022346,0.033592,-0.005594,0.001269,-0.05851,-0.000563,-0.001672,0.001339,-0.00867,0.003804,0.00329,0.001156,-0.007527,0.004878,0.005176,-0.000158,0.012983,0.006756,0.007248,0.003096,0.0,0.0,0.0,0.004384,0.000721,0.000606,0.003961,0.000799,0.002714,0.000262,0.034325,-0.000617,0.003662,-0.00019,-0.016478,0.0011,0.000981,0.000842,0.005897,-0.000216,0.002778,0.00523,-0.000139,0.001276,-0.002536,-0.001291,-0.000567,-0.000642,-0.001511,0.000686,-0.000218,-0.000678,-0.000649,-0.000476,-0.063733,-0.000559,0.000407,-0.002537,-0.003516,-0.002694,-0.001332,0.243648,-0.000426,-0.000249
1,0.0,0.003427,0.0,0.0,0.001098,0.000914,0.001292,0.00015,0.000258,0.0,0.08075,0.0,0.0,0.011364,0.093861,0.084031,-0.008763,-0.011276,-0.049376,-0.009359,-0.002815,0.001403,0.005179,-0.006249,-0.006671,-0.008351,0.007585,-0.007441,-0.014279,-0.048225,0.000734,0.002892,0.001156,0.002318,-0.001324,-0.005798,-0.056285,-0.027064,6.6e-05,-0.038996,-5.8e-05,-0.00128,0.0,0.0,-0.0,0.0,-0.006744,0.015794,-0.015539,0.005709,-0.018633,0.002103,-0.049553,-0.00029,-0.0,0.002443,-0.0041,0.004384,0.002217,0.001174,-0.000506,0.003775,0.006007,-0.000245,0.006027,0.001742,0.005609,0.0,0.0,0.003221,0.0,0.004257,0.000627,0.000568,0.003256,0.002163,0.002005,0.000169,0.039385,-0.000358,0.005146,-0.000263,-0.016181,0.001504,0.001518,0.001092,0.008006,-0.000386,0.0043,0.006438,-8.2e-05,0.001267,-0.0,-0.001159,-0.000822,-0.000754,-0.001196,0.000592,-0.000172,-0.000656,-0.001366,-0.000961,-0.06641,-0.001321,0.000315,-0.002593,-0.004566,-0.003618,-0.001336,0.235182,-0.000612,-0.000577
2,0.006529,0.0,0.0,0.0,0.003671,0.003055,4.9e-05,0.000262,0.000414,0.0,0.074256,0.099178,0.348169,0.026515,0.096206,0.086131,-0.011369,-0.013042,-0.055445,-0.017509,-0.00479,0.000507,0.001595,-0.007337,-0.006431,-0.007544,0.008095,-0.03005,-0.038198,-0.072779,7.9e-05,0.003802,0.000139,0.0,-0.001149,-0.0079,-0.017209,-0.096656,8.4e-05,-0.031702,-0.000364,-2.3e-05,0.0,0.0,-0.0,0.0,-0.009727,0.007323,-0.016855,0.033787,-0.008935,0.003966,-0.000635,-0.00056,-0.000624,0.001999,-0.00599,0.000966,0.000845,0.000395,-0.0,0.0,0.010299,-7.9e-05,0.002583,0.00224,0.0,0.0,0.0,0.0,0.0,0.005217,0.000829,0.000786,0.00394,0.000795,0.001718,0.000135,0.042168,-0.000268,0.004079,-0.000284,-0.02207,0.001301,0.001333,0.000294,0.004426,-0.000314,0.0,0.002838,-0.000147,0.003145,-0.000841,-0.001296,-0.001519,-0.001366,-0.001364,7.8e-05,-0.000115,-0.000825,-0.000753,-0.000342,-0.102993,-0.000648,0.0,-0.003031,-0.00475,-0.003688,-0.001554,0.258198,-0.000242,-0.000124
3,0.034933,0.0,0.0,0.0,0.058921,0.049033,0.001568,0.001328,0.003987,0.0,0.141171,0.105377,0.191493,0.079545,0.088892,0.079583,-0.002473,-0.005527,-0.02845,-0.003009,-0.001122,0.0,0.005567,-0.0,-0.001941,-0.003669,0.005019,-0.002061,-0.007556,-0.025261,0.000631,0.002713,0.001491,0.003679,-0.001111,-0.00353,-0.0,-0.0,0.0,-0.022124,-0.000238,-0.002935,0.0,0.0,-0.0,0.0,-0.002871,0.007502,-0.011661,0.004304,-0.008025,0.004823,-0.0,-0.0,-0.0,0.001528,-0.003846,0.007754,0.001695,0.000577,-0.0,0.025967,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001373,0.00036,0.000293,0.0,0.0,0.00197,0.000232,0.028084,-0.000615,0.005475,-0.000369,-0.021415,0.000994,0.002427,0.001817,0.004956,-0.000265,0.0,0.003796,-4.8e-05,0.000608,-0.0,-0.000695,-9.5e-05,-0.000175,-0.000601,0.000679,-0.000683,-0.000461,-0.001151,-0.000844,-0.063733,-0.001487,0.003248,-0.001499,-0.002772,-0.002433,-0.001517,0.133861,-0.001942,-0.000497
4,0.003018,0.01222,0.0,0.0,0.006266,0.005214,0.000949,0.000631,0.000804,0.002611,0.059856,0.018596,0.081239,0.018939,0.093364,0.083586,-0.009511,-0.011426,-0.052916,-0.012044,-0.002829,0.000469,0.00202,-0.005367,-0.006624,-0.007768,0.008095,-0.014516,-0.023317,-0.057587,0.001418,0.009961,0.000386,0.001272,-0.001574,-0.006588,-0.028457,-0.07507,0.000117,-0.037516,-0.000514,-0.001965,0.00091,0.007802,-0.005454,0.000262,-0.00808,0.008139,-0.016122,0.004835,-0.023758,0.003501,-0.001174,-0.002072,-0.000865,0.003036,-0.003655,0.00134,0.003295,0.001345,-0.006322,0.002244,0.007142,-0.000109,0.014331,0.006214,0.010001,0.0,0.009616,0.0,0.0,0.007116,0.001274,0.000608,0.011612,0.001102,0.004086,0.000415,0.032638,-0.00101,0.00789,-0.000404,-0.01624,0.001155,0.004442,0.001978,0.00257,-6.9e-05,0.0,0.00246,-9.1e-05,0.000578,-0.0,-0.001139,-0.000181,-0.000353,-0.00178,0.000503,-0.000118,-0.0007,-0.000597,-0.000401,-0.069469,-0.000514,0.0,-0.002738,-0.00369,-0.002513,-0.001454,0.235182,-0.000392,-0.000558


In [39]:
df_players.shape

(2601, 121)

In [40]:
'player' in df_players.columns

True

In [41]:
features = df_players_by_skill.columns.tolist()
X = df_players_by_skill.loc[:, features].values
X = np.nan_to_num(X)
y = df_players.loc[:, ['player']].values


pca = PCA(n_components=0.9)
nd_array_principal_components = pca.fit_transform(X)
print(f"Number of components used in PCA: {pca.n_components_}")
df_principal_components = pd.DataFrame(data=nd_array_principal_components)
df_players.reset_index(drop=True, inplace=True)
df_final = pd.concat(objs=[df_principal_components, df_players[['player']]], axis=1)
df_final = pd.concat(objs=[df_final, df_players[['squad']]], axis=1)
df_final = pd.concat(objs=[df_final, df_players[['position']]], axis=1)
df_final = pd.concat(objs=[df_final, df_players[['age']]], axis=1)
player = player_name

Number of components used in PCA: 9


In [42]:
# def get_ranked_player_distances(data_similar_players, player, num_results):
#     columns_to_drop = ['player', 'squad', 'position', 'age']
#     df_stats = data_similar_players.drop(labels=columns_to_drop, axis=1)
#     df_stats_by_player = data_similar_players[data_similar_players['player'] == player]
#     df_stats_by_player.drop(labels=columns_to_drop, axis=1, inplace=True)

#     columns_of_stats = df_stats.columns.tolist()
#     data_similar_players['distance'] = (data_similar_players[columns_of_stats] - np.array(df_stats_by_player)).pow(2).sum(1).pow(0.5)
    
#     columns_to_show = ['player', 'squad', 'position', 'age', 'percent_match']
#     distance_at_quantile = data_similar_players['distance'].quantile(q=0.95)
#     print("distance_at_quantile: ", distance_at_quantile)
#     series_percent_match = (100 - (data_similar_players['distance'].mul(100) / distance_at_quantile)).apply(round, args=[3])
#     data_similar_players['percent_match'] = series_percent_match
#     data_similar_players.sort_values(by='distance', ascending=True, inplace=True, ignore_index=True)
#     data_similar_players = data_similar_players.head(num_results + 1)
#     data_similar_players = data_similar_players.loc[:, columns_to_show]
#     return data_similar_players

def rank_similar_players(data_similar_players, player, skill, num_results):
    columns_to_drop = ['player', 'squad', 'position', 'age']
    df_stats = data_similar_players.drop(labels=columns_to_drop, axis=1)
    df_stats_by_player = data_similar_players[data_similar_players['player'] == player]
    df_stats_by_player.drop(labels=columns_to_drop, axis=1, inplace=True)

    columns_of_stats = df_stats.columns.tolist()
    distance = (data_similar_players[columns_of_stats] - np.array(df_stats_by_player)).pow(2).sum(1).pow(0.5)
    data_similar_players['distance'] = distance
    
    distance_at_quantile = data_similar_players['distance'].quantile(q=0.95)
    print(f"\nDistance at 95th quantile: {distance_at_quantile}")
    series_percent_match = (100 - (data_similar_players['distance'].mul(100) / distance_at_quantile)).apply(round, args=[3])
    data_similar_players['percent_match'] = series_percent_match
    data_similar_players.sort_values(by='distance', ascending=True, inplace=True, ignore_index=True)
    data_similar_players = data_similar_players.head(num_results + 1)
    data_similar_players['skill'] = skill
    columns_to_show = ['player', 'squad', 'position', 'age', 'skill', 'percent_match']
    data_similar_players = data_similar_players.loc[:, columns_to_show]
    return data_similar_players

In [43]:
player

'Robert Lewandowski'

In [44]:
df_final.shape

(2601, 13)

In [45]:
df_final.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,player,squad,position,age
0,0.105954,-0.081725,0.015061,0.018516,-0.037175,0.052194,0.0063,-0.029341,0.005993,Patrick van Aanholt,Crystal Palace,DF,28
1,-0.114915,0.017794,0.023018,0.000441,0.007542,0.045237,-0.0057,-0.027095,-0.002735,Max Aarons,Norwich City,DF,19
2,0.235657,-0.094339,0.035776,-0.02132,-0.015034,-0.026791,-0.019306,-0.002651,0.008812,Yunis Abdelhamid,Reims,DF,31
3,0.123794,0.119231,-0.039205,0.026608,-0.01331,-0.023062,-0.008835,-0.026228,0.008591,Suleiman Abdullahi,Union Berlin,"FW,MF",22
4,-0.037725,-0.023448,0.019729,1.9e-05,-0.018096,-0.013462,-0.004582,0.011027,0.006162,Mehdi Abeid,Nantes,MF,26


In [46]:
rank_similar_players(data_similar_players=df_final,
                     player=player,
                     skill=skill,
                     num_results=15)


Distance at 95th quantile: 0.4040605307376015


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,player,squad,position,age,skill,percent_match
0,Robert Lewandowski,Bayern Munich,FW,30,Overall,100.0
1,Duván Zapata,Atalanta,FW,28,Overall,92.511
2,Anthony Martial,Manchester Utd,FW,23,Overall,91.154
3,Adrien Hunou,Rennes,"FW,MF",25,Overall,91.128
4,Bas Dost,Eint Frankfurt,FW,30,Overall,91.035
5,Sargis Adamyan,Hoffenheim,"FW,MF",26,Overall,90.934
6,Pierre-Emerick Aubameyang,Arsenal,FW,30,Overall,90.817
7,Jhon Córdoba,Köln,FW,26,Overall,90.213
8,Niclas Füllkrug,Werder Bremen,FW,26,Overall,90.059
9,Sadio Mané,Liverpool,FW,27,Overall,89.794


In [None]:
df_final.head()

In [None]:
df_final.shape

In [None]:
# df_games.loc[:, ~df_games.T.duplicated(keep='first')]

In [None]:
player

In [None]:
class Color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

In [None]:
# a = (finalDf[finalDf['player'] == player])[(finalDf[finalDf['player'] == player]).columns.drop(['player','squad','position','age'])]
# b = finalDf[finalDf.columns.drop(['player','squad','position','age'])]
# finalDf['distance'] = (finalDf[list(b.columns.values)] - np.array(a)).pow(2).sum(1).pow(0.5)
# dist = finalDf['distance'].max()
# dist2 = finalDf['distance'].quantile(0.95)
# finalDf['% match'] = 100-(finalDf['distance']/dist2)*100
# final = ((finalDf.sort_values(['distance'], ascending=[True])))[1:number_of_results+1]
# final = final.reset_index(drop=True)
# print(color.BOLD + 'List of similar players:' + color.END)
# print('\n')
# print(final[['player','squad','position','age','% match']])

In [None]:
columns_to_drop = ['player', 'squad', 'position', 'age']
df_stats_by_player = df_final[df_final['player'] == player].drop(labels=columns_to_drop, axis=1)
df_stats = df_final.drop(labels=columns_to_drop, axis=1)

columns_of_stats = df_stats.columns.tolist()
df_final['distance'] = (df_final[columns_of_stats] - np.array(df_stats_by_player)).pow(2).sum(1).pow(0.5)

columns_to_show = ['player', 'squad', 'position', 'age', 'percent_match']
# distance_max = df_final['distance'].max()
distance_at_quantile = df_final['distance'].quantile(q=0.95)
df_final['percent_match'] = (100 - (df_final['distance'].mul(100) / distance_at_quantile)).apply(round, args=[3])
df_final = df_final.sort_values(by='distance', ascending=True).reset_index(drop=True).head(number_of_results)
df_final = df_final.loc[:, columns_to_show]

In [None]:
distance_at_quantile

In [None]:
print(f"Players similar to '{player}' with regard to '{skill}' skill")
df_final

## Retain original `players` dataframe for this!

In [None]:
df_similar_player_stats = pd.DataFrame()
similar_players = df_final['player'].tolist()

for similar_player in similar_players:
    df_temp = df_players[df_players['player'] == similar_player]
    df_similar_player_stats = pd.concat(objs=[df_similar_player_stats, df_temp], ignore_index=True, sort=False)

In [None]:
df_similar_player_stats