In [11]:
import pandas as pd
import numpy as np
import requests
import bs4

In [199]:
#seasons that we want the data for both team and player stats
seasons =['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']
#metrics we want to include in our team statistics
team_metrics = ['stats', 'shooting', 'keepersadv', 'passing', 'defense', 'passing_types', 'possession']
#metrics we want to include in our player statistics
player_metrics = ['stats', 'shooting', 'passing', 'defense', 'passing_types', 'possession', 'gca', 'misc']
gk_metrics = ['keepers', 'keepersadv']

In [15]:
#renaming columns for column function
team_advgk_colnames = {
    ('Passes', 'AvgLen'): ('Passes', 'AvgLenGKPass'),
    ('Passes', 'Att'): ('Passes', 'GK Passes Attempted'),
    ('Passes', 'Launch%'): ('Passes', 'GK Passes Launch%'),
    ('Goal Kicks', 'Launch%'): ('Goal Kicks', 'GoalKick Launch%'),
    ('Goal Kicks', 'AvgLen'): ('Goal Kicks', 'AvgLenGoalKick'),
    ('Crosses', 'Opp'): ('Crosses', 'Opponent Crosses Attempted'),
    ('Sweeper', '#OPA/90'): ('Sweeper', 'Defensive Actions Outside Penalty Box'),
    ('Sweeper', 'AvgDist'): ('Sweeper', 'AvgDist Defensive Action Outside Penalty Box')}

team_passing_colnames = {
    ('Total', 'Att') : ('Total', 'Total Passes Attempted'),
    ('Total', 'Cmp%') : ('Total', 'Total Pass Completion%'),
    ('Short', 'Att') : ('Short', 'Short Passes%'),
    ('Short', 'Cmp%') : ('Short', 'Short Pass Completion%'),
    ('Medium', 'Att') : ('Medium', 'Medium Passes%'),
    ('Medium', 'Cmp%') : ('Medium', 'Medium Pass Completion%'),
    ('Long', 'Cmp%') : ('Long', 'Long Pass Completion%'),
    ('Unnamed: 22_level_0', 'KP') : ('Unnamed: 22_level_0', 'Key Passes'),
    ('Unnamed: 23_level_0', '1/3'): ('Unnamed: 23_level_0', 'Passes Into Final Third'),
    ('Unnamed: 24_level_0', 'PPA') : ('Unnamed: 24_level_0', 'Completed Passes 18 Yard Box'),
    ('Unnamed: 25_level_0', 'CrsPA') : ('Unnamed: 25_level_0', 'Completed Crosses 18 Yard Box'),
    ('Unnamed: 26_level_0', 'Prog') : ('Unnamed: 26_level_0', 'Progressive Passes')
    }

team_defensive_colnames = {
        ('Tackles', 'Tkl') : ('Tackles', 'Tackles Attempted'),
        ('Tackles', 'Def 3rd') : ('Tackles', 'Tackles Def 3rd %'),
        ('Tackles', 'Mid 3rd') : ('Tackles', 'Tackles Mid 3rd %'),
        ('Pressures', 'Press') : ('Pressures', 'Pressures Attempted'),
        ('Pressures', 'Def 3rd') : ('Pressures', 'Pressures Def 3rd %'),
        ('Pressures', 'Mid 3rd') : ('Pressures', 'Pressures Mid 3rd %'),
        ('Unnamed: 24_level_0', 'Int') : ('Blocks', 'Interceptions'),
        ('Unnamed: 26_level_0', 'Clr')  :('Blocks', 'Clearances')
    }

team_passtypes_colnames = {
    'TB' : 'Through Balls',
    'Press' : 'Passes Made Under Pressure',
    'Sw' : 'Passes 40 Yards Of Width+',
    'Crs' : 'Crosses',
    'Ground' : 'Ground Passes',
    'Low' : 'Low Passes',
}

team_possession_colnames = {
        ('Touches', 'Touches') : ('Touches', 'Total Touches'),
        ('Touches', 'Def Pen') : ('Touches', 'Defensive Penalty Touches %'),
        ('Touches', 'Def 3rd') : ('Touches', 'Defensive 3rd Touches %'),
        ('Touches', 'Mid 3rd') : ('Touches', 'Middle 3rd Touches %'),
        ('Touches', 'Att 3rd') : ('Touches', 'Attacking 3rd Touches %'),
        ('Dribbles', 'Att') : ('Dribbles', 'Total Dribbles Attempted'),
        ('Carries', 'Carries') : ('Carries', 'Total Carries'),
        ('Carries', 'Prog') : ('Carries', 'Progressive Carries %'),
        ('Carries', '1/3') : ('Carries', 'Carries Into Final Third %'),
        ('Carries', 'CPA') : ('Carries', 'Carries Into 18 Yard Box %')
    }

In [86]:
col_rename = [team_advgk_colnames, team_passing_colnames, team_defensive_colnames,
              team_passtypes_colnames, team_possession_colnames]

In [17]:
#transform absolute to percentage value columns
pass_columns = ['Short Passes%', 'Medium Passes%']
defensive_columns = ['Tackles Def 3rd %', 'Tackles Mid 3rd %', 'Pressures Def 3rd %', 'Pressures Mid 3rd %']
pass_type_columns = ['Ground Passes', 'Low Passes']
possession_columns = ['Defensive Penalty Touches %', 'Defensive 3rd Touches %', 'Middle 3rd Touches %',
                     'Attacking 3rd Touches %', 'Progressive Carries %', 'Carries Into Final Third %',
                     'Carries Into 18 Yard Box %']

In [101]:
transform_list = [pass_columns, defensive_columns, pass_type_columns, possession_columns]

In [19]:
def scrape_url(entity, season, metric):
    if season == '2021-2022':
        df = pd.read_html(f'https://fbref.com/en/comps/Big5/{metric}/{entity}/Big-5-European-Leagues-Stats')
    else:
        df = pd.read_html(f'https://fbref.com/en/comps/Big5/{season}/{metric}/{entity}/{season}-Big-5-European-Leagues-Stats')
    return df

In [33]:
def home_away_df(func, season, metric):
    df = scrape_url('squads', season, metric)
    df_for, df_against = df[0], df[1]
    df_for = func(df_for, 'For', season)
    df_against = func(df_against, 'Against', season)
    combined_df = df_for.merge(df_against)
    return combined_df

In [21]:
def clean_standard_stats(df, df_type, season):
    df.drop('Expected', level=0, axis=1, inplace=True)
    df.columns = df.columns.droplevel(0)
    df = df.loc[:, ['Squad', 'Comp', '90s', 'Poss', 'npxG']]
    if df_type == 'Against':
        df['Squad'] = df['Squad'].str.split(' ').apply(lambda x: x[1] if (len(x) == 2) else x[1] + ' ' + x[2]) + f'({season})'
        df.drop(['Poss', 'Comp'], axis=1, inplace=True)
        df.rename({'npxG':'npxG Against'}, axis=1, inplace=True)
    else:
        df['Squad'] = df['Squad'] + f'({season})'
        df.rename({'Poss':'Possession', 'npxG':'npxG For'}, axis=1, inplace=True)
    return df

In [23]:
def clean_shooting_stats(df, df_type, season):
    df.columns = df.columns.droplevel(0)
    df = df.loc[:, ['Squad', 'Comp', '90s', 'Sh/90']]
    if df_type == 'Against':
        df['Squad'] = df['Squad'].str.split(' ').apply(lambda x: x[1] if (len(x) == 2) else x[1] + ' ' + x[2]) + f'({season})'
        df.rename({'Sh/90': 'Shots Against/90'}, axis=1, inplace=True)
    else:
        df['Squad'] = df['Squad'] + f'({season})'
        df.rename({'Sh/90': 'Shots For/90'}, axis=1, inplace=True)
    return df

In [24]:
clean_funcs = [clean_standard_stats, clean_shooting_stats]

In [378]:
def clean_columns(entity, season, metric, col_names, trans_col=None, multi_column=False):
    if entity == 'squads':
        final_cols = ['Squad', 'Comp', '90s']
    else:
        final_cols = ['Player', 'Pos', 'Squad', 'Comp', 'Age', '90s']
    df = scrape_url(entity, season, metric)
    data = df[0]
    if multi_column:
        dict_values = list(col_names.values())
        dict_values = [dict_values[i][1] for i in range(len(dict_values))]
        final_cols.extend(dict_values)
        data.columns = data.columns.values
        data.columns = pd.MultiIndex.from_tuples(data.rename(columns=col_names))
        data.columns = data.columns.droplevel(0)
    else:
        dict_values = list(col_names.values())
        final_cols.extend(dict_values)
        data.columns = data.columns.droplevel(0)
        data.rename(col_names, axis=1, inplace=True)
    df_range = data.shape[0]
    if entity == 'players':
        for i in range(25, df_range, 26):
            data.drop(i, axis=0, inplace=True)
        for col in dict_values:
            data[col] = pd.to_numeric(data[col])
        data['Player'] = data['Player'] + f'({season})'
    if trans_col is not None:
        data = transform_columns(trans_col, data, metric)
    data = data.loc[:, final_cols]
    data['Squad'] = data['Squad'] + f'({season})' 
    return data

In [359]:
def transform_columns(columns, data, metric):
    if metric == 'passing':
        base = 'Total Passes Attempted'
    elif metric == 'passing_types':
        base = 'Att'
        try:
            data['Att'] = pd.to_numeric(data['Att'])
        except:
            pass
            
    for col in columns:
        if 'Tackles' in col:
            data[col] = data[col]/data['Tackles Attempted'] * 100
        elif 'Pressures' in col:
            data[col] = data[col]/data['Pressures Attempted'] * 100
        elif 'Touches' in col:
            data[col] = data[col]/data['Total Touches'] * 100
        elif 'Carries' in col:
            data[col] = data[col]/data['Total Carries'] * 100
        else:
            data[col] = data[col]/data[base] * 100
    return data

In [171]:
#combining all the tables
def combine_team_tables(seasons, metrics_func, metrics, col_rename, transform_list):
    final_df = pd.DataFrame()
    for idx, metric in enumerate(metrics):
        temp_df = pd.DataFrame()
        for season in seasons:
            if idx < 2:
                season_df = home_away_df(metrics_func[idx], season, metric)
            else:
                if metric == 'keepersadv':
                    trans_col = None
                else:
                    trans_col = transform_list[idx-3]
                if metric == 'passing_types':
                    multi_column = False
                else:
                    multi_column = True
                season_df = clean_columns('squads', season, metric, col_rename[idx-2], trans_col, multi_column)
            temp_df = pd.concat([temp_df, season_df])
        if final_df.empty:
            final_df = pd.concat([final_df, temp_df])
        else:
            final_df = final_df.merge(temp_df)
    return final_df

In [114]:
team_df = combine_tables(seasons, clean_funcs, team_metrics, col_rename, transform_list)

In [157]:
team_df2 = team_df.copy()

In [115]:
team_df.shape

(490, 52)

In [609]:
player_standstats_colnames = {
    ('Per 90 Minutes', 'xA') : ('Per 90 Minutes', 'xA/90'),
    ('Per 90 Minutes', 'npxG') : ('Per 90 Minutes', 'npxG/90'),
}

player_shooting_colnames = {
    'Sh/90' : 'Shots/90',
    'npxG/Sh' : 'npxG Per Shot'
}

player_passing_colnames = {
    ('Total', 'Att') : ('Total', 'Total Passes Attempted'),
    ('Total', 'Cmp%') : ('Total', 'Total Pass Completion%'),
    ('Short', 'Att') : ('Short', 'Short Passes%'),
    ('Short', 'Cmp%') : ('Short', 'Short Pass Completion%'),
    ('Medium', 'Att') : ('Medium', 'Medium Passes%'),
    ('Medium', 'Cmp%') : ('Medium', 'Medium Pass Completion%'),
    ('Long', 'Cmp%') : ('Long', 'Long Pass Completion%'),
    ('Unnamed: 26_level_0', 'KP') : ('Unnamed: 26_level_0', 'Key Passes'),
    ('Unnamed: 27_level_0', '1/3'): ('Unnamed: 27_level_0', 'Passes Into Final Third'),
    ('Unnamed: 28_level_0', 'PPA') : ('Unnamed: 28_level_0', 'Completed Passes 18 Yard Box'),
    ('Unnamed: 29_level_0', 'CrsPA') : ('Unnamed: 29_level_0', 'Completed Crosses 18 Yard Box'),
    ('Unnamed: 30_level_0', 'Prog') : ('Unnamed: 30_level_0', 'Progressive Passes')
}

player_passtypes_colnames = {
    'TB' : 'Through Balls',
    'Press' : 'Passes Made Under Pressure',
    'Sw' : 'Passes 40 Yards Of Width+',
    'Left': 'Left Foot',
    'Right': 'Right Foot',
    'Crs' : 'Crosses',
    'Ground' : 'Ground Passes',
    'Low' : 'Low Passes',
}

player_sca_colname = {'SCA90' : 'SCA/90'}

player_defensive_colnames = {
    ('Tackles', 'Tkl') : ('Tackles', 'Tackles Attempted'),
    ('Tackles', 'TklW') : ('Tackles', 'Tackles Won %'),
    ('Tackles', 'Def 3rd') : ('Tackles', 'Tackles Def 3rd %'),
    ('Tackles', 'Mid 3rd') : ('Tackles', 'Tackles Mid 3rd %'),
    ('Pressures', 'Press') : ('Pressures', 'Pressures Attempted'),
    ('Pressures', '%') : ('Pressures', 'Pressure Success %'),
    ('Pressures', 'Def 3rd') : ('Pressures', 'Pressures Def 3rd %'),
    ('Pressures', 'Mid 3rd') : ('Pressures', 'Pressures Mid 3rd %'),
    ('Unnamed: 28_level_0', 'Int') : ('Blocks', 'Interceptions'),
    ('Unnamed: 30_level_0', 'Clr')  :('Blocks', 'Clearances')
    }

player_possession_colnames = {
    ('Touches', 'Touches') : ('Touches', 'Total Touches'),
    ('Touches', 'Def Pen') : ('Touches', 'Defensive Penalty Touches %'),
    ('Touches', 'Def 3rd') : ('Touches', 'Defensive 3rd Touches %'),
    ('Touches', 'Mid 3rd') : ('Touches', 'Middle 3rd Touches %'),
    ('Touches', 'Att 3rd') : ('Touches', 'Attacking 3rd Touches %'),
    ('Dribbles', 'Att') : ('Dribbles', 'Total Dribbles Attempted'),
    ('Dribbles', 'Succ%') : ('Dribbles', 'Dribbles Success %'),
    ('Carries', 'Carries') : ('Carries', 'Total Carries'),
    ('Carries', 'Prog') : ('Carries', 'Progressive Carries %'),
    ('Carries', '1/3') : ('Carries', 'Carries Into Final Third %'),
    ('Carries', 'CPA') : ('Carries', 'Carries Into 18 Yard Box %'),
    ('Receiving', 'Rec') : ('Receiving', 'Total Passes Received'),
    ('Receiving', 'Prog') : ('Receiving', 'Total Progressive Passes Received'),
    }

player_misc_colnames = {
    ('Performance', 'Recov') : ('Performance', 'Recoveries'),
    ('Aerial Duels', 'Won%') : ('Aerial Duels', 'Aerial Duel Win %'),
}

In [610]:
player_col_rename = [player_standstats_colnames, player_shooting_colnames, player_passing_colnames, player_defensive_colnames,
                     player_passtypes_colnames, player_possession_colnames, player_sca_colname, player_misc_colnames]

In [382]:
player_gk_colnames = {('Performance', 'Save%') : ('Performance', 'Shot Save%')}

player_advgk_colnames = {
    ('Expected', 'PSxG/SoT') : ('Expected', 'psxG/soT'),
    ('Expected', '/90') : ('Expected', 'psxG+/- Per 90'),
    ('Launched', 'Att') : ('Launched', 'Launches Attempted'),
    ('Launched', 'Cmp%') : ('Launched', 'Launch Completion%'),
    ('Passes', 'AvgLen'): ('Passes', 'AvgLenGKPass'),
    ('Passes', 'Att'): ('Passes', 'GK Passes Attempted'),
    ('Passes', 'Launch%'): ('Passes', 'GK Passes Launch%'),
    ('Goal Kicks', 'Launch%'): ('Goal Kicks', 'GoalKick Launch%'),
    ('Goal Kicks', 'AvgLen'): ('Goal Kicks', 'AvgLenGoalKick'),
    ('Crosses', 'Stp%'): ('Crosses', 'Crosses Stopped%'),
    ('Sweeper', '#OPA/90'): ('Sweeper', 'Defensive Actions Outside Penalty Box'),
}

In [383]:
gk_col_rename = [player_gk_colnames, player_advgk_colnames]

In [611]:
def combine_player_tables(seasons, metrics, col_rename, transform_list):
    final_df = pd.DataFrame()
    transform_col = ['passing', 'passing_types', 'defense', 'possession']
    not_multicol = ['shooting', 'passing_types', 'gca']
    for idx, metric in enumerate(metrics):
        temp_df = pd.DataFrame()
        for season in seasons:
            if metric not in transform_col:
                trans_col = None
            else:
                trans_col = transform_list[idx-6]
            if metric in not_multicol:
                multi_column = False
            else:
                multi_column = True
            season_df = clean_columns('players', season, metric, col_rename[idx], trans_col, multi_column)
            temp_df = pd.concat([temp_df, season_df])
        if final_df.empty:
            final_df = pd.concat([final_df, temp_df])
        else:
            final_df = final_df.merge(temp_df)
    return final_df

In [612]:
outfield_df = combine_player_tables(seasons, player_metrics, player_col_rename, transform_list)

In [616]:
combine_df = outfield_df.merge(gk_df, how='outer')

In [617]:
combine_df.shape

(13720, 68)

In [618]:
combine_df.head()

Unnamed: 0,Player,Pos,Squad,Comp,Age,90s,xA/90,npxG/90,Shots/90,npxG Per Shot,...,psxG+/- Per 90,Launches Attempted,Launch Completion%,AvgLenGKPass,GK Passes Attempted,GK Passes Launch%,GoalKick Launch%,AvgLenGoalKick,Crosses Stopped%,Defensive Actions Outside Penalty Box
0,Patrick van Aanholt(2017-2018),DF,Crystal Palace(2017-2018),eng Premier League,26,24.3,0.07,0.14,1.36,0.1,...,,,,,,,,,,
1,Rolando Aarons(2017-2018),"FW,MF",Newcastle Utd(2017-2018),eng Premier League,21,1.5,0.0,0.1,1.29,0.08,...,,,,,,,,,,
2,Rolando Aarons(2017-2018),"MF,FW",Hellas Verona(2017-2018),it Serie A,21,5.7,0.04,0.04,0.52,0.07,...,,,,,,,,,,
3,Ignazio Abate(2017-2018),DF,Milan(2017-2018),it Serie A,30,11.7,0.05,0.02,0.34,0.07,...,,,,,,,,,,
4,Aymen Abdennour(2017-2018),DF,Marseille(2017-2018),fr Ligue 1,27,5.6,0.0,0.02,0.36,0.04,...,,,,,,,,,,


In [384]:
gk_df = combine_player_tables(seasons, gk_metrics, gk_col_rename, transform_list)

In [620]:
combine_df = combine_df.fillna(0)

In [621]:
player_df = combine_df.copy()

In [951]:
player_df['90s'] = pd.to_numeric(player_df['90s'])

In [952]:
player_df = player_df[player_df['90s'] > 9]

In [454]:
team_df2 = team_df.copy()

In [456]:
abs_columns = ['GK Passes Attempted', 'Opponent Crosses Attempted', 'Total Passes Attempted', 'Key Passes',
                    'Passes Into Final Third', 'Completed Passes 18 Yard Box', 'Completed Crosses 18 Yard Box',
                    'Progressive Passes', 'Through Balls', 'Passes Made Under Pressure', 'Passes 40 Yards Of Width+',
                    'Crosses', 'Tackles Attempted', 'Pressures Attempted', 'Interceptions', 'Clearances', 'Total Touches',
                    'Total Dribbles Attempted', 'Total Carries']
#all absolute values to per 90 values
for col in abs_columns:
    team_df2[col] = team_df2[col]/team_df2['90s']

In [625]:
abs_columns2 = ['Total Passes Attempted', 'Key Passes', 'Passes Into Final Third',
                'Completed Passes 18 Yard Box', 'Completed Crosses 18 Yard Box', 'Progressive Passes', 
                'Through Balls', 'Passes Made Under Pressure', 'Passes 40 Yards Of Width+', 'Crosses', 'Tackles Attempted', 
                'Pressures Attempted', 'Interceptions', 'Clearances', 'Total Touches', 'Total Dribbles Attempted', 
                'Total Carries', 'Launches Attempted', 'GK Passes Attempted']
#all absolute values to per 90 values
for col in abs_columns2:
    player_df[col] = player_df[col]/player_df['90s']

In [627]:
player_df.shape

(7976, 68)

In [470]:
correlation_features = team_df2.iloc[:, 3:]

In [472]:
correlation_features = correlation_features.corr()

In [473]:
corr_columns = correlation_features.columns

In [474]:
#highly correlated features
high_corr = []
for c1 in corr_columns:
    for c2 in corr_columns:
        if c1 != c2 and c2 not in high_corr and correlation_features[c1][c2] > 0.9:
            high_corr.append((c1, c2))

In [475]:
high_corr

[('Possession', 'Total Passes Attempted'),
 ('Possession', 'Total Touches'),
 ('Possession', 'Total Carries'),
 ('Shots For/90', 'Key Passes'),
 ('AvgLenGKPass', 'GK Passes Launch%'),
 ('GK Passes Launch%', 'AvgLenGKPass'),
 ('GoalKick Launch%', 'AvgLenGoalKick'),
 ('AvgLenGoalKick', 'GoalKick Launch%'),
 ('Total Passes Attempted', 'Possession'),
 ('Total Passes Attempted', 'Passes Into Final Third'),
 ('Total Passes Attempted', 'Total Touches'),
 ('Total Passes Attempted', 'Total Carries'),
 ('Total Pass Completion%', 'Short Pass Completion%'),
 ('Total Pass Completion%', 'Medium Pass Completion%'),
 ('Total Pass Completion%', 'Long Pass Completion%'),
 ('Total Pass Completion%', 'Ground Passes'),
 ('Total Pass Completion%', 'Total Carries'),
 ('Short Pass Completion%', 'Total Pass Completion%'),
 ('Medium Pass Completion%', 'Total Pass Completion%'),
 ('Medium Pass Completion%', 'Ground Passes'),
 ('Long Pass Completion%', 'Total Pass Completion%'),
 ('Key Passes', 'Shots For/90'),
 

In [478]:
reduntant_columns = ['90s', 'Total Passes Attempted', 'Total Touches' , 'Total Pass Completion%',
                    'AvgLenGKPass', 'AvgLenGoalKick']

In [479]:
team_df2.drop(reduntant_columns, axis=1, inplace=True)

In [629]:
player_correlation = player_df.iloc[:, 6:]

In [630]:
player_correlation = player_correlation.corr()

In [631]:
player_corr = player_correlation.columns

In [632]:
high_corr = []
for c1 in player_corr:
    for c2 in player_corr:
        if c1 != c2 and c2 not in high_corr and player_correlation[c1][c2] > 0.9:
            high_corr.append((c1, c2))

In [633]:
high_corr

[('xA/90', 'Key Passes'),
 ('Total Passes Attempted', 'Total Touches'),
 ('Short Pass Completion%', 'Medium Pass Completion%'),
 ('Medium Pass Completion%', 'Short Pass Completion%'),
 ('Key Passes', 'xA/90'),
 ('Key Passes', 'SCA/90'),
 ('Total Touches', 'Total Passes Attempted'),
 ('Total Touches', 'Total Carries'),
 ('Total Carries', 'Total Touches'),
 ('SCA/90', 'Key Passes'),
 ('Shot Save%', 'psxG/soT'),
 ('Shot Save%', 'Launches Attempted'),
 ('Shot Save%', 'Launch Completion%'),
 ('Shot Save%', 'AvgLenGKPass'),
 ('Shot Save%', 'GK Passes Attempted'),
 ('Shot Save%', 'GK Passes Launch%'),
 ('Shot Save%', 'GoalKick Launch%'),
 ('Shot Save%', 'AvgLenGoalKick'),
 ('Shot Save%', 'Crosses Stopped%'),
 ('psxG/soT', 'Shot Save%'),
 ('psxG/soT', 'Launches Attempted'),
 ('psxG/soT', 'Launch Completion%'),
 ('psxG/soT', 'AvgLenGKPass'),
 ('psxG/soT', 'GK Passes Attempted'),
 ('psxG/soT', 'GK Passes Launch%'),
 ('psxG/soT', 'GoalKick Launch%'),
 ('psxG/soT', 'AvgLenGoalKick'),
 ('psxG/soT',

In [634]:
reduntant_columns = ['Total Touches', 'Key Passes', 'AvgLenGKPass', 'AvgLenGoalKick']

In [635]:
player_df.drop(reduntant_columns, axis=1, inplace=True)

In [639]:
player_df['Left Foot'] = pd.to_numeric(player_df['Left Foot'])

In [640]:
player_df['Right Foot'] = pd.to_numeric(player_df['Right Foot'])

In [675]:
player_df.reset_index(inplace=True, drop=True)

In [676]:
foot = []

for i in range(len(player_df)):
    # ratio of left to right foot passes
    val = player_df['Left Foot'][i]/player_df['Right Foot'][i]
    if val>1:
        foot.append('left')
    else:
        foot.append('right')

# adding to the data frame        
player_df['Foot'] = foot

  val = player_df['Left Foot'][i]/player_df['Right Foot'][i]


In [647]:
player_df.columns

Index(['Player', 'Pos', 'Squad', 'Comp', 'Age', '90s', 'xA/90', 'npxG/90',
       'Shots/90', 'npxG Per Shot', 'Total Passes Attempted',
       'Total Pass Completion%', 'Short Passes%', 'Short Pass Completion%',
       'Medium Passes%', 'Medium Pass Completion%', 'Long Pass Completion%',
       'Passes Into Final Third', 'Completed Passes 18 Yard Box',
       'Completed Crosses 18 Yard Box', 'Progressive Passes',
       'Tackles Attempted', 'Tackles Won %', 'Tackles Def 3rd %',
       'Tackles Mid 3rd %', 'Pressures Attempted', 'Pressure Success %',
       'Pressures Def 3rd %', 'Pressures Mid 3rd %', 'Interceptions',
       'Clearances', 'Through Balls', 'Passes Made Under Pressure',
       'Passes 40 Yards Of Width+', 'Left Foot', 'Right Foot', 'Crosses',
       'Ground Passes', 'Low Passes', 'Defensive Penalty Touches %',
       'Defensive 3rd Touches %', 'Middle 3rd Touches %',
       'Attacking 3rd Touches %', 'Total Dribbles Attempted',
       'Dribbles Success %', 'Total Carrie

In [648]:
player_df.drop(['Pos', 'Squad', 'Comp', 'Age', '90s', 'Left Foot', 'Right Foot'], axis=1, inplace=True)

In [649]:
player_df.set_index('Player', inplace=True)

In [650]:
player_df.head()

Unnamed: 0_level_0,xA/90,npxG/90,Shots/90,npxG Per Shot,Total Passes Attempted,Total Pass Completion%,Short Passes%,Short Pass Completion%,Medium Passes%,Medium Pass Completion%,...,psxG/soT,psxG+/- Per 90,Launches Attempted,Launch Completion%,GK Passes Attempted,GK Passes Launch%,GoalKick Launch%,Crosses Stopped%,Defensive Actions Outside Penalty Box,Foot
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Patrick van Aanholt(2017-2018),0.07,0.14,1.36,0.1,47.613169,76.7,37.856525,91.6,42.264477,79.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,left
Ignazio Abate(2017-2018),0.05,0.02,0.34,0.07,66.153846,80.7,36.175711,91.1,43.540052,87.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,right
Mehdi Abeid(2017-2018),0.02,0.08,1.61,0.05,46.641221,84.9,32.07856,90.3,45.99018,89.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,right
David Abraham(2017-2018),0.01,0.02,0.47,0.04,51.289062,82.3,20.944402,87.6,49.885758,89.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,right
Tammy Abraham(2017-2018),0.05,0.35,2.19,0.16,15.885417,71.1,50.491803,79.2,29.836066,69.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,right


In [538]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [539]:
from sklearn.metrics.pairwise import cosine_similarity

In [580]:
def scale_features(entity, df1, df2):
    x = StandardScaler().fit_transform(df2)
    X = pd.DataFrame(x, columns=df2.columns, index=df1.index)
    dj = pd.DataFrame(cosine_similarity(X, dense_output=True))
    dict_cols = {ind:entity for ind, entity in enumerate(df1.index.values)}
    dj.rename(dict_cols, axis=1, inplace=True)
    dj.rename(dict_cols, axis=0, inplace=True)
    dj.to_csv(f'{entity}_similarity.csv', index=False)
    return dj

In [541]:
team_df2.set_index('Squad', inplace=True)

In [581]:
df1 = scale_features('squad', team_df2, team_df3)

In [653]:
player_df2 = player_df.copy()

In [655]:
player_df2.drop('Foot', axis=1, inplace=True)

In [657]:
df2 = scale_features('player', player_df2, player_df2)

In [659]:
df2.head()

Unnamed: 0,Patrick van Aanholt(2017-2018),Ignazio Abate(2017-2018),Mehdi Abeid(2017-2018),David Abraham(2017-2018),Tammy Abraham(2017-2018),Amir Abrashi(2017-2018),Francesco Acerbi(2017-2018),Afriyie Acquah(2017-2018),Antonio Adán(2017-2018),Aday(2017-2018),...,Łukasz Skorupski(2021-2022),Yann Sommer(2021-2022),David Soria(2021-2022),Wojciech Szczęsny(2021-2022),Marc-André ter Stegen(2021-2022),Pietro Terracciano(2021-2022),Kevin Trapp(2021-2022),Rúben Vezo(2021-2022),Guglielmo Vicario(2021-2022),Robin Zentner(2021-2022)
Patrick van Aanholt(2017-2018),1.0,0.138007,-0.19628,0.205657,-0.252138,-0.039314,0.246473,-0.329915,-0.1821,0.316612,...,-0.265739,-0.26561,-0.278273,-0.259235,-0.261552,-0.269843,-0.26837,-0.219077,-0.263692,-0.267707
Ignazio Abate(2017-2018),0.138007,1.0,0.076912,-0.034495,-0.274347,0.126131,-0.129756,0.141193,-0.194603,0.44177,...,-0.257788,-0.25341,-0.260285,-0.254236,-0.254304,-0.254596,-0.256538,-0.223243,-0.256088,-0.257035
Mehdi Abeid(2017-2018),-0.19628,0.076912,1.0,0.273215,-0.112029,0.329544,0.123497,0.606994,-0.0985,-0.340092,...,-0.33476,-0.328632,-0.336995,-0.330436,-0.330297,-0.329867,-0.332741,-0.29102,-0.332583,-0.333505
David Abraham(2017-2018),0.205657,-0.034495,0.273215,1.0,-0.59001,-0.005522,0.871827,-0.14359,0.111829,-0.09228,...,-0.218303,-0.214911,-0.221133,-0.215091,-0.21531,-0.216131,-0.217526,-0.188255,-0.216842,-0.217862
Tammy Abraham(2017-2018),-0.252138,-0.274347,-0.112029,-0.59001,1.0,0.047809,-0.430771,0.207143,-0.170959,-0.093324,...,0.183303,0.161334,0.142236,0.193006,0.183387,0.149225,0.165557,0.206277,0.183362,0.170963


In [677]:
player_df['Comp'] = player_df['Comp'].apply(lambda x: x.split(' ')[1] + x.split(' ')[2] if (len(x.split(' ')) == 3) else x.split(' ')[1])

In [678]:
player_df['Comp'].unique()

array(['PremierLeague', 'SerieA', 'Ligue1', 'Bundesliga', 'LaLiga'],
      dtype=object)

In [693]:
player_df['Player'].str.split('(')[0][1][:-1]

'2017-2018'

In [694]:
player_df['Season'] = player_df['Player'].apply(lambda x: x.split('(')[1][:-1])

In [713]:
player_df['Age'] = pd.to_numeric(player_df['Age'])

In [711]:
player_df['Age'] = player_df['Age'].apply(lambda x: x.split('-')[0] if (len(x.split('-')) == 2) else x)

In [739]:
df1.to_csv('team_similarity.csv')

In [862]:
trial_df = player_df.copy()   

In [902]:
trial_df['Squad_2'] = trial_df['Squad'].apply(lambda x: x.split('(')[0])

In [904]:
trial_df['Player'] = [x + f'({y})' + f'({z})' for x, y, z in zip(trial_df['Player'], trial_df['Squad_2'], trial_df['Season'])]

In [948]:
trial_df.to_csv(r'C:\Users\shivank\Desktop\Player Recommendation Tool\player-recommendation\data\player_stats.csv', index=False)