ALL IMPORTS FOR THIS PROJECT

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

Extracting defenders and their important information

In [45]:

def cleanDefenders(csv_path="Players-DataSet.csv"):
    """Build the defender table from the full player CSV.

    Steps:
      1. Keep rows whose ``Pos`` contains ``'DF'`` and whose ``90s`` value is
         at least 5.0.
      2. Keep only the identifier columns and the passing / possession /
         defensive stat columns listed below.
      3. Replace missing values in every stat column with 0.
      4. Add a ``<col>_p90`` column (stat divided by ``90s``, rounded to 3
         decimals) for every stat column except ``SCA90`` and ``GCA90``.
      5. Replace any +/-inf values with 0.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read. It must contain every column named in
        the lists below; a missing column raises ``KeyError`` on selection.

    Returns
    -------
    pandas.DataFrame
        The cleaned defender rows: the selected columns followed by the
        per-90 columns.
    """
    df = pd.read_csv(csv_path)

    identifier_cols = [
        'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'MP', 'Starts', 'Min', '90s'
    ]
    passing_cols = [
        'Ast',      # Assists
        'xAG',      # Expected Assisted Goals
        'KP',       # Key Passes
        'PrgP',     # Progressive Passes
        'TB',       # Through Balls
        'Crs',      # Crosses
        'CK',       # Corner Kicks
    ]
    possession_cols = [
        'Succ',     # Successful Dribbles
        'Att_stats_possession', # Dribbles Attempted
        'PrgR',     # Progressive Passes Received
        'SCA90',    # Shot Creating actions
        'GCA90'     # Goal Creating actions
    ]
    defensive_cols = [
        'Tkl',      # Tackles
        'TklW',     # Tackles Won
        'Int',      # Interceptions
        'Blocks_stats_defense', # Blocks (defensive)
        'Clr',      # Clearances
        'Err'       # Errors
    ]

    # Derive the stat list from the groups instead of retyping it: the
    # hand-written copy was missing a comma between 'CK' and 'Succ', which
    # silently produced the single name 'CKSucc' and left both columns unfilled.
    numeric_stat_cols = passing_cols + possession_cols + defensive_cols
    all_relevant_cols = identifier_cols + numeric_stat_cols

    is_defender = df['Pos'].str.contains('DF', na=False)
    has_enough_minutes = df['90s'] >= 5.0
    df_df = df.loc[is_defender & has_enough_minutes, all_relevant_cols].copy()

    for col in numeric_stat_cols:
        df_df[col] = df_df[col].fillna(0)

    # SCA90 / GCA90 are left out of the per-90 step (presumably because they
    # are already per-90 rates -- confirm against the data dictionary).
    not_normalized = {'SCA90', 'GCA90'}
    cols_to_normalize = [col for col in numeric_stat_cols if col not in not_normalized]
    for col in cols_to_normalize:
        df_df[f'{col}_p90'] = (df_df[col] / df_df['90s']).round(3)

    df_df = df_df.replace([np.inf, -np.inf], 0)

    print(f"Original DataFrame size: {df.shape}")
    print(f"New Defender DataFrame size: {df_df.shape}")

    return df_df

# Build the defender table and persist it for the clustering step.
defenders_df = cleanDefenders()
defenders_df.to_csv("Defenders.csv", index=False)
# Keep the summary as the cell's last expression so the notebook renders it
# (a bare expression in the middle of a cell is evaluated and discarded).
defenders_df.describe()


Original DataFrame size: (2854, 267)
New Defender DataFrame size: (828, 44)


Cleaning Midfielders

In [41]:

def clean_midfielder_data(csv_path="Players-DataSet.csv"):
    """Produce the midfielder table from the full player CSV.

    Rows are kept when ``Pos`` contains ``'MF'`` and ``90s`` is at least 5.0.
    Only the identifier and passing / possession / shooting / defensive
    columns listed below survive. Missing values are replaced with 0 in every
    stat column except ``Cmp%``; a rounded (3 d.p.) ``<col>_p90`` column is
    appended for every stat except ``Cmp%``, ``SCA90``, ``GCA90``, ``G/Sh``
    and ``G/SoT``; finally +/-inf values are replaced with 0.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned midfielder rows.
    """
    raw = pd.read_csv(csv_path)

    id_fields = (
        'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'MP', 'Starts', 'Min', '90s',
    )
    passing = (
        'Ast',      # Assists
        'xAG',      # Expected Assisted Goals
        'KP',       # Key Passes
        'PrgP',     # Progressive Passes
        'Cmp',      # Passes Completed
        'Att',      # Passes Attempted
        'Cmp%',     # Pass Completion %
        'TB',       # Through Balls
        'Crs',      # Crosses
        'CK',       # Corner Kicks
        '1/3',      # Passes into the final third
        'PPA',      # Passes into the penalty area
    )
    possession = (
        'Touches',  # Total Touches
        'PrgC',     # Progressive Carries
        'Succ',     # Successful Dribbles
        'Att_stats_possession',  # Dribbles Attempted
        'PrgR',     # Progressive Passes Received
        'SCA90',    # Shot Creating Actions
        'GCA90',    # Goal Creating Actions
    )
    shooting = (
        'Gls',      # Goals
        'xG',       # Expected Goals
        'Sh',       # Shots
        'SoT',      # Shots on Target
        'G/Sh',     # Goals per Shot
        'G/SoT',    # Goals per Shot on Target
    )
    defending = (
        'Tkl',      # Tackles
        'TklW',     # Tackles Won
        'Int',      # Interceptions
        'Blocks_stats_defense',  # Blocks (defensive)
        'Clr',      # Clearances
        'Err',      # Errors
    )
    stat_fields = [*passing, *possession, *shooting, *defending]

    # Position filter first, then the minutes filter, then column selection.
    squad = raw[raw['Pos'].str.contains('MF', na=False)]
    squad = squad[squad['90s'] >= 5.0]
    squad = squad[[*id_fields, *stat_fields]].copy()

    # 'Cmp%' is the one stat that is neither zero-filled nor normalised.
    left_unfilled = {'Cmp%'}
    for name in stat_fields:
        if name not in left_unfilled:
            squad[name] = squad[name].fillna(0)

    # Ratio / per-90 style columns are not divided by '90s'.
    left_unscaled = left_unfilled | {'SCA90', 'GCA90', 'G/Sh', 'G/SoT'}
    for name in stat_fields:
        if name in left_unscaled:
            continue
        squad[name + '_p90'] = (squad[name] / squad['90s']).round(3)

    squad = squad.replace([np.inf, -np.inf], 0)

    print(f"Original DataFrame size: {raw.shape}")
    print(f"New Midfielder DataFrame size: {squad.shape}")

    return squad

# Build the midfielder table and persist it for the role-scoring step.
midfielders_df = clean_midfielder_data()
midfielders_df.to_csv("Midfielders.csv", index=False)
# Keep the summary as the cell's last expression so the notebook renders it
# (a bare expression in the middle of a cell is evaluated and discarded).
midfielders_df.describe()


Original DataFrame size: (2854, 267)
New Midfielder DataFrame size: (908, 67)


CLEANING FORWARDS

In [43]:

def CleanForwards(csv_path="Players-DataSet.csv"):
    """Build the forward table from the full player CSV.

    Steps:
      1. Keep rows whose ``Pos`` contains ``'FW'`` and whose ``90s`` value is
         at least 5.0.
      2. Keep only the identifier columns and the passing / possession /
         shooting stat columns listed below.
      3. Replace missing values in every stat column with 0.
      4. Add a rounded (3 d.p.) ``<col>_p90`` column for each name in
         ``cols_to_normalize``.
      5. Replace any +/-inf values with 0.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read. It must contain every column named in
        the lists below; a missing column raises ``KeyError`` on selection.

    Returns
    -------
    pandas.DataFrame
        The cleaned forward rows: the selected columns followed by the
        per-90 columns.
    """
    df = pd.read_csv(csv_path)

    identifier_cols = [
        'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'MP', 'Starts', 'Min', '90s'
    ]
    passing_cols = [
        'Ast',      # Assists
        'xAG',      # Expected Assisted Goals
        'KP',       # Key Passes
        'PrgP',     # Progressive Passes
        'TB',       # Through Balls
        'Crs',      # Crosses
        'PPA',      # Passes into the penalty area
    ]
    possession_cols = [
        'Succ',     # Successful Dribbles
        'PrgC',     # Progressive Carries
        'Att_stats_possession', # Dribbles Attempted
        'PrgR',     # Progressive Passes Received
        'SCA90',    # Shot Creating actions
        'GCA90'     # Goal Creating actions
    ]
    shooting_cols = [
        'Gls',      # Goals
        'xG',       # Expected Goals
        'Sh',       # Shots
        'SoT',      # Shots on Target
        'G/Sh',     # Goals per Shot
        'G/SoT',    # Goals per Shot on Target
    ]

    # Derived from the groups so no selected stat can be forgotten: the
    # hand-written list omitted 'PrgC', leaving NaNs that then propagated
    # into 'PrgC_p90'.
    numeric_stat_cols = passing_cols + possession_cols + shooting_cols
    all_relevant_cols = identifier_cols + numeric_stat_cols

    is_forward = df['Pos'].str.contains('FW', na=False)
    has_enough_minutes = df['90s'] >= 5.0
    FW_df = df.loc[is_forward & has_enough_minutes, all_relevant_cols].copy()

    for col in numeric_stat_cols:
        FW_df[col] = FW_df[col].fillna(0)

    # Kept as an explicit list so the order of the generated *_p90 columns in
    # the output (and in Forwards.csv) stays exactly as before.
    cols_to_normalize = [
        'Ast', 'xAG', 'KP', 'PrgP', 'TB', 'Crs', 'PPA',
        'Succ', 'Att_stats_possession', 'PrgR', 'PrgC',
        'Gls', 'xG', 'Sh', 'SoT',
    ]
    for col in cols_to_normalize:
        FW_df[f'{col}_p90'] = (FW_df[col] / FW_df['90s']).round(3)

    FW_df = FW_df.replace([np.inf, -np.inf], 0)

    print(f"Original DataFrame size: {df.shape}")
    print(f"New Forward DataFrame size: {FW_df.shape}")

    return FW_df

# Build the forward table and persist it for the role-scoring step.
# Bound to its own name: the result was previously assigned to
# `midfielders_df`, silently replacing the midfielder frame in the kernel.
forwards_df = CleanForwards()
forwards_df.to_csv("Forwards.csv", index=False)
# Keep the summary as the cell's last expression so the notebook renders it
# (a bare expression in the middle of a cell is evaluated and discarded).
forwards_df.describe()


Original DataFrame size: (2854, 267)
New Forward DataFrame size: (649, 44)


CLEANING GOALKEEPERS

In [48]:
# Goalkeeper table: rows whose 'Pos' contains 'GK' with at least 5.0 in '90s',
# reduced to the identifier columns plus the keeper-specific columns.
# No NaN filling or per-90 normalisation is applied here.
df = pd.read_csv("Players-DataSet.csv")
GK_df = df[df['Pos'].str.contains('GK', na=False)].copy()
GK_df = GK_df[GK_df['90s'] >= 5.0].copy()
identifier_cols = [
    'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'MP', 'Starts', 'Min', '90s'
]
Keeper_cols = [
    "Won%", "Rk_stats_keeper", "Nation_stats_keeper", "Pos_stats_keeper",
    "Comp_stats_keeper", "Age_stats_keeper", "Born_stats_keeper", "MP_stats_keeper",
    "Starts_stats_keeper", "Min_stats_keeper", "90s_stats_keeper", "GA", "GA90",
    "SoTA", "Saves", "Save%", "W", "D", "L", "CS", "CS%", "PKatt_stats_keeper",
    "PKA", "PKsv", "PKm", "Rk_stats_keeper_adv", "Nation_stats_keeper_adv",
    "Pos_stats_keeper_adv", "Comp_stats_keeper_adv", "Age_stats_keeper_adv",
    "Born_stats_keeper_adv", "90s_stats_keeper_adv", "GA_stats_keeper_adv",
    "PKA_stats_keeper_adv", "FK_stats_keeper_adv", "CK_stats_keeper_adv",
    "OG_stats_keeper_adv", "PSxG", "PSxG/SoT", "PSxG+/-", "/90",
    "Cmp_stats_keeper_adv", "Att_stats_keeper_adv", "Cmp%_stats_keeper_adv",
    "Att (GK)", "Thr", "Launch%", "AvgLen", "Opp", "Stp", "Stp%", "#OPA",
    "#OPA/90", "AvgDist"
]
Relevant_Cols = identifier_cols + Keeper_cols
GK_df = GK_df[Relevant_Cols].copy()
# index=False matches the other position exports; without it the file gains
# an unnamed leading column holding the row labels of the source frame.
GK_df.to_csv("Goalkeepers.csv", index=False)

Classifying defenders into center backs and full backs

In [46]:
# Split defenders into two groups with k-means on attacking / wide-play
# features, then label the groups and write the result back to Defenders.csv.
df = pd.read_csv("Defenders.csv")

features = [
    'Crs_p90', 'Ast_p90', 'KP_p90', 'CK_p90',
    'SCA90', 'GCA90',
    "PrgR_p90", 'Att_stats_possession_p90',
]

# Standardise the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features])

# Two clusters, fixed seed for reproducible assignments.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
df["Cluster"] = kmeans.fit_predict(X_scaled)

# The cluster with the higher mean 'PrgR_p90' is the one labelled "Full Back";
# every other row is labelled "Center Back".
ClusterMean = df.groupby("Cluster")["PrgR_p90"].mean()
FBClusterID = ClusterMean.idxmax()
in_fb_cluster = df["Cluster"] == FBClusterID
df["SpecificPos"] = in_fb_cluster.map({True: "Full Back", False: "Center Back"})

# Reposition the new label so it sits immediately after 'Pos'.
col_to_move = df.pop('SpecificPos')
pos_slot = df.columns.get_loc('Pos') + 1
df.insert(pos_slot, 'SpecificPos', col_to_move)

df.to_csv("Defenders.csv", index=False)

Classifying midfielders into CDM, CM and CAM

In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Score every midfielder against three role profiles (CDM, CM, CAM) built from
# standardised per-90 features, pick the best-scoring role, and mark near-ties
# between neighbouring roles as hybrids. The result overwrites Midfielders.csv.
df = pd.read_csv("Midfielders.csv")

features = [
    'Ast_p90', 'KP_p90', 'xAG_p90', 'SCA90', 'GCA90',
    'Gls_p90', 'xG_p90', 'Sh_p90', 'SoT_p90',
    'PrgP_p90', 'PrgC_p90', 'Touches_p90', '1/3_p90', 'PPA_p90', 'Cmp_p90',
    'Tkl_p90', 'TklW_p90', 'Int_p90', 'Blocks_stats_defense_p90', 'Clr_p90',
]

# Feature groups used by the role formulas.
Scoring = ['Gls_p90', 'xG_p90', 'Sh_p90', 'SoT_p90']
Passing = ['PrgP_p90', 'Touches_p90', '1/3_p90', 'PPA_p90', 'Cmp_p90']
Creating = ['Ast_p90', 'KP_p90', 'xAG_p90', 'SCA90', 'GCA90', 'PrgC_p90']
Defense = ['Tkl_p90', 'TklW_p90', 'Int_p90', 'Blocks_stats_defense_p90', 'Clr_p90']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features])
df_scaled = pd.DataFrame(X_scaled, columns=features, index=df.index)

# Per-player mean of the standardised features in each group.
defense_z = df_scaled[Defense].mean(axis=1)
passing_z = df_scaled[Passing].mean(axis=1)
creating_z = df_scaled[Creating].mean(axis=1)
scoring_z = df_scaled[Scoring].mean(axis=1)

# Weighted role scores; positive weights reward a group, negative ones penalise it.
CDMScore = 3 * defense_z + 1 * passing_z - 1 * creating_z - 1 * scoring_z
CAMScore = 3 * creating_z + 2 * scoring_z + 1 * passing_z - 2 * defense_z
CMScore = 1.5 * passing_z + 1.5 * creating_z + 1 * defense_z + 1 * scoring_z

# Row-wise softmax over (CDM, CM, CAM); the row maximum is subtracted first
# so the exponentials cannot overflow.
scores = np.column_stack([CDMScore, CMScore, CAMScore])
exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)

roles = np.array(['CDM', 'CM', 'CAM'])
df['SpecificPos'] = roles[np.argmax(probs, axis=1)]

cdm_p, cm_p, cam_p = probs[:, 0], probs[:, 1], probs[:, 2]
ratio_cdm_cm = np.minimum(cdm_p, cm_p) / np.maximum(cdm_p, cm_p)
ratio_cm_cam = np.minimum(cm_p, cam_p) / np.maximum(cm_p, cam_p)

# Hybrid labels for near-ties (smaller/larger probability > 0.9). The CDM/CM
# pass runs first, so a CM row that is close to both neighbours ends up as
# 'CDM/CM' and is no longer eligible for 'CM/CAM'.
near_cdm_cm = df['SpecificPos'].isin(['CDM', 'CM']) & (ratio_cdm_cm > 0.9)
df.loc[near_cdm_cm, 'SpecificPos'] = 'CDM/CM'
near_cm_cam = df['SpecificPos'].isin(['CM', 'CAM']) & (ratio_cm_cam > 0.9)
df.loc[near_cm_cam, 'SpecificPos'] = 'CM/CAM'

# Reposition the label so it sits immediately after 'Pos'.
col = df.pop('SpecificPos')
df.insert(df.columns.get_loc('Pos') + 1, 'SpecificPos', col)
df.to_csv("Midfielders.csv", index=False)


Classifying forwards as strikers and wingers


In [44]:
# Score every forward against a striker profile and a winger profile built
# from standardised features, pick the higher-scoring role, and mark near-ties
# as 'WINGER/STRIKER'. The result overwrites Forwards.csv.
df = pd.read_csv("Forwards.csv")

features = [
    'Ast_p90', 'KP_p90', "xAG_p90", "SCA90", "GCA90",
    "Gls_p90", "xG_p90", "Sh_p90", "SoT_p90", "G/SoT",
    "Crs_p90", 'Succ_p90', "TB_p90", "Att_stats_possession_p90",
    'PrgP_p90', 'PrgC_p90', 'PPA_p90', "PrgR_p90",
]

# Feature groups used by the role formulas. Note that the group named
# `Passing` holds the crossing / dribbling / carrying columns.
Scoring = ['Gls_p90', 'xG_p90', 'Sh_p90', 'SoT_p90', 'G/SoT', 'PrgR_p90']
Passing = ['Crs_p90', 'Succ_p90', 'Att_stats_possession_p90', 'PrgC_p90']
Creating = ['Ast_p90', 'xAG_p90', 'KP_p90', 'SCA90', 'GCA90', 'PPA_p90', 'TB_p90']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features])
df_scaled = pd.DataFrame(X_scaled, columns=features, index=df.index)

StrikerScore = (
    3 * df_scaled[Scoring].mean(axis=1)
    + 0.5 * df_scaled[Creating].mean(axis=1)
    - 1.5 * df_scaled[Passing].mean(axis=1)
)
WingerScore = (
    3 * df_scaled[Passing].mean(axis=1)
    + 1.5 * df_scaled[Creating].mean(axis=1)
    + 0.5 * df_scaled[Scoring].mean(axis=1)
)

df['SpecificPos'] = 'STRIKER'
df.loc[WingerScore > StrikerScore, 'SpecificPos'] = 'WINGER'

# Near-tie detection. The scores are combinations of z-scores and can be
# negative, so min/max of the raw scores is not a closeness measure: two
# negative scores give a ratio above 1 (always flagged as hybrid) and scores
# of opposite sign give a negative ratio (never flagged, however close).
# Convert to softmax probabilities first -- the same approach the midfielder
# classification uses -- so the ratio is always in (0, 1] and depends only on
# the gap between the two scores.
scores = np.column_stack([StrikerScore, WingerScore])
exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
ratio = probs.min(axis=1) / probs.max(axis=1)
df.loc[ratio > 0.92, 'SpecificPos'] = 'WINGER/STRIKER'

# Reposition the label so it sits immediately after 'Pos'.
col = df.pop('SpecificPos')
df.insert(df.columns.get_loc('Pos') + 1, 'SpecificPos', col)
df.to_csv("Forwards.csv", index=False)