ALL IMPORTS FOR THIS PROJECT

In [1]:
import numpy as np
import pandas as pd

Extracting defenders and their important information

In [9]:

def cleanDefenders(csv_path="Players-DataSet.csv"):
    """Build a per-90 statistics table for defenders from the players CSV.

    Reads ``csv_path``, keeps the rows whose ``Pos`` contains ``'DF'`` and
    whose ``90s`` value is at least 5.0, narrows the frame to the identifier,
    passing, possession and defensive columns listed below, replaces missing
    stat values with 0 and appends a ``<stat>_p90`` column (stat divided by
    ``90s``, rounded to 3 decimals) for every counting stat.

    Args:
        csv_path: Path of the CSV file to load.

    Returns:
        pandas.DataFrame: the filtered frame with the selected columns
        followed by the derived ``*_p90`` columns.

    Raises:
        KeyError: if the CSV lacks any of the columns selected here.
    """
    df = pd.read_csv(csv_path)

    # Position filter and minimum-playing-time filter applied in one mask.
    is_defender = df['Pos'].str.contains('DF', na=False)
    df_df = df[is_defender & (df['90s'] >= 5.0)].copy()

    identifier_cols = [
        'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'MP', 'Starts', 'Min', '90s'
    ]
    passing_cols = [
        'Ast',      # Assists
        'xAG',      # Expected Assisted Goals
        'KP',       # Key Passes
        'PrgP',     # Progressive Passes
        'TB',       # Through Balls
        'Crs',      # Crosses
        'CK',       # Corner Kicks
    ]
    possession_cols = [
        'Succ',     # Successful Dribbles
        'Att_stats_possession', # Dribbles Attempted
        'PrgR',     # Progressive Passes Received
        'SCA90',    # Shot Creating actions
        'GCA90'     # Goal Creating actions
    ]
    defensive_cols = [
        'Tkl',      # Tackles
        'TklW',     # Tackles Won
        'Int',      # Interceptions
        'Blocks_stats_defense', # Blocks (defensive)
        'Clr',      # Clearances
        'Err'       # Errors
    ]
    all_relevant_cols = identifier_cols + passing_cols + possession_cols + defensive_cols
    df_df = df_df[all_relevant_cols].copy()

    # Derive the stat lists from the category lists instead of retyping them.
    # The retyped list was missing a comma after 'CK', which made Python join
    # the adjacent literals into 'CKSucc', so 'CK' and 'Succ' never had their
    # missing values filled.
    numeric_stat_cols = passing_cols + possession_cols + defensive_cols
    for col in numeric_stat_cols:
        df_df[col] = df_df[col].fillna(0)

    # SCA90 / GCA90 are excluded from the per-90 step (presumably they are
    # already per-90 figures, going by their names -- confirm against the data
    # dictionary).
    not_normalized = {'SCA90', 'GCA90'}
    cols_to_normalize = [col for col in numeric_stat_cols if col not in not_normalized]
    for col in cols_to_normalize:
        df_df[f'{col}_p90'] = (df_df[col] / df_df['90s']).round(3)

    # Guard against infinities from a division by zero.
    df_df = df_df.replace([np.inf, -np.inf], 0)

    print(f"Original DataFrame size: {df.shape}")
    print(f"New Defender DataFrame size: {df_df.shape}")

    return df_df

# Build the defender table and persist it for later steps.
defenders_df = cleanDefenders()
defenders_df.to_csv("Processed-Defenders.csv", index=False)
# Keep the summary as the final expression: a notebook cell only renders its
# last expression, so describe() placed mid-cell was silently discarded.
defenders_df.describe()


Original DataFrame size: (2854, 267)
New Defender DataFrame size: (828, 44)


Cleaning Midfielders

In [6]:

def clean_midfielder_data(csv_path="Players-DataSet.csv"):
    """Produce the midfielder subset of the players CSV with per-90 columns.

    Rows are kept when ``Pos`` contains ``'MF'`` and ``90s`` is at least 5.0.
    The result holds the identifier, passing, possession, shooting and
    defensive columns selected below, with missing values in the listed stat
    columns set to 0, followed by one ``<stat>_p90`` column (stat / ``90s``,
    rounded to 3 decimals) per counting stat.

    Args:
        csv_path: Path of the CSV file to load.

    Returns:
        pandas.DataFrame: the cleaned midfielder table.
    """
    df = pd.read_csv(csv_path)

    plays_midfield = df['Pos'].str.contains('MF', na=False)
    enough_minutes = df['90s'] >= 5.0

    identifier_cols = ['Player', 'Nation', 'Pos', 'Squad', 'Comp',
                       'Age', 'MP', 'Starts', 'Min', '90s']
    # Assists, expected assisted goals, key passes, progressive passes,
    # passes completed / attempted / completion %, through balls, crosses,
    # corner kicks, passes into the final third, passes into the penalty area.
    passing_cols = ['Ast', 'xAG', 'KP', 'PrgP', 'Cmp', 'Att', 'Cmp%',
                    'TB', 'Crs', 'CK', '1/3', 'PPA']
    # Touches, progressive carries, successful / attempted dribbles,
    # progressive passes received, shot- and goal-creating actions.
    possession_cols = ['Touches', 'PrgC', 'Succ', 'Att_stats_possession',
                       'PrgR', 'SCA90', 'GCA90']
    # Goals, expected goals, shots, shots on target, goals per shot,
    # goals per shot on target.
    shooting_cols = ['Gls', 'xG', 'Sh', 'SoT', 'G/Sh', 'G/SoT']
    # Tackles, tackles won, interceptions, blocks, clearances, errors.
    defensive_cols = ['Tkl', 'TklW', 'Int', 'Blocks_stats_defense', 'Clr', 'Err']

    selected = identifier_cols + passing_cols + possession_cols + shooting_cols + defensive_cols
    midfielders = df.loc[plays_midfield & enough_minutes, selected].copy()

    # Stat columns whose gaps become 0 ('Cmp%' is deliberately not listed).
    zero_fill = [
        'Ast', 'xAG', 'KP', 'PrgP', 'Cmp', 'Att', 'TB', 'Crs', 'CK', '1/3', 'PPA',
        'Touches', 'PrgC', 'Succ', 'Att_stats_possession', 'PrgR', 'SCA90', 'GCA90',
        'Gls', 'xG', 'Sh', 'SoT', 'G/Sh', 'G/SoT',
        'Tkl', 'TklW', 'Int', 'Blocks_stats_defense', 'Clr', 'Err',
    ]
    midfielders = midfielders.fillna(
        {name: 0 for name in zero_fill if name in midfielders.columns}
    )

    # Counting stats that get a per-90 companion column.
    per90_sources = [
        name
        for name in (
            'Ast', 'xAG', 'KP', 'PrgP', 'Cmp', 'Att', 'TB', 'Crs', 'CK', '1/3', 'PPA',
            'Touches', 'PrgC', 'Succ', 'Att_stats_possession', 'PrgR',
            'Gls', 'xG', 'Sh', 'SoT',
            'Tkl', 'TklW', 'Int', 'Blocks_stats_defense', 'Clr', 'Err',
        )
        if name in midfielders.columns
    ]
    per90 = (
        midfielders[per90_sources]
        .div(midfielders['90s'], axis=0)
        .round(3)
        .add_suffix('_p90')
    )
    midfielders = pd.concat([midfielders, per90], axis=1)

    # Any infinity produced by the division is reported as 0.
    midfielders = midfielders.replace([np.inf, -np.inf], 0)

    print(f"Original DataFrame size: {df.shape}")
    print(f"New Midfielder DataFrame size: {midfielders.shape}")

    return midfielders

# Build the midfielder table and persist it for later steps.
midfielders_df = clean_midfielder_data()
midfielders_df.to_csv("Processed-Midfielders.csv", index=False)
# Keep the summary as the final expression: a notebook cell only renders its
# last expression, so describe() placed mid-cell was silently discarded.
midfielders_df.describe()


Original DataFrame size: (2854, 267)
New Midfielder DataFrame size: (908, 67)


CLEANING FORWARDS

In [5]:

def CleanForwards(csv_path="Players-DataSet.csv"):
    """Produce the forward subset of the players CSV with per-90 columns.

    Rows are kept when ``Pos`` contains ``'FW'`` and ``90s`` is at least 5.0.
    The result holds the identifier, passing, possession and shooting columns
    selected below, with missing stat values set to 0, followed by one
    ``<stat>_p90`` column (stat / ``90s``, rounded to 3 decimals) per counting
    stat.

    Args:
        csv_path: Path of the CSV file to load.

    Returns:
        pandas.DataFrame: the cleaned forward table.
    """
    df = pd.read_csv(csv_path)

    identifier_cols = ['Player', 'Nation', 'Pos', 'Squad', 'Comp',
                       'Age', 'MP', 'Starts', 'Min', '90s']
    # Assists, expected assisted goals, key passes, progressive passes,
    # through balls, crosses, passes into the penalty area.
    passing_cols = ['Ast', 'xAG', 'KP', 'PrgP', 'TB', 'Crs', 'PPA']
    # Successful / attempted dribbles, progressive passes received,
    # shot- and goal-creating actions.
    possession_cols = ['Succ', 'Att_stats_possession', 'PrgR', 'SCA90', 'GCA90']
    # Goals, expected goals, shots, shots on target, goals per shot,
    # goals per shot on target.
    shooting_cols = ['Gls', 'xG', 'Sh', 'SoT', 'G/Sh', 'G/SoT']

    wanted = identifier_cols + passing_cols + possession_cols + shooting_cols

    keep_rows = df['Pos'].str.contains('FW', na=False) & (df['90s'] >= 5.0)
    forwards = df.loc[keep_rows, wanted].copy()

    # Stat columns whose gaps become 0.
    zero_fill = (
        'Ast', 'xAG', 'KP', 'PrgP', 'TB', 'Crs', 'PPA',
        'Succ', 'Att_stats_possession', 'PrgR', 'SCA90', 'GCA90',
        'Gls', 'xG', 'Sh', 'SoT', 'G/Sh', 'G/SoT',
    )
    forwards = forwards.fillna({stat: 0 for stat in zero_fill if stat in forwards.columns})

    # Counting stats that get a per-90 companion column.
    per90_sources = (
        'Ast', 'xAG', 'KP', 'PrgP', 'TB', 'Crs', 'PPA',
        'Succ', 'Att_stats_possession', 'PrgR',
        'Gls', 'xG', 'Sh', 'SoT',
    )
    nineties = forwards['90s']
    forwards = forwards.assign(**{
        f'{stat}_p90': (forwards[stat] / nineties).round(3)
        for stat in per90_sources
        if stat in forwards.columns
    })

    # Any infinity produced by the division is reported as 0.
    forwards = forwards.replace([np.inf, -np.inf], 0)

    print(f"Original DataFrame size: {df.shape}")
    print(f"New Forward DataFrame size: {forwards.shape}")

    return forwards

# Build the forward table under its own name. It was previously assigned to
# `midfielders_df`, which overwrote the midfielder table produced above.
forwards_df = CleanForwards()
forwards_df.to_csv("Processed-Forwards.csv", index=False)
# Keep the summary as the final expression: a notebook cell only renders its
# last expression, so describe() placed mid-cell was silently discarded.
forwards_df.describe()


Original DataFrame size: (2854, 267)
New Forward DataFrame size: (649, 42)
