In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

In [32]:
# Load the datasets
historical_data = pd.read_csv('Data/2021-2022 Football Player Stats.csv', sep = ';', encoding_errors = 'ignore')
unclassified_data = pd.read_csv('Data\player-data\player-combined-data.csv')
historical_data['Type'] = 'Historical'
unclassified_data['Type'] = 'Unclassified'

# Combine datasets for uniform processing
combined_data = pd.concat([historical_data, unclassified_data], ignore_index=True)

# Feature Engineering

# Normalize performance metrics by minutes played (90s)
performance_cols = ['Goals', 'Assists', 'Shots', 'PasTotCmp', 'PasTotAtt', 'Tkl', 'Press']
for col in performance_cols:
    combined_data[f'{col}_per_90'] = combined_data[col] / combined_data['90s']

# Calculate efficiency metrics
combined_data['Goals_per_Shot'] = combined_data['Goals'] / combined_data['Shots']
combined_data['Assists_per_90'] = combined_data['Assists'] / combined_data['90s']

# Combine passing ability features
combined_data['Pass_Efficiency'] = (
    combined_data['PasTotCmp%'] * combined_data['PasTotPrgDist']
)

# Defensive performance
combined_data['Tackles_Interceptions'] = combined_data['Tkl'] + combined_data['Int']

# Aerial ability
combined_data['Aerial_Duels_Won%'] = combined_data['AerWon'] / (combined_data['AerWon'] + combined_data['AerLost'])

# Physical & technical skills
combined_data['Dribble_Efficiency'] = combined_data['DriSucc%']
combined_data['Progression_Distance'] = combined_data['CarPrgDist']

  unclassified_data = pd.read_csv('Data\player-data\player-combined-data.csv')


In [37]:
combined_data.describe()

Unnamed: 0,Rk,Age,Born,MP,Starts,Min,90s,Goals,Shots,SoT,...,PasTotCmp_per_90,PasTotAtt_per_90,Tkl_per_90,Press_per_90,Goals_per_Shot,Pass_Efficiency,Tackles_Interceptions,Aerial_Duels_Won%,Dribble_Efficiency,Progression_Distance
count,2921.0,2920.0,2921.0,2921.0,2921.0,2921.0,2921.0,2921.0,2921.0,2921.0,...,2899.0,2910.0,2865.0,2897.0,2365.0,2921.0,2921.0,2574.0,2921.0,2921.0
mean,1461.0,26.092123,1994.725094,18.800068,13.749743,1234.756248,13.719069,0.111274,1.220431,0.391462,...,inf,inf,inf,inf,inf,16138.461921,3.02557,0.45943,46.017357,89.95875
std,843.364393,4.641746,37.210426,11.619882,11.393763,977.941288,10.865255,0.233688,1.511266,0.784754,...,,,,,,11397.294089,2.205191,0.204731,29.356537,56.300199
min,1.0,16.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,731.0,23.0,1992.0,8.0,3.0,307.0,3.4,0.0,0.28,0.0,...,1.471209,1.900394,0.04697,0.559471,0.0,6836.75,1.57,0.334279,32.1,52.8
50%,1461.0,26.0,1996.0,20.0,12.0,1102.0,12.2,0.0,0.82,0.19,...,2.650602,3.370958,0.101031,1.106195,0.06,14865.0,3.06,0.468632,50.0,83.7
75%,2191.0,29.0,1999.0,29.0,23.0,2025.0,22.5,0.15,1.83,0.56,...,8.235829,11.118687,0.25,3.657895,0.137931,23020.62,4.22,0.591408,64.4,116.8
max,2921.0,41.0,2006.0,38.0,38.0,3420.0,38.0,5.0,20.0,20.0,...,inf,inf,inf,inf,inf,83000.0,20.0,1.0,100.0,890.0
