In [120]:
import requests
import pandas as pd
from pathlib import Path
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", None)

In [121]:
# Per-match player summary rows (FBref export, Premier League 2024-2025).
pl_players = pd.read_csv(
    "../data/processed/fbref/ENG-Premier League/2024-2025/player_match/summary.csv"
)
# Wrap in a new DataFrame object so later cells work on `match_df`, not the loader variable.
match_df = pd.DataFrame(pl_players)

# Per-season defensive stats for the same league/season.
player_seasons = pd.read_csv(
    "../data/processed/fbref/ENG-Premier League/2024-2025/player_season/defense.csv"
)
season_df = pd.DataFrame(player_seasons)

In [122]:
# Work on a copy so `match_df` stays untouched and this cell can be re-run safely.
df = match_df.copy()

# Peek at dtypes & non-numeric offenders
display(df.dtypes.sort_values())
non_num = df.columns.difference(df.select_dtypes(include=[np.number]).columns)
print("Non-numeric columns:", list(non_num)[:20])

# Replace infs with NaN so they are treated as missing, then drop columns that
# carry no information: `nunique(dropna=True) <= 1` matches exact constants
# as well as all-NaN columns (0 distinct values).
df = df.replace([np.inf, -np.inf], np.nan)
const_cols = df.columns[df.nunique(dropna=True) <= 1]
df = df.drop(columns=const_cols)

# Manually excluded columns (presumably not wanted as features - confirm).
# errors="ignore": any of these may already be gone because it was constant
# (and so dropped just above) or absent from the file; without it the drop
# raises KeyError and the cell fails on such data.
df = df.drop(columns=["jersey_number", "age", "is_promoted"], errors="ignore")
print(f"Dropped constant cols: {list(const_cols)}")

is_relegated       int64
gca                int64
sca                int64
prgp               int64
carries            int64
blocks             int64
int                int64
tkl                int64
touches            int64
crdr               int64
crdy               int64
sot                int64
cmp                int64
sh                 int64
pk                 int64
ast                int64
gls                int64
min                int64
prgc               int64
att_att            int64
succ               int64
jersey_number      int64
is_promoted        int64
is_home            int64
is_away            int64
season             int64
pkatt              int64
att                int64
cmp_cmp          float64
npxg             float64
xg               float64
age              float64
xag              float64
fpl_pos           object
position          object
opponent_id       object
game              object
team              object
player            object
team_id           object


Non-numeric columns: ['away', 'fpl_pos', 'game', 'game_date', 'game_id', 'home', 'league', 'nation', 'opponent_id', 'player', 'player_id', 'pos', 'position', 'team', 'team_id']
Dropped constant cols: ['league', 'season', 'is_relegated']


In [123]:
# Build the numeric matrix: numeric columns as float64, boolean flags as int8,
# placed side by side; then keep only columns that still vary after the cast.
num = pd.concat(
    [
        df.select_dtypes(include='number').astype(np.float64),
        df.select_dtypes(include='bool').astype(np.int8),
    ],
    axis=1,
).loc[:, lambda frame: frame.nunique(dropna=True) > 1]

# Rank-based (Spearman) correlation captures monotonic, not just linear, association.
# Pairs with fewer than 25 jointly observed rows are reported as NaN.
corr_s = num.corr(method='spearman', min_periods=25)  # tweak min_periods if needed
corr_s

Unnamed: 0,min,gls,ast,pk,pkatt,sh,sot,crdy,crdr,touches,tkl,int,blocks,xg,npxg,xag,sca,gca,cmp,att,cmp_cmp,prgp,carries,prgc,att_att,succ,is_home,is_away
min,1.0,0.065657,0.067141,0.02399,0.024802,0.165047,0.10887,0.089002,-0.039959,0.77999,0.304,0.291649,0.296478,0.132839,0.133173,0.156132,0.269773,0.100341,0.720678,0.7539,0.050081,0.386764,0.721684,0.196508,0.150295,0.145365,0.000163,-0.000163
gls,0.065657,1.0,0.065963,0.246134,0.227578,0.381122,0.5648,-0.010625,0.002439,0.001941,0.002649,-0.013079,0.021906,0.476818,0.458349,0.115228,0.148084,0.16256,-0.04022,-0.033412,-0.079646,0.055612,0.024215,0.149992,0.146871,0.119754,0.004181,-0.004181
ast,0.067141,0.065963,1.0,0.018558,0.021836,0.138515,0.098943,0.002109,-0.007372,0.077394,0.043848,0.014684,0.038725,0.129991,0.128702,0.44437,0.272138,0.65054,0.060214,0.064052,-0.02135,0.128306,0.09186,0.17481,0.137247,0.1143,0.005314,-0.005314
pk,0.02399,0.246134,0.018558,1.0,0.909955,0.063304,0.030971,-0.008819,-0.004973,-0.008215,-0.021884,-0.017232,-0.008933,0.149477,0.063454,0.047454,0.062397,0.079056,-0.024461,-0.018022,-0.037828,0.013577,0.003187,0.046927,0.055609,0.039969,0.012928,-0.012928
pkatt,0.024802,0.227578,0.021836,0.909955,1.0,0.067346,0.030714,-0.010435,-0.005465,-0.009463,-0.02268,-0.023726,-0.013822,0.1641,0.062736,0.055859,0.069671,0.079131,-0.026308,-0.019522,-0.041193,0.014531,0.00141,0.047849,0.05793,0.037334,0.017119,-0.017119
sh,0.165047,0.381122,0.138515,0.063304,0.067346,1.0,0.64654,0.011391,-0.012962,0.156869,0.103117,0.017699,0.097,0.823504,0.828466,0.287224,0.452094,0.19141,0.074406,0.085652,-0.10133,0.271968,0.185223,0.3679,0.344113,0.276107,0.027283,-0.027283
sot,0.10887,0.5648,0.098943,0.030971,0.030714,0.64654,1.0,0.004537,-0.010803,0.058273,0.037679,-0.015355,0.051131,0.624131,0.628952,0.189492,0.289664,0.163081,-0.003401,0.005682,-0.096877,0.141321,0.088066,0.258192,0.240289,0.195728,0.022131,-0.022131
crdy,0.089002,-0.010625,0.002109,-0.008819,-0.010435,0.011391,0.004537,1.0,0.088283,0.108592,0.085183,0.09476,0.07823,-0.002429,-0.002213,0.014948,0.039048,0.005367,0.100166,0.100061,0.017226,0.089102,0.087182,0.019687,0.021928,0.03035,-0.02459,0.02459
crdr,-0.039959,0.002439,-0.007372,-0.004973,-0.005465,-0.012962,-0.010803,0.088283,1.0,-0.011661,-0.004972,0.01913,-0.012539,-0.02128,-0.020893,-0.011964,-0.010805,-0.004026,-0.006398,-0.010202,0.009289,0.005849,-0.013279,-0.017112,-0.005175,-0.003741,6e-06,-6e-06
touches,0.77999,0.001941,0.077394,-0.008215,-0.009463,0.156869,0.058273,0.108592,-0.011661,1.0,0.423608,0.399251,0.352902,0.07935,0.082567,0.191492,0.364487,0.116166,0.965824,0.982712,0.240218,0.637098,0.934044,0.275426,0.174702,0.174431,0.019027,-0.019027


In [124]:
def high_corr_pairs(C: pd.DataFrame, thr=0.95):
    """List feature pairs whose absolute correlation is at least ``thr``.

    Only entries strictly above the main diagonal of ``C`` are considered, so
    each unordered pair appears once and self-correlations are skipped.

    Parameters
    ----------
    C : pd.DataFrame
        Correlation matrix (rows and columns labelled by feature).
    thr : float, default 0.95
        Minimum absolute correlation for a pair to be reported.

    Returns
    -------
    pd.DataFrame
        Columns ``feat_1`` (row label), ``feat_2`` (column label) and ``|rho|``,
        sorted by ``|rho|`` descending with a fresh 0..n-1 index. NaN
        correlations are never reported. Empty (but with the same columns)
        when no pair reaches the threshold.
    """
    a = C.abs()

    # Positions strictly above the diagonal, in row-major order.
    rows, cols = np.triu_indices(a.shape[0], k=1, m=a.shape[1])

    # Build the long table directly from positions instead of
    # stack()/reset_index(): that route depends on the axes being unnamed
    # (it relies on the auto-generated 'level_0'/'level_1' columns) and raises
    # when index and columns share a name, as a .corr() result does when the
    # source frame's column axis is named.
    pairs = pd.DataFrame({
        'feat_1': a.index.to_numpy()[rows],
        'feat_2': a.columns.to_numpy()[cols],
        '|rho|': a.to_numpy()[rows, cols],
    })

    # NaN >= thr is False, so undefined correlations are filtered out here too.
    pairs = pairs.loc[pairs['|rho|'] >= thr]
    return pairs.sort_values('|rho|', ascending=False).reset_index(drop=True)

# Flag near-duplicate features in the Spearman matrix (0.9–0.95 is a typical cut-off)
# and show at most the 30 strongest pairs.
high_pairs = high_corr_pairs(corr_s, thr=0.92)
display(high_pairs.iloc[:30])


Unnamed: 0,feat_1,feat_2,|rho|
0,is_home,is_away,1.0
1,xg,npxg,0.991044
2,cmp,att,0.984572
3,touches,att,0.982712
4,touches,cmp,0.965824
5,att,carries,0.935657
6,touches,carries,0.934044
7,cmp,carries,0.928357


In [125]:
target_col = "min"   # <- CHANGE THIS
assert target_col in df.columns

# Feature matrix = every numeric/boolean column except the target itself.
X = num.drop(columns=[target_col], errors="ignore")
y = df[target_col]

# Keep only rows where the target is observed.
keep = y.notna()
X = X.loc[keep]
y = y.loc[keep]

# Rank each feature by the strength of its Spearman correlation with the target.
# Features with fewer than 25 jointly observed rows yield NaN and are dropped.
with_target = (
    X.apply(lambda feature: feature.corr(y, method='spearman', min_periods=25))
     .dropna()
     .to_frame(name='rho_spearman')
     .assign(abs_rho=lambda table: table['rho_spearman'].abs())
     .sort_values(by='abs_rho', ascending=False)
)
display(with_target.head(40))


Unnamed: 0,rho_spearman,abs_rho
touches,0.77999,0.77999
att,0.7539,0.7539
carries,0.721684,0.721684
cmp,0.720678,0.720678
prgp,0.386764,0.386764
tkl,0.304,0.304
blocks,0.296478,0.296478
int,0.291649,0.291649
sca,0.269773,0.269773
prgc,0.196508,0.196508
