In [1]:
from pathlib import Path
import pandas as pd

BASE = Path("..").resolve()
RAW = BASE / "data" / "raw" / "fbref"

# –∫–∞–∫–∏–µ —Å–µ–∑–æ–Ω—ã –¥–ª—è EPL —É–∂–µ –¥–æ–ª–∂–Ω—ã –±—ã—Ç—å
SEASONS = ["2019-2020","2020-2021","2021-2022","2022-2023","2023-2024","2024-2025"]
LEAGUE_CODE = "epl"

In [2]:
def read_csv_safe(p: Path) -> pd.DataFrame | None:
    if not p.exists():
        print(f"‚ö†Ô∏è not found: {p}")
        return None
    try:
        return pd.read_csv(p)
    except Exception as e:
        print(f"‚ùå read fail: {p.name} -> {e}")
        return None

In [3]:
rows, cols, samples = [], [], {}
for s in SEASONS:
    folder = RAW / f"{LEAGUE_CODE}_{s}"
    f = folder / "player_standard_stats.csv"
    df = read_csv_safe(f)
    if df is None:
        continue
    rows.append({"season": s, "rows": len(df), "cols": df.shape[1]})
    cols.append(tuple(df.columns))
    # —Å–æ—Ö—Ä–∞–Ω–∏–º –ø–æ –æ–¥–Ω–æ–º—É –ø—Ä–∏–º–µ—Ä—É –¥–ª—è –±—ã—Å—Ç—Ä–æ–π –≤–∏–∑—É–∞–ª—å–Ω–æ–π –ø—Ä–æ–≤–µ—Ä–∫–∏
    samples[s] = df.head(3)

summary = pd.DataFrame(rows)
display(summary)

Unnamed: 0,season,rows,cols
0,2019-2020,542,37
1,2020-2021,553,37
2,2021-2022,567,37
3,2022-2023,591,37
4,2023-2024,603,37
5,2024-2025,596,37


In [4]:
unique_structs = {c for c in cols}
print(f"–£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –Ω–∞–±–æ—Ä–æ–≤ —Å—Ç–æ–ª–±—Ü–æ–≤: {len(unique_structs)}")
if len(unique_structs) == 1:
    print("‚úÖ –°—Ç—Ä—É–∫—Ç—É—Ä–∞ –∫–æ–ª–æ–Ω–æ–∫ –æ–¥–∏–Ω–∞–∫–æ–≤–∞ –≤–æ –≤—Å–µ—Ö —Å–µ–∑–æ–Ω–∞—Ö")
else:
    print("‚ö†Ô∏è –†–∞–∑–Ω—ã–µ —Å—Ç—Ä—É–∫—Ç—É—Ä—ã –∫–æ–ª–æ–Ω–æ–∫ –ø–æ —Å–µ–∑–æ–Ω–∞–º ‚Äî –Ω—É–∂–Ω–æ —Å—Ä–∞–≤–Ω–∏–≤–∞—Ç—å –Ω–∏–∂–µ")

# –µ—Å–ª–∏ –µ—Å—Ç—å —Ä–∞—Å—Ö–æ–∂–¥–µ–Ω–∏—è ‚Äî –≤—ã–≤–µ—Å—Ç–∏ —Ä–∞–∑–ª–∏—á–∏—è
if len(unique_structs) > 1:
    structs = list(unique_structs)
    base = set(structs[0])
    for i, st in enumerate(structs[1:], 1):
        st_set = set(st)
        missing = base - st_set
        extra = st_set - base
        print(f"\n–°—Ä–∞–≤–Ω–µ–Ω–∏–µ —Å —à–∞–±–ª–æ–Ω–æ–º (–≤–∞—Ä–∏–∞–Ω—Ç {i}):")
        if missing: print("  ‚àí –æ—Ç—Å—É—Ç—Å—Ç–≤—É—é—Ç:", sorted(missing))
        if extra:   print("  + –ª–∏—à–Ω–∏–µ:", sorted(extra))

–£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –Ω–∞–±–æ—Ä–æ–≤ —Å—Ç–æ–ª–±—Ü–æ–≤: 1
‚úÖ –°—Ç—Ä—É–∫—Ç—É—Ä–∞ –∫–æ–ª–æ–Ω–æ–∫ –æ–¥–∏–Ω–∞–∫–æ–≤–∞ –≤–æ –≤—Å–µ—Ö —Å–µ–∑–æ–Ω–∞—Ö


In [5]:
last = SEASONS[-1]
df_last = read_csv_safe(RAW / f"{LEAGUE_CODE}_{last}" / "player_standard_stats.csv")
if df_last is not None:
    print(f"üîé –ü—Ä–æ–≤–µ—Ä–∫–∞ NaN –¥–ª—è {last}:")
    na = df_last.isna().sum()
    na = na[na > 0].sort_values(ascending=False)
    display(na.head(20) if not na.empty else pd.Series(dtype=int, name="no_na"))
    
    print("\nüìä –¢–∏–ø—ã –¥–∞–Ω–Ω—ã—Ö (–ø–µ—Ä–≤—ã–µ 20 –∫–æ–ª–æ–Ω–æ–∫):")
    display(df_last.dtypes.head(20).to_frame("dtype").T)
    
    print("\nüëÄ –ü—Ä–∏–º–µ—Ä —Å—Ç—Ä–æ–∫:")
    display(df_last.head(5))

üîé –ü—Ä–æ–≤–µ—Ä–∫–∞ NaN –¥–ª—è 2024-2025:


nation    4
age       3
born      3
dtype: int64


üìä –¢–∏–ø—ã –¥–∞–Ω–Ω—ã—Ö (–ø–µ—Ä–≤—ã–µ 20 –∫–æ–ª–æ–Ω–æ–∫):


Unnamed: 0,rk,player,nation,pos,squad,age,born,mp,starts,min,90s,gls,ast,g_plus_a,g_pk,pk,pkatt,crdy,crdr,xg
dtype,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object



üëÄ –ü—Ä–∏–º–µ—Ä —Å—Ç—Ä–æ–∫:


Unnamed: 0,rk,player,nation,pos,squad,age,born,mp,starts,min,...,ast_per90,g_plus_a_per90,g_pk_per90,g_plus_a_pk_per90,xg_per90,xag_per90,xg_xag_per90,npxg_per90,npxg_xag_per90,matches
0,1,Max Aarons,eng ENG,DF,Bournemouth,24,2000,3,1,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
1,2,Joshua Acheampong,eng ENG,DF,Chelsea,18,2006,4,2,170,...,0.0,0.0,0.0,0.0,0.12,0.0,0.12,0.12,0.12,Matches
2,3,Tyler Adams,us USA,MF,Bournemouth,25,1999,28,21,1965,...,0.14,0.14,0.0,0.14,0.07,0.05,0.12,0.07,0.12,Matches
3,4,Tosin Adarabioyo,eng ENG,DF,Chelsea,26,1997,22,15,1409,...,0.06,0.13,0.06,0.13,0.06,0.01,0.07,0.06,0.07,Matches
4,5,Simon Adingra,ci CIV,"FW,MF",Brighton,22,2002,29,12,1097,...,0.16,0.33,0.16,0.33,0.2,0.2,0.4,0.2,0.4,Matches


In [6]:
# –°–∫–ª–µ–∏–≤–∞–µ–º player_standard_stats –ø–æ –≤—Å–µ–º —Å–µ–∑–æ–Ω–∞–º EPL + –±–∞–∑–æ–≤–∞—è –æ—á–∏—Å—Ç–∫–∞

CATEGORICAL = {"player","nation","pos","squad","season"}

def load_players_one(season: str) -> pd.DataFrame | None:
    p = RAW / f"{LEAGUE_CODE}_{season}" / "player_standard_stats.csv"
    if not p.exists():
        print(f"‚ö†Ô∏è –Ω–µ—Ç —Ñ–∞–π–ª–∞: {p}")
        return None
    df = pd.read_csv(p)

    # –¥–æ–±–∞–≤–∏—Ç—å —Å–µ–∑–æ–Ω
    df["season"] = season

    # —É–±—Ä–∞—Ç—å —Å–ª—É–∂–µ–±–Ω—É—é –∫–æ–ª–æ–Ω–∫—É, –µ—Å–ª–∏ –ø–æ–ø–∞–ª–∞
    if "matches" in df.columns:
        df = df.drop(columns=["matches"])

    # –≤—ã–±—Ä–æ—Å–∏—Ç—å –∞—Ä—Ç–µ—Ñ–∞–∫—Ç—ã ¬´—à–∞–ø–æ–∫¬ª –≤–Ω—É—Ç—Ä–∏ —Ç–∞–±–ª–∏—Ü
    for col in ["player", "rk"]:
        if col in df.columns:
            df = df[df[col].astype(str).str.lower() != col]

    # –ø—Ä–∏–≤–µ—Å—Ç–∏ —á–∏—Å–ª–∞ –∏–∑ object ‚Üí numeric (–∫–∞–∫ –ø–æ–ª—É—á–∏—Ç—Å—è)
    for c in df.columns:
        if c not in CATEGORICAL:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # –∫–æ—Å–º–µ—Ç–∏–∫–∞
    if "player" in df.columns:
        df["player"] = df["player"].astype(str).str.strip()
    if "squad" in df.columns:
        df["squad"] = df["squad"].astype(str).str.strip()

    return df.reset_index(drop=True)

players_all = pd.concat(
    [d for s in SEASONS if (d := load_players_one(s)) is not None],
    ignore_index=True
)

print(f"‚úÖ players_all: {players_all.shape} (rows, cols)")
display(players_all.head(10))

‚úÖ players_all: (3323, 37) (rows, cols)


Unnamed: 0,rk,player,nation,pos,squad,age,born,mp,starts,min,...,ast_per90,g_plus_a_per90,g_pk_per90,g_plus_a_pk_per90,xg_per90,xag_per90,xg_xag_per90,npxg_per90,npxg_xag_per90,season
0,1,Patrick van Aanholt,nl NED,DF,Crystal Palace,28.0,1990.0,29,29,2507,...,0.07,0.18,0.07,0.14,0.08,0.13,0.21,0.05,0.18,2019-2020
1,2,Max Aarons,eng ENG,DF,Norwich City,19.0,2000.0,36,36,3240,...,0.03,0.03,0.0,0.03,0.02,0.11,0.12,0.02,0.12,2019-2020
2,3,Tammy Abraham,eng ENG,FW,Chelsea,21.0,1997.0,34,25,2215,...,0.12,0.73,0.61,0.73,0.67,0.11,0.77,0.67,0.77,2019-2020
3,4,Che Adams,sct SCO,FW,Southampton,23.0,1996.0,30,12,1111,...,0.16,0.49,0.32,0.49,0.43,0.14,0.57,0.43,0.57,2019-2020
4,5,Adri√°n,es ESP,GK,Liverpool,32.0,1987.0,11,9,875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019-2020
5,6,Sergio Ag√ºero,ar ARG,FW,Manchester City,31.0,1988.0,24,18,1452,...,0.19,1.18,0.87,1.05,0.9,0.14,1.04,0.75,0.89,2019-2020
6,7,Albian Ajeti,ch SUI,FW,West Ham,22.0,1997.0,9,0,139,...,0.0,0.0,0.0,0.0,0.11,0.05,0.16,0.11,0.16,2019-2020
7,8,Nathan Ak√©,nl NED,DF,Bournemouth,24.0,1995.0,29,29,2503,...,0.07,0.14,0.07,0.14,0.11,0.08,0.18,0.11,0.18,2019-2020
8,9,Marc Albrighton,eng ENG,"MF,DF",Leicester City,29.0,1989.0,20,9,867,...,0.31,0.31,0.0,0.31,0.04,0.22,0.27,0.04,0.27,2019-2020
9,10,Toby Alderweireld,be BEL,DF,Tottenham,30.0,1989.0,33,33,2957,...,0.06,0.12,0.06,0.12,0.02,0.04,0.06,0.02,0.06,2019-2020


In [7]:
# –í–ê–õ–ò–î–ê–¶–ò–Ø –ò –°–û–•–†–ê–ù–ï–ù–ò–ï PLAYERS (EPL)

from pathlib import Path
import pandas as pd
import numpy as np

# –Ω–∞ —Å–ª—É—á–∞–π –µ—Å–ª–∏ –Ω–µ –∑–∞–¥–∞–Ω–æ
try:
    LEAGUE_CODE
except NameError:
    LEAGUE_CODE = "epl"

try:
    BASE
except NameError:
    BASE = Path("..").resolve()

PROCESSED_PLAYERS = BASE / "data" / "processed" / "players"
PROCESSED_PLAYERS.mkdir(parents=True, exist_ok=True)
out_path = PROCESSED_PLAYERS / f"{LEAGUE_CODE}_players_all.csv"

# --- –≤–∞–ª–∏–¥–∞—Ü–∏–∏ ---
print(f"‚úÖ Shape: {players_all.shape}")

# –¥—É–±–ª–∏–∫–∞—Ç—ã –ø–æ –∫–ª—é—á—É (–∏–≥—Ä–æ–∫ –≤ —Å–æ—Å—Ç–∞–≤–µ –∫–ª—É–±–∞ –≤ —Å–µ–∑–æ–Ω–µ)
dup_mask = players_all.duplicated(subset=["season","player","squad"], keep=False)
dup_cnt = int(dup_mask.sum())
print(f"üîé –î—É–±–ª–∏–∫–∞—Ç—ã –ø–æ (season, player, squad): {dup_cnt}")
if dup_cnt:
    display(players_all.loc[dup_mask, ["season","player","squad","pos","mp","starts","min"]].head(10))

# –ø—Ä–æ–ø—É—Å–∫–∏ ‚Äî —Ç–æ–ø-10 –∫–æ–ª–æ–Ω–æ–∫
na = players_all.isna().sum()
na = na[na>0].sort_values(ascending=False)
if len(na):
    print("\n‚ö†Ô∏è –ü—Ä–æ–ø—É—Å–∫–∏ –Ω–∞–π–¥–µ–Ω—ã (—Ç–æ–ø-10):")
    display(na.head(10))
else:
    print("\n‚úÖ –ü—Ä–æ–ø—É—Å–∫–æ–≤ –Ω–µ—Ç")

# –±–∞–∑–æ–≤—ã–µ –¥–∏–∞–ø–∞–∑–æ–Ω—ã –ø–æ —á–∏—Å–ª–æ–≤—ã–º –ø–æ–ª—è–º
num_cols = ["mp","starts","min","90s","gls","ast","xg","xag","npxg"]
present = [c for c in num_cols if c in players_all.columns]
desc = players_all[present].describe().T[["min","mean","max"]].round(2)
print("\nüìä –î–∏–∞–ø–∞–∑–æ–Ω—ã –∫–ª—é—á–µ–≤—ã—Ö —á–∏—Å–ª–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–æ–∫:")
display(desc)

# --- —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ ---
players_all.to_csv(out_path, index=False, encoding="utf-8")
print(f"\nüìÅ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤: {out_path}")

‚úÖ Shape: (3323, 37)
üîé –î—É–±–ª–∏–∫–∞—Ç—ã –ø–æ (season, player, squad): 0

‚ö†Ô∏è –ü—Ä–æ–ø—É—Å–∫–∏ –Ω–∞–π–¥–µ–Ω—ã (—Ç–æ–ø-10):


nation    7
age       3
born      3
dtype: int64


üìä –î–∏–∞–ø–∞–∑–æ–Ω—ã –∫–ª—é—á–µ–≤—ã—Ö —á–∏—Å–ª–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–æ–∫:


Unnamed: 0,min,mean,max
mp,1.0,19.8,38.0
starts,0.0,15.09,38.0
min,1.0,1356.07,3420.0
90s,0.0,15.07,38.0
gls,0.0,1.91,36.0
ast,0.0,1.36,20.0
xg,0.0,1.96,29.2
xag,0.0,1.4,20.0
npxg,0.0,1.82,23.0



üìÅ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤: /Users/kekc/Projects/GIT/sports-stats-analysis/data/processed/players/epl_players_all.csv


In [8]:
from pathlib import Path
import pandas as pd

BASE = Path("..").resolve()
RAW = BASE / "data" / "raw" / "fbref"
OUT = BASE / "data" / "processed" / "players"
OUT.mkdir(parents=True, exist_ok=True)

league_prefix = "laliga_"
files = sorted((p for p in RAW.glob(f"{league_prefix}*/player_standard_stats.csv")), key=lambda p: p.parent.name)

dfs = []
for f in files:
    season = f.parent.name.split("_", 1)[1]
    df = pd.read_csv(f)
    df["season"] = season
    dfs.append(df)

players_la = pd.concat(dfs, ignore_index=True)

print(f"‚úÖ La Liga players: {players_la.shape} (rows, cols)")
dups = players_la.duplicated(subset=["season","player","squad"]).sum()
print(f"üîé –î—É–±–ª–∏–∫–∞—Ç—ã –ø–æ (season, player, squad): {dups}")

na = players_la.isna().sum()
na_top = na[na > 0].sort_values(ascending=False).head(10)
if len(na_top):
    print("\n‚ö†Ô∏è –ü—Ä–æ–ø—É—Å–∫–∏ (—Ç–æ–ø-10):")
    display(na_top)
else:
    print("\n‚úÖ –ü—Ä–æ–ø—É—Å–∫–æ–≤ –Ω–µ—Ç –≤ —Ç–æ–ø-–∫–æ–ª–æ–Ω–∫–∞—Ö")

num_cols = ["mp","starts","min","90s","gls","ast","xg","xag","npxg"]
rng = players_la[num_cols].agg(["min","mean","max"]).round(2).T
print("\nüìä –î–∏–∞–ø–∞–∑–æ–Ω—ã –∫–ª—é—á–µ–≤—ã—Ö —á–∏—Å–ª–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–æ–∫:")
display(rng)

out_path = OUT / "laliga_players_all.csv"
players_la.to_csv(out_path, index=False)
print(f"üìÅ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤: {out_path}")

‚úÖ La Liga players: (3715, 38) (rows, cols)
üîé –î—É–±–ª–∏–∫–∞—Ç—ã –ø–æ (season, player, squad): 134

‚úÖ –ü—Ä–æ–ø—É—Å–∫–æ–≤ –Ω–µ—Ç –≤ —Ç–æ–ø-–∫–æ–ª–æ–Ω–∫–∞—Ö


TypeError: Could not convert string '11423313227361327641718232107132813151113534MP62441414123493714320926253619116933530812MP1730122853733212333301614112483128351523133MP334112181715271723792036351935151124311335352MP301316424373241231712632433124823196341822MP37272365521636303423515368311222251833181830MP20233112633610111735110124291713322527336434MP2020341617122017371452923173632830123543082MP425122173337131819261011411352835242121456MP2116172995211930525143435329372338224203429MP35343736178335291751927112322729243535311510MP831331422283122429291732526323611743262524MP3813617335272981513121512136613312053534MP34242521311625233622933297153432313427291153418MP31359361263115118236184233892263551572034MP381334352352624233029137271351353332132026MP2102226342613220735363013163528111932036MP1110191526329232312477353253136256281549MP30333528272613230371481191162519212571943125MP9172885251129219221232635143335342421314613MP332813871130262811761418273622218142034283436MP14292513353322136111133322131171821202918293330MP3023194102853011635383263036339531330326251310135352713302222222723123231142MP41343324344836353312822330622231263312731MP1913221321342335211341919333429161202523014MP1362028252393142816328333153032113273411110MP212821121035172237381529714381226123377530MP302712617333435284272329293112238343827523MP93242511311347333131292522161251125313713307MP1114321637103722271121273525834372172481284MP11341331351892925271912323624293031311025173624MP6241431241633322751331028620283425301927172727MP6373422353313372917234163535624128373143237MP52533428131823221151333234182733242813723179MP221726292333851132252352831352871633252221165MP93319228134253322934331262613253520221416321MP342738262235172334362622511305233330142627228MP163313226232411313423572212331617362553138MP341325141895313120303338111236331293712192722MP32428221493332418193611222163722130351433819MP1253121375201517311381322335191315182515223320MP354162217310281272162128241624134161619332MP933291112112323736119516162932282815131353234MP1232322931301219333335253528281328532130243111MP15824162833303232012342135211091335323821910MP4121252431783124441322528302330281814292033122143195MP292313134303812146231371719193626172415526MP3561302521161123626129233714413214341913124MP4351712283611037101411672620342513025241341MP342924333214361133911727251731196178351201836MP1922253612123132292336362621621252912217313835MP764822212334113030323231742737128102228MP110434262413322433922925162537223136323526342MP2435371181234132222829292931125282192734132MP11532628322136171031318313316171812615169MP21362354205122033813214914331453792133221MP3631323761527336356223033103112832282834720MP111373823302524421529161329183692732312224352MP22535352126181717161220221124912996571627MP261332223518112129281373222103441122343432331934MP2913322833353614182467243017321427211628352019MP1629112861825343263231323495383052283231MP1012234102634332131928123121718102121815151723MP1062973127132737131552313011331513272281917MP1735329112118353121143727915302113291811363233MP25141981832193102312272932133253124721MP281105142825922134413322618322638161353835MP14813123352923143357134181826332833113235158MP137153353365119312915172973419323231359171MP191072421611938341182431826361313083435291122344242531253135126193424122MP287285171522313228217121031371429293419MP363533337281827232115433642429372821317312535MP31251231234303232763738317282710110910352733MP1352535253015361622337211432321522335281643527MP3232312527610511210383018125292537331524MP31431430252536291311423362028182827251368153MP231420212811241272537222730242237303035111253224MP3813529211113628174331833319291162515262MP27912121382614173410816253530283428826332625MP23353117172051625323524212126221223323262834MP2219533215133661228352633420293035252832191824MP193310343120272934381212222202103428431297511MP35142292653822215193751828910112143942433MP15128281835141278214341181033293327312212621MP223317333373320286361024151121322441223163023MP8235331275141732211931314103628181933233017MP31123171161725243716371621212626722011935MP73614183183663334342734813026372210131363133MP15361122181638113220342337522306173335347341MP1331833261014323328624333632253231122181628MP26731331012338120343232281822543428332738168MP198733323423313423196353343017341010233610MP16362520224516192915236363312510263136223272141263119214263631302911101851114341219MP201519117112525123714126341435176252512834104MP18332732162924317191523283329273533341131217MP14334225313183227323512635173031328222302014MP1722303122311302719179232313034434151265191MP2921363414118314837331322231732361528371536MP332523121226223031943023726104274815115MP7331236175142937321727230252320301914212MP3853128253163491155365331332712222217363726MP310353636323538221431301991923832294131332311MP929187151208131333411161726361782233359MP163533303111311353228142934371273413534163734MP253631323530613282429232121225373741616126728MP16322933136223361221203312935332746262823MP713217123103253102711383415251132302830332132MP2730114353616202021283729292628297368342163424MP38133591143413311164332323378152343315MP1841924311715107292311632242910329733322126MP34232711119126361829202737282812352323726323734MP412114332632912453624371534714132811433118MP351226123144303329262614123371119281436101434MP32292183811728246273731321642911523412116MP27252819202526172927281131311430293343428337376MP371201229165271878491221835123335343135MP10137252913213031417294191435162218111731534101426134231MP223226372124132222617262353512136115121233027MP19131413232121425322512571153030313525132522MP261593134182303631014193534183433035133891MP1063321192325812176161135281352381423351237MP23422214852733344253522222523111193232302534MP351532538341310263112628410173132101913202210MP27312522352216272419326228192191866261732MP191181236128252632103629313383363152636251426MP12373221717120103138133416313633341222822333629MP258531141883729193611328143737922193118344MP372625131112121730261932353232372283635183228MP1427371652334311303325172293817282911332121030MP233827183023419101226127923302226228321330MP32281528323323024343429115271421434123320343MP1720353635303261711633233337252421212335229226MP311723153062413152410302036221122172125375MP916351114535233618123731141625282630272619MP3028193427302435291361931336353283613313616MP51011432222911737163621211830734316252232MP292923122167234312218293332253233161993642415MP3321213325191193862430282316323021291632291535MP34343228434811152101827351119272133234831436MP2113419322725372154216429323529143512232728MP36' to numeric

In [9]:
# --- FIX: –ø—Ä–∏–≤–µ—Å—Ç–∏ —á–∏—Å–ª–æ–≤—ã–µ –∫–æ–ª–æ–Ω–∫–∏ –∫ float –∏ –ø–µ—Ä–µ—Å—á–∏—Ç–∞—Ç—å –¥–∏–∞–ø–∞–∑–æ–Ω—ã ---

import re
num_cols = ["mp","starts","min","90s","gls","ast","xg","xag","npxg"]

# –æ—Å—Ç–∞–≤–∏—Ç—å —Ç–æ–ª—å–∫–æ —Å—É—â–µ—Å—Ç–≤—É—é—â–∏–µ –∏–∑ —Å–ø–∏—Å–∫–∞ (–Ω–∞ –≤—Å—è–∫–∏–π —Å–ª—É—á–∞–π)
num_cols = [c for c in num_cols if c in players_la.columns]

def to_num(s):
    # –≤ —Å—Ç—Ä–æ–∫–∞—Ö –∏–Ω–æ–≥–¥–∞ –≤—Å—Ç—Ä–µ—á–∞—é—Ç—Å—è —Å–∏–º–≤–æ–ª—ã –≤—Ä–æ–¥–µ 'MP', 'Matches', –ø—Ä–æ–±–µ–ª—ã –∏ —Ç.–ø.
    # –≤—ã—á–∏—â–∞–µ–º –≤—Å—ë, –∫—Ä–æ–º–µ —Ü–∏—Ñ—Ä, —Ç–æ—á–∫–∏ –∏ –º–∏–Ω—É—Å–∞, –ø–æ—Å–ª–µ —á–µ–≥–æ –ø—Ä–µ–æ–±—Ä–∞–∑—É–µ–º –∫ —á–∏—Å–ª—É
    return pd.to_numeric(
        pd.Series(s, dtype="string")
          .str.replace(r"[^\d\.\-]", "", regex=True)
          .replace({"": None}),
        errors="coerce"
    )

for c in num_cols:
    players_la[c] = to_num(players_la[c])

# –ø–µ—Ä–µ—Å—á—ë—Ç –¥–∏–∞–ø–∞–∑–æ–Ω–æ–≤
rng = players_la[num_cols].agg(["min","mean","max"]).round(2).T
print("üìä –î–∏–∞–ø–∞–∑–æ–Ω—ã –∫–ª—é—á–µ–≤—ã—Ö —á–∏—Å–ª–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–æ–∫ (–ø–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏):")
display(rng)

üìä –î–∏–∞–ø–∞–∑–æ–Ω—ã –∫–ª—é—á–µ–≤—ã—Ö —á–∏—Å–ª–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–æ–∫ (–ø–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏):


Unnamed: 0,min,mean,max
mp,1.0,19.58,38.0
starts,0.0,14.03,38.0
min,1.0,1259.3,3420.0
90s,0.0,16.86,90.0
gls,0.0,1.58,31.0
ast,0.0,1.1,21.0
xg,0.0,1.64,27.1
xag,0.0,1.15,15.4
npxg,0.0,1.47,24.0


In [10]:
# --- –£–¥–∞–ª—è–µ–º –¥—É–±–ª–∏–∫–∞—Ç—ã –∏ —Å–æ—Ö—Ä–∞–Ω—è–µ–º —Ñ–∏–Ω–∞–ª—å–Ω—ã–π –¥–∞—Ç–∞—Å–µ—Ç ---
before = len(players_la)
players_la = players_la.drop_duplicates(subset=["season", "player", "squad"], keep="first")
after = len(players_la)
print(f"‚úÖ –£–±—Ä–∞–Ω–æ –¥—É–±–ª–∏–∫–∞—Ç–æ–≤: {before - after}, —Ñ–∏–Ω–∞–ª—å–Ω—ã–π —Ä–∞–∑–º–µ—Ä: {after}")

out_path = OUT / "laliga_players_all.csv"
players_la.to_csv(out_path, index=False)
print(f"üìÅ –§–∏–Ω–∞–ª—å–Ω—ã–π —Ñ–∞–π–ª —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤: {out_path}")

‚úÖ –£–±—Ä–∞–Ω–æ –¥—É–±–ª–∏–∫–∞—Ç–æ–≤: 134, —Ñ–∏–Ω–∞–ª—å–Ω—ã–π —Ä–∞–∑–º–µ—Ä: 3581
üìÅ –§–∏–Ω–∞–ª—å–Ω—ã–π —Ñ–∞–π–ª —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤: /Users/kekc/Projects/GIT/sports-stats-analysis/data/processed/players/laliga_players_all.csv
