In [11]:
import os
import numpy as np
import pandas as pd

# Define project paths.
# The project root is taken to be two directory levels above the current
# working directory (the recorded run was started from <root>/src/data_helper).
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")

# Fail fast when the kernel was started from an unexpected directory.
# Without this check, os.makedirs below would create an empty data/processed
# tree in the wrong place and the problem would only surface later as a
# missing-CSV error.
if not os.path.isdir(DATA_DIR):
    raise FileNotFoundError(
        f"Data directory not found: {DATA_DIR!r}. "
        f"PROJECT_ROOT is derived as two levels above the working directory "
        f"({os.getcwd()!r}); start the notebook from <project>/src/data_helper."
    )
os.makedirs(PROCESSED_DIR, exist_ok=True)

print("CMD:", os.getcwd())
print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_DIR:", DATA_DIR)
print("FILES in data:", os.listdir(DATA_DIR))

CMD: /Users/quentin/Desktop/hackathon-2025-evan-ston-energy/src/data_helper
PROJECT_ROOT: /Users/quentin/Desktop/hackathon-2025-evan-ston-energy
DATA_DIR: /Users/quentin/Desktop/hackathon-2025-evan-ston-energy/data
FILES in data: ['classification_outputs.parquet', 'game_lineups.csv', 'competitions.csv', '.DS_Store', 'appearances.csv', 'player_valuations.csv', 'game_events.csv', 'transfers.csv', 'players.csv', 'games.csv', 'club_games.csv', 'processed', 'clubs.csv']


In [12]:
print("Loading data files...")


def _read_table(file_name):
    """Read one CSV file located directly under DATA_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# Raw input tables, one DataFrame per CSV file.
players = _read_table("players.csv")
valuations = _read_table("player_valuations.csv")
apps = _read_table("appearances.csv")
games = _read_table("games.csv")
clubs = _read_table("clubs.csv")
comps = _read_table("competitions.csv")
transfers = _read_table("transfers.csv")
club_games = _read_table("club_games.csv")

print("‚úÖ Data loaded successfully!")
# Row-count summary, printed in the same order the tables were loaded.
for _label, _frame in (
    ("Players", players),
    ("Valuations", valuations),
    ("Appearances", apps),
    ("Games", games),
    ("Clubs", clubs),
    ("Competitions", comps),
    ("Transfers", transfers),
    ("Club Games", club_games),
):
    print(f"  {_label}: {len(_frame):,} rows")

Loading data files...
‚úÖ Data loaded successfully!
  Players: 32,601 rows
  Valuations: 496,606 rows
  Appearances: 1,706,806 rows
  Games: 74,026 rows
  Clubs: 439 rows
  Competitions: 44 rows
  Transfers: 79,646 rows
  Club Games: 148,052 rows


In [13]:
print("\n" + "="*80)
print("CREATING BASE SNAPSHOT TABLE")
print("="*80)

# Every valuation record becomes one snapshot; its date is the snapshot date.
snap = valuations.copy()
snap["snapshot_date"] = pd.to_datetime(snap["date"])
snap = snap.sort_values(["player_id", "snapshot_date"])

# Look one valuation ahead within each player: that next record supplies the
# future date and future market value used for the y_growth label.
_next_valuation = snap.groupby("player_id")[
    ["snapshot_date", "market_value_in_eur"]
].shift(-1)
snap["future_snapshot_date"] = _next_valuation["snapshot_date"]
snap["future_market_value"] = _next_valuation["market_value_in_eur"]

# Gap in days between this snapshot and the player's next valuation.
snap["delta_days_to_future"] = (
    snap["future_snapshot_date"] - snap["snapshot_date"]
).dt.days

# Keep snapshots whose next valuation is 180-540 days away (inclusive bounds).
# Rows without a next valuation have a NaN gap and are dropped here as well.
snap = snap.loc[snap["delta_days_to_future"].between(180, 540)].copy()

# Regression label: difference of log(value + 1) between future and current value.
_log_future_value = np.log(snap["future_market_value"] + 1)
_log_current_value = np.log(snap["market_value_in_eur"] + 1)
snap["y_growth"] = _log_future_value - _log_current_value

print(f"Created {len(snap):,} valid snapshots")
print(f"Unique players: {snap['player_id'].nunique():,}")

# Show sample
_preview_cols = [
    "player_id", "snapshot_date", "market_value_in_eur",
    "future_market_value", "delta_days_to_future", "y_growth",
]
snap[_preview_cols].head()


CREATING BASE SNAPSHOT TABLE
Created 183,127 valid snapshots
Unique players: 28,384


Unnamed: 0,player_id,snapshot_date,market_value_in_eur,future_market_value,delta_days_to_future,y_growth
4755,10,2006-01-09,20000000,30000000.0,187.0,0.405465
6254,10,2006-07-15,30000000,23000000.0,341.0,-0.265703
9024,10,2007-06-21,23000000,20000000.0,349.0,-0.139762
16465,10,2008-06-04,20000000,18000000.0,371.0,-0.105361
38770,10,2010-05-08,7500000,7000000.0,249.0,-0.068993


In [14]:
print("\n" + "="*80)
print("ADDING PLAYER INFORMATION")
print("="*80)

# Work on a copy of the players table and parse its date columns;
# unparseable dates become NaT.
p = players.copy()
for _date_col in ("date_of_birth", "contract_expiration_date"):
    p[_date_col] = pd.to_datetime(p[_date_col], errors="coerce")

# Player attributes attached to every snapshot. The players table also has a
# "market_value_in_eur" column; the merge suffix renames that copy to
# "market_value_in_eur_player" so the snapshot's own value is kept untouched.
# NOTE(review): the players table is joined on player_id only, so columns such as
# current_club_id, contract_expiration_date and highest_market_value_in_eur are
# presumably "as of today" rather than "as of snapshot_date" - confirm whether
# that look-ahead is acceptable for the model.
_player_cols = [
    "player_id", "name", "date_of_birth", "position", "sub_position",
    "foot", "height_in_cm", "country_of_citizenship",
    "current_club_id", "current_club_domestic_competition_id",
    "current_club_name", "market_value_in_eur", "highest_market_value_in_eur",
    "contract_expiration_date",
]
snap = snap.merge(
    p[_player_cols],
    on="player_id",
    how="left",
    suffixes=("", "_player"),
)

# Age in years at the snapshot date.
_days_since_birth = (snap["snapshot_date"] - snap["date_of_birth"]).dt.days
snap["age"] = _days_since_birth / 365.25

# Years from the snapshot date to the recorded contract expiration date.
_days_to_contract_end = (
    snap["contract_expiration_date"] - snap["snapshot_date"]
).dt.days
snap["years_to_contract_end"] = _days_to_contract_end / 365.25

# Snapshot value relative to the recorded peak value (+1 avoids division by zero).
snap["mv_ratio_to_peak"] = snap["market_value_in_eur"].div(
    snap["highest_market_value_in_eur"] + 1
)

print("‚úÖ Player information added")
snap[["player_id", "name", "age", "position", "market_value_in_eur", "y_growth"]].head()


ADDING PLAYER INFORMATION
‚úÖ Player information added


Unnamed: 0,player_id,name,age,position,market_value_in_eur,y_growth
0,10,Miroslav Klose,27.586585,Attack,20000000,0.405465
1,10,Miroslav Klose,28.098563,Attack,30000000,-0.265703
2,10,Miroslav Klose,29.03217,Attack,23000000,-0.139762
3,10,Miroslav Klose,29.98768,Attack,20000000,-0.105361
4,10,Miroslav Klose,31.912389,Attack,7500000,-0.068993


In [15]:
print("\n" + "="*80)
print("ADDING SEASON PERFORMANCE FEATURES")
print("="*80)

# --- 1. Unified season_year definition (European style: season runs Jul-Jun) ---
def get_season_year_from_date(d):
    """Return the starting year of the Jul-Jun season that contains ``d``.

    Parameters
    ----------
    d : object exposing ``.year`` and ``.month`` (e.g. ``pd.Timestamp``), or a
        missing value (``None`` / ``NaN`` / ``NaT``).

    Returns
    -------
    int or float
        ``d.year`` for July-December dates, ``d.year - 1`` for January-June
        dates, and ``np.nan`` when ``d`` is missing.
    """
    if pd.isna(d):
        return np.nan
    if d.month < 7:
        # January-June still belongs to the season that started the year before.
        return d.year - 1
    return d.year

# Parse game dates (bad values become NaT) and tag every game with its season_year.
games["date"] = pd.to_datetime(games["date"], errors="coerce")
games["season_year"] = games["date"].apply(get_season_year_from_date)

# --- 2. Attach season_year to appearances and aggregate by player-season ---
apps2 = apps.merge(games[["game_id", "season_year"]], on="game_id", how="left")

perf_season = (
    apps2.groupby(["player_id", "season_year"])
    .agg(
        games_played=("appearance_id", "count"),
        minutes_total=("minutes_played", "sum"),
        goals_total=("goals", "sum"),
        assists_total=("assists", "sum"),
    )
    .reset_index()
)

# Zero denominators are turned into NaN so the divisions below cannot produce
# infinities from a zero minute/game count.
_minutes_nonzero = perf_season["minutes_total"].replace(0, np.nan)
_games_nonzero = perf_season["games_played"].replace(0, np.nan)

# Per-90 rates and average minutes per appearance.
perf_season["goals_per_90"] = perf_season["goals_total"] / _minutes_nonzero * 90
perf_season["assists_per_90"] = perf_season["assists_total"] / _minutes_nonzero * 90
perf_season["minutes_per_game"] = perf_season["minutes_total"] / _games_nonzero
# "minutes_per_90" is a plain copy of minutes_per_game, taken BEFORE the
# inf/NaN cleaning below (so it is not part of that cleaning step).
perf_season["minutes_per_90"] = perf_season["minutes_per_game"]

# Replace +/-inf with NaN, then fill the remaining gaps with 0.0.
_rate_cols = ["goals_per_90", "assists_per_90", "minutes_per_game"]
perf_season[_rate_cols] = (
    perf_season[_rate_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)
)

# --- 3. Map snapshot_date to the SAME season_year logic ---
snap["snapshot_date"] = pd.to_datetime(snap["snapshot_date"], errors="coerce")
snap["season_year"] = snap["snapshot_date"].apply(get_season_year_from_date)

# --- 4. Create complete player-season grid and merge performance ---
# Players come from the snapshots; seasons are the union of the seasons seen in
# the snapshots and in the aggregated appearances.
all_players = snap["player_id"].unique()
_snapshot_seasons = set(snap["season_year"].dropna().unique())
_performance_seasons = set(perf_season["season_year"].dropna().unique())
all_seasons = sorted(_snapshot_seasons.union(_performance_seasons))

player_season_grid = pd.MultiIndex.from_product(
    [all_players, all_seasons],
    names=["player_id", "season_year"],
).to_frame(index=False)

# Left join keeps every grid cell; player-seasons without appearances stay NaN.
perf_complete = player_season_grid.merge(
    perf_season,
    on=["player_id", "season_year"],
    how="left",
)

# --- 5. Forward/backward fill performance (including goals_total/assists_total) ---
perf_cols = [
    "minutes_total",
    "goals_per_90",
    "assists_per_90",
    "minutes_per_game",
    "minutes_per_90",
    "games_played",
    "goals_total",
    "assists_total",
]

perf_complete = perf_complete.sort_values(["player_id", "season_year"])

# NOTE(review): the backward fill copies a player's first observed season into
# earlier seasons, i.e. snapshots taken before that season receive values from
# the future. Confirm this look-ahead is acceptable for the model.
for col in perf_cols:
    if col in perf_complete.columns:
        # Forward then backward fill within each player
        perf_complete[col] = perf_complete.groupby("player_id")[col].ffill()
        perf_complete[col] = perf_complete.groupby("player_id")[col].bfill()
        # Any remaining NaNs (players with no appearances at all) -> 0
        perf_complete[col] = perf_complete[col].fillna(0.0)

# --- 6. Season-over-season deltas, computed on the player-season grid ---
# The deltas are taken here, where each player has exactly one row per season,
# and NOT on the snapshot table: a player can have several snapshots in one
# season, so shifting snapshot rows would compare a snapshot with the previous
# snapshot (often the same season, giving a delta of 0) instead of with the
# previous season.
delta_source_cols = [
    "minutes_total",
    "goals_per_90",
    "assists_per_90",
    "minutes_per_game",
    "minutes_per_90",
]
for col in delta_source_cols:
    if col in perf_complete.columns:
        # The first season on the grid has no predecessor; it is compared against 0.
        previous_season = perf_complete.groupby("player_id")[col].shift(1).fillna(0)
        perf_complete[f"prev_{col}"] = previous_season
        perf_complete[f"delta_{col}"] = perf_complete[col] - previous_season

# --- 7. Join filled performance (levels, prev_* and delta_*) back to snapshots ---
snap = snap.merge(
    perf_complete,
    on=["player_id", "season_year"],
    how="left",
)
snap = snap.sort_values(["player_id", "season_year"])


print("‚úÖ Season performance features added (aligned + filled)")



ADDING SEASON PERFORMANCE FEATURES
‚úÖ Season performance features added (aligned + filled)


In [16]:
print("\n" + "="*80)
print("ADDING CLUB AND LEAGUE FEATURES")
print("="*80)

# Club info, renamed so the columns do not collide with player/league columns.
c = clubs[
    ["club_id", "name", "domestic_competition_id", "total_market_value",
     "squad_size", "average_age"]
].rename(columns={"name": "club_name", "total_market_value": "club_total_market_value"})

# League strength (average club market value in each competition)
league_strength_df = (
    clubs[clubs["total_market_value"].notna()]
    .groupby("domestic_competition_id")["total_market_value"]
    .mean()
    .reset_index()
    .rename(columns={"total_market_value": "league_strength"})
)
if league_strength_df.empty:
    # Without any usable club value every row below ends up with the hard-coded
    # fallback constant, which makes both features constant. Say so explicitly
    # instead of failing silently.
    print(
        "WARNING: clubs['total_market_value'] has no non-null values; "
        "league_strength and club_total_market_value will fall back to constants."
    )

# Competition info, keyed like the club table (domestic_competition_id).
comp = comps[
    ["competition_id", "competition_code", "name", "country_name", "is_major_national_league"]
].rename(
    columns={
        "competition_id": "domestic_competition_id",
        "name": "league_name",
        "country_name": "league_country",
    }
)
comp = comp.merge(league_strength_df, on="domestic_competition_id", how="left")

# Join club info
snap = snap.merge(
    c, left_on="current_club_id", right_on="club_id", how="left", suffixes=("", "_club")
)

# Competition id used for the league join: the club table's value first, then
# the competition id recorded on the player row.
# A further lookup of clubs by current_club_id is not needed: the join above
# already used exactly that mapping, so it could never fill an additional row
# (and it used to leave a stray "club_id_from_club" column behind).
if "domestic_competition_id" in snap.columns:
    snap["comp_id_for_merge"] = snap["domestic_competition_id"]
else:
    snap["comp_id_for_merge"] = None

if "current_club_domestic_competition_id" in snap.columns:
    snap["comp_id_for_merge"] = snap["comp_id_for_merge"].fillna(
        snap["current_club_domestic_competition_id"]
    )

# Join league info
snap = snap.merge(
    comp,
    left_on="comp_id_for_merge",
    right_on="domestic_competition_id",
    how="left",
    suffixes=("", "_league"),
)

# Fill missing club_total_market_value: league median -> league mean -> overall
# median -> constant.
if "club_total_market_value" in snap.columns:
    # transform("median") yields NaN for rows whose group key is NaN; using it
    # only as a fill value keeps any club value those rows already have.
    league_median_value = snap.groupby("comp_id_for_merge")[
        "club_total_market_value"
    ].transform("median")
    snap["club_total_market_value"] = snap["club_total_market_value"].fillna(
        league_median_value
    )
    snap["club_total_market_value"] = snap["club_total_market_value"].fillna(
        snap["league_strength"]
    )
    overall_median = c["club_total_market_value"].median()
    snap["club_total_market_value"] = snap["club_total_market_value"].fillna(
        overall_median if pd.notna(overall_median) else 50000000
    )

# Fill missing league_strength with the mean over leagues, else a constant.
if "league_strength" in snap.columns:
    overall_avg = league_strength_df["league_strength"].mean()
    snap["league_strength"] = snap["league_strength"].fillna(
        overall_avg if pd.notna(overall_avg) else 50000000
    )

# League indicators
# .eq(True) maps missing values to 0 directly; it avoids fillna(False) on an
# object column, which relies on pandas' deprecated silent downcasting.
snap["league_is_major"] = snap["is_major_national_league"].eq(True).astype("int8")
top5_countries = {"England", "Spain", "Germany", "Italy", "France"}
snap["is_top5_league"] = snap["league_country"].isin(top5_countries).astype("int8")

print("‚úÖ Club and league features added")


ADDING CLUB AND LEAGUE FEATURES
‚úÖ Club and league features added


In [17]:
print("\n" + "="*80)
print("ADDING CLUB SEASON STATS")
print("="*80)

# Attach each game's date and season to the per-club game rows.
cg = club_games.merge(games[["game_id", "date", "season"]], on="game_id", how="left")

# One row per club and season: games, wins, goals for and goals against.
club_season_stats = (
    cg.groupby(["club_id", "season"])
    .agg(
        club_games_played=("game_id", "count"),
        club_wins=("is_win", "sum"),
        club_goals_for=("own_goals", "sum"),
        club_goals_against=("opponent_goals", "sum"),
    )
    .reset_index()
)

# Rates per game; a zero game count becomes NaN so the division yields NaN.
_club_games_nonzero = club_season_stats["club_games_played"].replace(0, np.nan)
club_season_stats["club_win_rate"] = (
    club_season_stats["club_wins"] / _club_games_nonzero
)
_club_goal_diff = (
    club_season_stats["club_goals_for"] - club_season_stats["club_goals_against"]
)
club_season_stats["club_goal_diff_per_game"] = _club_goal_diff / _club_games_nonzero

# The games table's "season" column is used as the season_year join key here.
club_season_stats = club_season_stats.rename(columns={"season": "season_year"})

# Complete club x season grid for every club and season present in the snapshots.
all_clubs = snap["current_club_id"].dropna().unique()
all_seasons_club = sorted(snap["season_year"].unique())

club_season_grid = pd.MultiIndex.from_product(
    [all_clubs, all_seasons_club],
    names=["club_id", "season_year"],
).to_frame(index=False)

club_season_complete = club_season_grid.merge(
    club_season_stats,
    on=["club_id", "season_year"],
    how="left",
)

# Within each club, carry stats forward and then backward across seasons.
club_season_complete = club_season_complete.sort_values(["club_id", "season_year"])
club_stat_cols = ["club_win_rate", "club_goal_diff_per_game", "club_games_played"]
for col in club_stat_cols:
    if col not in club_season_complete.columns:
        continue
    club_season_complete[col] = club_season_complete.groupby("club_id")[col].ffill()
    club_season_complete[col] = club_season_complete.groupby("club_id")[col].bfill()

# Clubs with no stats at all: use the mean win rate of the same season (taken
# across all clubs on the grid), then 0.5 when that is missing too.
if "club_win_rate" in club_season_complete.columns:
    league_avg = club_season_complete.groupby("season_year")["club_win_rate"].transform("mean")
    club_season_complete["club_win_rate"] = (
        club_season_complete["club_win_rate"].fillna(league_avg).fillna(0.5)
    )

# A missing goal difference is treated as neutral (0).
if "club_goal_diff_per_game" in club_season_complete.columns:
    club_season_complete["club_goal_diff_per_game"] = club_season_complete[
        "club_goal_diff_per_game"
    ].fillna(0)

# Join to snapshots on (current club, season).
snap = snap.merge(
    club_season_complete,
    left_on=["current_club_id", "season_year"],
    right_on=["club_id", "season_year"],
    how="left",
    suffixes=("", "_club_season"),
)

print("‚úÖ Club season stats added")


ADDING CLUB SEASON STATS
‚úÖ Club season stats added


In [18]:
print("\n" + "=" * 80)
print("ADDING REQUIRED INPUT FEATURES (SPEC)")
print("=" * 80)

# ---------------------------------------------------------------------
# 1) position_group: GK / DF / MF / FW
# ---------------------------------------------------------------------
def map_position_group(pos):
    """Collapse a free-text position label into GK / DF / MF / FW.

    Matching is done on the lower-cased label by substring, checked in the
    order goalkeeper, defender, midfielder. Any non-missing label that matches
    none of them is returned as "FW".

    Parameters
    ----------
    pos : position label (converted with ``str``), or a missing value.

    Returns
    -------
    str or float
        One of "GK", "DF", "MF", "FW"; ``np.nan`` when ``pos`` is missing.
    """
    if pd.isna(pos):
        return np.nan

    label = str(pos).lower()

    # Goalkeepers: exact short code or any label containing "keeper".
    if label == "gk" or "keeper" in label:
        return "GK"

    # Substring markers, tested group by group; defenders take precedence
    # over midfielders.
    markers_by_group = (
        (
            "DF",
            (
                "back",
                "defend",
                "centre-back",
                "center-back",
                "cb",
                "fullback",
                "wing back",
                "wing-back",
            ),
        ),
        ("MF", ("midfield", "dm", "am", "cm", "lm", "rm", "mid")),
    )
    for group, markers in markers_by_group:
        for marker in markers:
            if marker in label:
                return group

    # Everything else is treated as a forward / attacker.
    return "FW"


snap["position_group"] = snap["position"].map(map_position_group)

# ---------------------------------------------------------------------
# 2) league_level: league strength tier
#    1 = top 5 leagues
#    2 = other "major" leagues
#    3 = all others
# ---------------------------------------------------------------------
# Only derived when the column does not exist yet.
if "league_level" not in snap.columns:
    _has_top5_flag = "is_top5_league" in snap.columns
    _has_major_flag = "league_is_major" in snap.columns

    if _has_top5_flag or _has_major_flag:
        # Start everyone at tier 3, promote major leagues to 2, then let the
        # top-5 flag override with 1.
        _tier = np.full(len(snap), 3, dtype="int64")
        if _has_major_flag:
            _tier[(snap["league_is_major"] == 1).to_numpy()] = 2
        if _has_top5_flag:
            _tier[(snap["is_top5_league"] == 1).to_numpy()] = 1
        snap["league_level"] = _tier

    elif "league_strength" in snap.columns:
        # Fallback: tiers from the 33% / 66% quantiles of league_strength.
        # Missing strengths fail both comparisons and therefore land in tier 3.
        q_low, q_high = snap["league_strength"].quantile([0.33, 0.66]).values
        snap["league_level"] = np.select(
            [snap["league_strength"] >= q_high, snap["league_strength"] >= q_low],
            [1, 2],
            default=3,
        )

    snap["league_level"] = snap["league_level"].astype("int8", errors="ignore")

# ---------------------------------------------------------------------
# 3) mv_1y_change: yearly market value change (log-scale)
#    Season-average market value per player, compared with the player's
#    previous observed season.
#    A player does not necessarily have a snapshot in every season, so the
#    previous observed season can be more than one year back. The log change
#    is therefore divided by the number of seasons between the two
#    observations: consecutive seasons are unchanged (gap = 1) and longer gaps
#    become a per-year rate instead of a multi-year change labelled "1y".
# ---------------------------------------------------------------------
mv_year = (
    snap.groupby(["player_id", "season_year"])["market_value_in_eur"]
    .mean()
    .reset_index()
    .sort_values(["player_id", "season_year"])
)

mv_year["prev_mv_year"] = mv_year.groupby("player_id")["market_value_in_eur"].shift(1)
# Seasons elapsed since the previous observed season (NaN for the first one).
mv_year["season_gap"] = (
    mv_year["season_year"] - mv_year.groupby("player_id")["season_year"].shift(1)
)
mv_year["mv_1y_change"] = (
    np.log(mv_year["market_value_in_eur"] + 1) - np.log(mv_year["prev_mv_year"] + 1)
) / mv_year["season_gap"]

# Dropping a stale column first keeps the merge from producing _x/_y
# duplicates when this cell is executed more than once.
snap = snap.drop(columns=["mv_1y_change"], errors="ignore").merge(
    mv_year[["player_id", "season_year", "mv_1y_change"]],
    on=["player_id", "season_year"],
    how="left",
)

# First observed season of a player has nothing to compare with -> 0.
snap["mv_1y_change"] = snap["mv_1y_change"].fillna(0.0)

# ---------------------------------------------------------------------
# 4) perf_1y_change: yearly change in performance composite
#    Only goals_per_90 and assists_per_90 are used (no ratings available):
#    perf_base = 0.6*g90 + 0.4*a90
#    Annualized across season gaps in the same way as mv_1y_change.
# ---------------------------------------------------------------------
perf_year = (
    snap.groupby(["player_id", "season_year"])
    .agg(
        goals_per_90=("goals_per_90", "mean"),
        assists_per_90=("assists_per_90", "mean"),
    )
    .reset_index()
    .sort_values(["player_id", "season_year"])
)

# Build performance composite (only from goals/assists)
perf_year["perf_base"] = (
    0.6 * perf_year["goals_per_90"].fillna(0)
    + 0.4 * perf_year["assists_per_90"].fillna(0)
)

perf_year["prev_perf_base"] = perf_year.groupby("player_id")["perf_base"].shift(1)
perf_year["season_gap"] = (
    perf_year["season_year"] - perf_year.groupby("player_id")["season_year"].shift(1)
)
perf_year["perf_1y_change"] = (
    perf_year["perf_base"] - perf_year["prev_perf_base"]
) / perf_year["season_gap"]

snap = snap.drop(columns=["perf_1y_change"], errors="ignore").merge(
    perf_year[["player_id", "season_year", "perf_1y_change"]],
    on=["player_id", "season_year"],
    how="left",
)

snap["perf_1y_change"] = snap["perf_1y_change"].fillna(0.0)

print("‚úÖ Required input features created: position_group, league_level, mv_1y_change, perf_1y_change")



ADDING REQUIRED INPUT FEATURES (SPEC)
‚úÖ Required input features created: position_group, league_level, mv_1y_change, perf_1y_change


In [19]:
print("\n" + "="*80)
print("ADDING TRANSFER FEATURES")
print("="*80)

# Independent copy of the transfers table with transfer_date parsed to
# datetime; unparseable values become NaT.
transfers2 = transfers.assign(
    transfer_date=pd.to_datetime(transfers["transfer_date"], errors="coerce")
)

# Convert season string to year
def season_str_to_year(s):
    """Convert a season label such as "23/24" into its starting calendar year.

    The part before the first "/" is parsed as an integer:

    * values of 100 or more are taken to be a full year already
      ("2019/2020" -> 2019);
    * two-digit values from 70 to 99 are mapped to the 1900s ("99/00" -> 1999);
    * two-digit values below 70 are mapped to the 2000s ("05/06" -> 2005).

    Adding 2000 unconditionally would place "99/00" in the year 2099.

    Parameters
    ----------
    s : season label (converted with ``str``), or a missing value.

    Returns
    -------
    int or float
        The starting year, or ``np.nan`` when ``s`` is missing.

    Raises
    ------
    ValueError
        If the part before the first "/" is not an integer.
    """
    if pd.isna(s):
        return np.nan
    first = int(str(s).split("/")[0])
    if first >= 100:
        # Already a four-digit year.
        return first
    century = 1900 if first >= 70 else 2000
    return century + first

transfers2["season_year"] = transfers2["transfer_season"].apply(season_str_to_year)

# Club market values, once keyed for the selling side and once for the buying side.
from_club_mv = clubs[["club_id", "total_market_value"]].rename(
    columns={"club_id": "from_club_id", "total_market_value": "from_club_mv"}
)
to_club_mv = clubs[["club_id", "total_market_value"]].rename(
    columns={"club_id": "to_club_id", "total_market_value": "to_club_mv"}
)

transfers2 = transfers2.merge(from_club_mv, on="from_club_id", how="left").merge(
    to_club_mv, on="to_club_id", how="left"
)

# 1 when the destination club's value exceeds the origin club's value.
# A comparison involving a missing value is False, so unknown club values
# yield 0 as well.
transfers2["moved_to_bigger_club"] = (
    transfers2["to_club_mv"].gt(transfers2["from_club_mv"]).astype("int8")
)

# One row per player and season: number of dated transfers and whether any of
# them went to a bigger club.
transfer_season = (
    transfers2.groupby(["player_id", "season_year"])
    .agg(
        has_recent_transfer_count=("transfer_date", "count"),
        moved_to_bigger_club_flag=("moved_to_bigger_club", "max"),
    )
    .reset_index()
)
transfer_season["has_recent_transfer"] = (
    transfer_season["has_recent_transfer_count"].gt(0).astype("int8")
)

# Join to snapshots on (player, season).
_transfer_flag_cols = ["has_recent_transfer", "moved_to_bigger_club_flag"]
snap = snap.merge(
    transfer_season[["player_id", "season_year"] + _transfer_flag_cols],
    on=["player_id", "season_year"],
    how="left",
)

# Snapshots without a matching transfer row get 0 for both flags.
for _flag in _transfer_flag_cols:
    snap[_flag] = snap[_flag].fillna(0).astype("int8")

print("‚úÖ Transfer features added")


ADDING TRANSFER FEATURES
‚úÖ Transfer features added


In [None]:
print("\n" + "="*80)
print("FINAL CLEANUP AND SAVE")
print("="*80)

# Keep snapshots dated 2010 or later.
snap = snap.loc[snap["snapshot_date"].dt.year.ge(2010)].copy()


def _fill_age_with_group_median(ages):
    """Fill missing ages with the group's median, or 25 when that median is not > 0."""
    group_median = ages.median()
    return ages.fillna(group_median if group_median > 0 else 25)


def _fill_with_group_median(values):
    """Fill missing values with the group's median."""
    return values.fillna(values.median())


# Fill critical nulls: per-position median first, then a fixed default.
if "position" in snap.columns:
    snap["age"] = snap.groupby("position")["age"].transform(_fill_age_with_group_median)
snap["age"] = snap["age"].fillna(25)

if "height_in_cm" in snap.columns:
    if "position" in snap.columns:
        snap["height_in_cm"] = snap.groupby("position")["height_in_cm"].transform(
            _fill_with_group_median
        )
    snap["height_in_cm"] = snap["height_in_cm"].fillna(180)

# Unknown contract end is filled with 5 (years).
snap["years_to_contract_end"] = snap["years_to_contract_end"].fillna(5)

# Performance levels and their deltas: any remaining gap becomes 0.
perf_cols_final = ["minutes_total", "goals_per_90", "assists_per_90",
                   "minutes_per_game", "games_played"]
delta_cols = ["delta_minutes_total", "delta_goals_per_90", "delta_assists_per_90"]
for col in perf_cols_final + delta_cols:
    if col in snap.columns:
        snap[col] = snap[col].fillna(0)

# Club stats: neutral defaults (0.5 win rate, 0 goal difference).
for _club_col, _neutral_value in (("club_win_rate", 0.5), ("club_goal_diff_per_game", 0)):
    snap[_club_col] = snap[_club_col].fillna(_neutral_value)

# Missing preferred foot defaults to "right".
if "foot" in snap.columns:
    snap["foot"] = snap["foot"].fillna("right")

# Core columns of the final table (BASIC VERSION - no extra variables).
core_cols = [
    # Keys and player attributes
    "player_id", "snapshot_date", "season_year",
    "name", "age", "position", "position_group",
    "sub_position", "foot", "height_in_cm", "country_of_citizenship",

    # Market
    "market_value_in_eur", "highest_market_value_in_eur", "mv_ratio_to_peak",
    "y_growth", "future_market_value", "years_to_contract_end",
    "mv_1y_change",

    # Performance level
    "minutes_total", "games_played", "minutes_per_game", "minutes_per_90",
    "goals_total", "assists_total",
    "goals_per_90", "assists_per_90",

    # Performance growth / momentum
    "delta_minutes_total", "delta_minutes_per_90",
    "delta_goals_per_90", "delta_assists_per_90",
    "perf_1y_change",

    # Club & league
    "current_club_id", "club_name", "club_total_market_value", "club_win_rate",
    "club_goal_diff_per_game", "league_name", "league_country",
    "league_strength", "league_is_major", "is_top5_league", "league_level",

    # Transfer
    "has_recent_transfer", "moved_to_bigger_club_flag",
]

# Requested columns that are absent from snap are skipped without an error.
core_cols = [name for name in core_cols if name in snap.columns]
player_snapshot = snap.loc[:, core_cols].copy()


print("‚úÖ Final cleanup complete")
_n_rows, _n_cols = player_snapshot.shape
print(f"\nFinal snapshot: {_n_rows:,} rows √ó {_n_cols} columns")

# ============================================================================
# DATA QUALITY CHECKS
# ============================================================================
print("\n" + "="*80)
print("RUNNING DATA QUALITY CHECKS")
print("="*80)

def check_nulls(df, name="DataFrame"):
    """Print and return a per-column report of null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str
        Label used in the printed header.

    Returns
    -------
    pandas.DataFrame
        Columns ``Column``, ``Null_Count`` and ``Null_Percentage`` (rounded to
        two decimals), restricted to columns with at least one null and sorted
        by ``Null_Percentage`` in descending order.
    """
    print(f"\n[{name}] Null Value Check:")
    print("-" * 80)

    missing_per_column = df.isnull().sum()
    missing_share = (missing_per_column / len(df) * 100).round(2)

    report = pd.DataFrame({
        'Column': missing_per_column.index,
        'Null_Count': missing_per_column.values,
        'Null_Percentage': missing_share.values
    })
    # Only columns that actually contain nulls, worst first.
    report = report.loc[report['Null_Count'] > 0].sort_values(
        'Null_Percentage', ascending=False
    )

    if report.empty:
        print("‚úÖ No null values found!")
        return report

    print(f"‚ö†Ô∏è  Found {len(report)} columns with null values:")
    for col_name, n_missing, pct_missing in zip(
        report['Column'], report['Null_Count'], report['Null_Percentage']
    ):
        print(f"  - {col_name}: {n_missing:,} ({pct_missing:.2f}%)")

    return report

def test_modeling_readiness(df, name="DataFrame"):
    """Comprehensive test to check if data is ready for modeling"""
    print(f"\n{'='*80}")
    print(f"MODELING READINESS TEST: {name}")
    print("="*80)
    
    issues = []
    all_passed = True
    
    # Test 1: Basic Information
    print(f"\n[1] Basic Information:")
    print(f"    Total rows: {len(df):,}")
    print(f"    Total columns: {len(df.columns)}")
    
    if len(df) == 0:
        issues.append("DataFrame is empty!")
        all_passed = False
        return False, issues
    
    # Test 2: Target Variable Check
    print(f"\n[2] Target Variable Check:")
    if 'y_growth' in df.columns:
        target_nulls = df['y_growth'].isnull().sum()
        target_pct = (target_nulls / len(df) * 100)
        if target_nulls == 0:
            print(f"    ‚úÖ Target 'y_growth' found")
            print(f"    Nulls: {target_nulls} ({target_pct:.2f}%)")
        else:
            print(f"    ‚ö†Ô∏è  Target 'y_growth' has {target_nulls} nulls ({target_pct:.2f}%)")
            issues.append(f"Target variable has {target_pct:.2f}% nulls")
            all_passed = False
    else:
        print(f"    ‚ùå Target 'y_growth' NOT found!")
        issues.append("Target variable 'y_growth' is missing")
        all_passed = False
    
    # Test 3: Null Value Analysis
    print(f"\n[3] Null Value Analysis:")
    null_counts = df.isnull().sum()
    null_pct = (null_counts / len(df) * 100)
    
    # Expected nulls (from original CSV)
    expected_nulls = ['foot', 'country_of_citizenship', 'sub_position']
    
    critical_cols = [
        'market_value_in_eur', 'age', 'position', 'minutes_total', 
        'goals_per_90', 'club_total_market_value', 'league_strength'
    ]
    
    high_null_cols = []
    for col in df.columns:
        if col in expected_nulls:
            continue  # Skip expected nulls
        pct = null_pct[col]
        if pct > 5:  # More than 5% nulls
            high_null_cols.append((col, pct))
            if col in critical_cols:
                issues.append(f"Critical column '{col}' has {pct:.2f}% nulls")
                all_passed = False
                print(f"    ‚ùå {col:35s} {pct:6.2f}% nulls (max: 5%)")
            else:
                print(f"    ‚ö†Ô∏è  {col:35s} {pct:6.2f}% nulls (max: 5%)")
        else:
            print(f"    ‚úÖ {col:35s} {pct:6.2f}% nulls (max: 0%)")
    
    if len(high_null_cols) == 0:
        print(f"    ‚úÖ All columns have acceptable null percentages")
    
    # Test 4: Data Type Check
    print(f"\n[4] Data Type Check:")
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    print(f"    Numeric columns: {len(numeric_cols)}")
    print(f"    Categorical columns: {len(categorical_cols)}")
    
    # Test 5: Feature Completeness
    print(f"\n[5] Feature Completeness:")
    expected_features = [
        'player_id', 'snapshot_date', 'y_growth', 'market_value_in_eur',
        'age', 'position', 'goals_per_90', 'assists_per_90',
        'club_total_market_value', 'league_strength'
    ]
    missing_features = [f for f in expected_features if f not in df.columns]
    if len(missing_features) == 0:
        print(f"    ‚úÖ All expected features present")
    else:
        print(f"    ‚ö†Ô∏è  Missing features: {missing_features}")
        issues.append(f"Missing features: {missing_features}")
        all_passed = False
    
    # Test 6: Data Quality Checks
    print(f"\n[6] Data Quality Checks:")
    # Check for infinite values
    inf_cols = []
    for col in numeric_cols:
        if np.isinf(df[col]).any():
            inf_cols.append(col)
    
    if len(inf_cols) == 0:
        print(f"    ‚úÖ No infinite values")
    else:
        print(f"    ‚ö†Ô∏è  Infinite values found in: {inf_cols}")
        issues.append(f"Infinite values in: {inf_cols}")
        all_passed = False
    
    # Check for extreme outliers in target
    if 'y_growth' in df.columns:
        target_clean = df['y_growth'].dropna()
        q1, q99 = target_clean.quantile([0.01, 0.99])
        outliers = ((target_clean < q1) | (target_clean > q99)).sum()
        outlier_pct = (outliers / len(target_clean) * 100)
        print(f"    Target outliers (1%-99%): {outliers:,} ({outlier_pct:.2f}%)")
        if outlier_pct > 5:
            print(f"    ‚ö†Ô∏è  High outlier percentage (consider clipping)")
    
    # Test 7: Summary Statistics
    print(f"\n[7] Summary Statistics:")
    if 'y_growth' in df.columns:
        target_stats = df['y_growth'].describe()
        print(f"    Target (y_growth) stats:")
        print(f"      Mean: {target_stats['mean']:.4f}")
        print(f"      Std:  {target_stats['std']:.4f}")
        print(f"      Min:  {target_stats['min']:.4f}")
        print(f"      Max:  {target_stats['max']:.4f}")
    
    # Final Verdict
    print("\n" + "=" * 80)
    if all_passed and len(issues) == 0:
        print("‚úÖ RESULT: Data is READY for modeling!")
        print("=" * 80)
    else:
        print("‚ö†Ô∏è  RESULT: Data has some issues that should be addressed:")
        print("=" * 80)
        for i, issue in enumerate(issues, 1):
            print(f"  {i}. {issue}")
        print("\nüí° Recommendation: Review and fix the issues above before modeling.")
    
    return all_passed, issues

# Null report for the final table.
null_report = check_nulls(player_snapshot, name="Final Player Snapshot")

# Modeling readiness checklist; the verdict is printed by the function itself.
is_ready, issues = test_modeling_readiness(player_snapshot, name="Final Player Snapshot")

# Write the final table to parquet (without the index). The file is written
# regardless of the readiness verdict above.
out_path = os.path.join(PROCESSED_DIR, "player_snapshot.parquet")
player_snapshot.to_parquet(out_path, index=False)
print(f"\n‚úÖ Saved to: {out_path}")

# Preview
print("\n" + "="*80, "DATA PREVIEW", "="*80, sep="\n")
player_snapshot.head()


FINAL CLEANUP AND SAVE
‚úÖ Final cleanup complete

Final snapshot: 169,226 rows √ó 43 columns

RUNNING DATA QUALITY CHECKS

[Final Player Snapshot] Null Value Check:
--------------------------------------------------------------------------------
‚ö†Ô∏è  Found 2 columns with null values:
  - country_of_citizenship: 1,845 (1.09%)
  - sub_position: 294 (0.17%)

MODELING READINESS TEST: Final Player Snapshot

[1] Basic Information:
    Total rows: 169,226
    Total columns: 43

[2] Target Variable Check:
    ‚úÖ Target 'y_growth' found
    Nulls: 0 (0.00%)

[3] Null Value Analysis:
    ‚úÖ player_id                             0.00% nulls (max: 0%)
    ‚úÖ snapshot_date                         0.00% nulls (max: 0%)
    ‚úÖ season_year                           0.00% nulls (max: 0%)
    ‚úÖ name                                  0.00% nulls (max: 0%)
    ‚úÖ age                                   0.00% nulls (max: 0%)
    ‚úÖ position                              0.00% nulls (max: 0%)
    ‚

Unnamed: 0,player_id,snapshot_date,season_year,name,age,position,position_group,sub_position,foot,height_in_cm,...,club_win_rate,club_goal_diff_per_game,league_name,league_country,league_strength,league_is_major,is_top5_league,league_level,has_recent_transfer,moved_to_bigger_club_flag
4,10,2010-05-08,2009,Miroslav Klose,31.912389,Attack,FW,Centre-Forward,right,184.0,...,0.526316,0.561404,serie-a,Italy,50000000.0,1,1,1,0,0
5,10,2011-06-29,2010,Miroslav Klose,33.054073,Attack,FW,Centre-Forward,right,184.0,...,0.526316,0.561404,serie-a,Italy,50000000.0,1,1,1,0,0
6,10,2012-07-03,2012,Miroslav Klose,34.067077,Attack,FW,Centre-Forward,right,184.0,...,0.526316,0.561404,serie-a,Italy,50000000.0,1,1,1,0,0
7,10,2013-06-19,2012,Miroslav Klose,35.028063,Attack,FW,Centre-Forward,right,184.0,...,0.526316,0.561404,serie-a,Italy,50000000.0,1,1,1,0,0
8,10,2014-01-07,2013,Miroslav Klose,35.581109,Attack,FW,Centre-Forward,right,184.0,...,0.387755,-0.020408,serie-a,Italy,50000000.0,1,1,1,0,0
