# Player Snapshot Feature Engineering

依照 `feature_engineering_idea.md` 的設計，把 Transfermarkt 原始 CSV 壓成單一的球員 snapshot table。每個 row 代表 `(player_id, snapshot_date)` 的狀態，所有特徵都只依賴該時間點（含當日）之前的資訊。

In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import re

# Show wide feature tables in full when they are displayed.
pd.set_option('display.max_columns', 120)

# The project root is the working directory unless a non-empty
# PROJECT_ROOT_OVERRIDE environment variable points somewhere else.
override_root = os.environ.get('PROJECT_ROOT_OVERRIDE')
PROJECT_ROOT = Path(override_root) if override_root else Path.cwd()

# Raw CSVs are read from <root>/data; the feature table is written to
# <root>/data/processed, which is created here if it does not exist yet.
DATA_DIR = PROJECT_ROOT / 'data'
OUTPUT_DIR = DATA_DIR / 'processed'
OUTPUT_PATH = OUTPUT_DIR / 'player_snapshot_features.csv'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f'Project root: {PROJECT_ROOT}')
print(f'Output path: {OUTPUT_PATH}')


Project root: /Users/wuyusen/Desktop/Northwestern MLDS/2025 MLDS Hackathon/hackathon-2025-evan-ston-energy
Output path: /Users/wuyusen/Desktop/Northwestern MLDS/2025 MLDS Hackathon/hackathon-2025-evan-ston-energy/data/processed/player_snapshot_features.csv


In [5]:
# Helper functions for value parsing and rolling window aggregations.
def parse_euro_value(value: str) -> float:
    """Normalize Transfermarkt value strings (e.g. '+€3.05m') into floats.

    The euro sign, thousands commas and a leading '+' are stripped, then a
    magnitude suffix is applied: 'bn' (1e9), 'm' (1e6), 'k' or 'Th.' (1e3),
    matched case-insensitively. A minus sign is preserved.

    Args:
        value: Raw cell content; missing values (None/NaN) are accepted.

    Returns:
        The amount as a float, or NaN when the value is missing, blank, a
        lone '-', or cannot be read as a number.
    """
    if pd.isna(value):
        return np.nan
    text = str(value).replace('€', '').replace(',', '').replace('+', '').strip()
    if text in {'', '-'}:
        return np.nan

    multiplier = 1.0
    lower = text.lower()
    # NOTE(review): 'Th.' is assumed to be a thousands abbreviation used by
    # some Transfermarkt exports; confirm against the raw data.
    for suffix, scale in (('bn', 1e9), ('th.', 1e3), ('m', 1e6), ('k', 1e3)):
        if lower.endswith(suffix):
            multiplier = scale
            text = text[:-len(suffix)]
            break

    # Drop anything that is not part of a signed decimal number.
    cleaned = re.sub(r'[^0-9.\-]', '', text)
    try:
        return float(cleaned) * multiplier
    except ValueError:
        # Covers '', '-', '.', '1.2.3', ...: treat as missing rather than
        # aborting a whole DataFrame.apply over one malformed cell.
        return np.nan


def compute_cumulative_table(df, entity_col, date_col, aggregations):
    """Build per-entity running totals of daily aggregated metrics.

    Args:
        df: Event-level rows.
        entity_col: Column identifying the entity (e.g. a player or club id).
        date_col: Column holding the event date; converted with
            ``pd.to_datetime``.
        aggregations: Mapping ``feature_name -> (source_column, aggfunc)``.

    Returns:
        A ``(table, cumulative_cols)`` pair. ``table`` has one row per
        ``(entity, date)`` with a ``cum_<feature_name>`` running total per
        aggregation; ``cumulative_cols`` lists those column names. An empty
        ``df`` yields an empty two-column table and an empty list.
    """
    if df.empty:
        return pd.DataFrame(columns=[entity_col, date_col]), []

    # Rows without an entity or a date cannot be placed on a timeline.
    rows = df.dropna(subset=[entity_col, date_col]).copy()
    rows[date_col] = pd.to_datetime(rows[date_col])
    rows = rows.sort_values([entity_col, date_col])

    # First collapse to one row per entity and day ...
    agg_spec = {
        name: pd.NamedAgg(column=source, aggfunc=func)
        for name, (source, func) in aggregations.items()
    }
    daily = rows.groupby([entity_col, date_col], as_index=False).agg(**agg_spec)

    # ... then accumulate each daily metric along the entity's timeline.
    cumulative_cols = [f'cum_{name}' for name in aggregations]
    for name, cum_name in zip(aggregations, cumulative_cols):
        daily[cum_name] = daily.groupby(entity_col)[name].cumsum()

    return daily[[entity_col, date_col] + cumulative_cols], cumulative_cols


def window_aggregate(
    base_df,
    cumulative_df,
    base_entity_col,
    base_date_col,
    entity_col,
    date_col,
    cumulative_cols,
    window_days,
    prefix,
    offset_days=0,
):
    """Compute trailing-window totals from a cumulative table.

    For every row of ``base_df`` the window ends at
    ``base_date - offset_days`` (inclusive) and starts ``window_days``
    earlier (exclusive). Each total is the entity's cumulative value at the
    window end minus its cumulative value at the window start, located with
    ``np.searchsorted`` on the entity's sorted dates.

    Args:
        base_df: Rows to compute features for.
        cumulative_df: Output of ``compute_cumulative_table``.
        base_entity_col: Entity id column in ``base_df``.
        base_date_col: Reference date column in ``base_df``.
        entity_col: Entity id column in ``cumulative_df``.
        date_col: Date column in ``cumulative_df``.
        cumulative_cols: Running-total columns to difference.
        window_days: Window length in whole days.
        prefix: Prefix for output columns; a leading ``cum_`` is removed from
            each source column name before the prefix is applied.
        offset_days: Days to shift the window end back from the base date.

    Returns:
        A float DataFrame indexed by the ``base_df`` index values (index name
        ``__row_id``) with one ``<prefix><metric>`` column per cumulative
        column. Rows with a missing base date are omitted; entities without
        history get 0.0. If ``cumulative_df`` is empty or no columns are
        requested, a column-less frame on ``base_df.index`` is returned.
    """
    if cumulative_df.empty or not cumulative_cols:
        return pd.DataFrame(index=base_df.index)

    # Strip only a leading 'cum_' so metric names that merely contain that
    # substring are not mangled.
    feature_names = [
        f"{prefix}{col[len('cum_'):] if col.startswith('cum_') else col}"
        for col in cumulative_cols
    ]

    targets = base_df[[base_entity_col, base_date_col]].dropna(subset=[base_date_col]).copy()
    # Remember the original index labels explicitly; this also works when the
    # base index carries a name.
    targets['__row_id'] = targets.index
    targets['__entity_key'] = targets[base_entity_col].fillna(-1).astype(float)
    targets['__target_date'] = (
        pd.to_datetime(targets[base_date_col]) - pd.to_timedelta(offset_days, unit='D')
    )
    if targets.empty:
        # Nothing to look up; return an empty frame with the expected columns
        # instead of failing on an empty concatenation.
        return pd.DataFrame(
            columns=feature_names,
            index=pd.Index([], name='__row_id'),
            dtype=float,
        )

    history = cumulative_df.dropna(subset=[entity_col, date_col]).copy()
    history[date_col] = pd.to_datetime(history[date_col])
    history['__entity_key'] = history[entity_col].astype(float)
    history = history.sort_values(['__entity_key', date_col])
    # Split the history once per entity instead of re-filtering the whole
    # table for every entity in the loop below.
    history_by_entity = {
        key: (grp[date_col].to_numpy(), grp[cumulative_cols].to_numpy(dtype=float))
        for key, grp in history.groupby('__entity_key', sort=False)
    }

    window_delta = np.timedelta64(window_days, 'D')
    n_features = len(cumulative_cols)
    row_ids = []
    deltas = []
    for entity_key, group in targets.groupby('__entity_key'):
        row_ids.append(group['__row_id'].to_numpy())
        entry = history_by_entity.get(entity_key)
        if entry is None:
            deltas.append(np.zeros((len(group), n_features)))
            continue
        dates, cum_values = entry
        window_end = group['__target_date'].to_numpy()
        # Position of the last history row on or before each boundary;
        # -1 means there is no history yet at that point.
        idx_end = np.searchsorted(dates, window_end, side='right') - 1
        idx_start = np.searchsorted(dates, window_end - window_delta, side='right') - 1
        at_end = np.where((idx_end >= 0)[:, None], cum_values[idx_end], 0.0)
        at_start = np.where((idx_start >= 0)[:, None], cum_values[idx_start], 0.0)
        deltas.append(at_end - at_start)

    features = pd.DataFrame(
        np.vstack(deltas),
        columns=feature_names,
        index=pd.Index(np.concatenate(row_ids), name='__row_id'),
    )
    return features.sort_index()




def safe_per_90(numerator, minutes):
    """Scale a counting stat to a per-90-minutes rate.

    Both inputs are cast to float. Entries whose minutes are not strictly
    positive (zero, negative or NaN) yield 0.0 instead of inf/NaN. The result
    is returned as a NumPy array.
    """
    counts = numerator.astype(float)
    played = minutes.astype(float)
    has_minutes = played > 0
    # The division is evaluated for every entry; silence the warnings for the
    # ones that are replaced by 0.0 below.
    with np.errstate(divide='ignore', invalid='ignore'):
        rate = counts / played * 90.0
        result = np.where(has_minutes, rate, 0.0)
    return result


def compute_linear_momentum(group, window_days):
    """Fit the market-value trend (slope per day) in a trailing window.

    For each row, a least-squares line is fitted through the
    ``current_market_value`` of all rows whose ``snapshot_date`` lies in
    ``(snapshot_date - window_days, snapshot_date]``, using whole days since
    the first row of the window as x.

    Args:
        group: Rows of a single player with ``snapshot_date`` and
            ``current_market_value`` columns.
        window_days: Length of the trailing window in days.

    Returns:
        A float Series of slopes indexed like ``group`` after sorting by
        ``snapshot_date``. The slope is NaN when the window holds fewer than
        two finite valuations, when they all fall on the same day, or when
        the row has no snapshot date.
    """
    group = group.sort_values('snapshot_date')
    dates = group['snapshot_date'].to_numpy(dtype='datetime64[ns]')
    values = group['current_market_value'].to_numpy(dtype=float)
    slopes = np.full(len(group), np.nan)

    # sort_values places missing dates last; only the dated prefix is usable.
    n_dated = int((~np.isnat(dates)).sum())
    dates = dates[:n_dated]
    values = values[:n_dated]

    window = pd.to_timedelta(window_days, unit='D').to_timedelta64()
    one_day = np.timedelta64(1, 'D')
    # Window bounds per row: first index strictly after the window start and
    # one past the last index on the row's own date.
    starts = np.searchsorted(dates, dates - window, side='right')
    stops = np.searchsorted(dates, dates, side='right')

    for pos in range(n_dated):
        lo, hi = starts[pos], stops[pos]
        if hi - lo < 2:
            continue
        x = np.floor((dates[lo:hi] - dates[lo]) / one_day)
        y = values[lo:hi]
        # Non-finite valuations would make the fit fail or return NaN.
        usable = np.isfinite(y)
        if usable.sum() < 2:
            continue
        x_fit = x[usable]
        if x_fit.max() == x_fit.min():
            # All points on one day: the slope is undefined.
            continue
        slopes[pos] = np.polyfit(x_fit, y[usable], 1)[0]

    return pd.Series(slopes, index=group.index)


In [6]:
# Load the raw CSV files from the data directory.
def _read_raw(filename, *date_columns):
    """Read one raw CSV from DATA_DIR, parsing the named columns as dates."""
    options = {'parse_dates': list(date_columns)} if date_columns else {}
    return pd.read_csv(DATA_DIR / filename, **options)


players = _read_raw('players.csv', 'date_of_birth', 'contract_expiration_date')
valuations = _read_raw('player_valuations.csv', 'date')
appearances = _read_raw('appearances.csv', 'date')
games = _read_raw('games.csv', 'date')
# These three tables are loaded without any date parsing.
club_games = _read_raw('club_games.csv')
clubs = _read_raw('clubs.csv')
competitions = _read_raw('competitions.csv')
game_events = _read_raw('game_events.csv', 'date')
game_lineups = _read_raw('game_lineups.csv', 'date')
transfers = _read_raw('transfers.csv', 'transfer_date')

print(f'Players: {players.shape}, Valuations: {valuations.shape}, Appearances: {appearances.shape}')


Players: (32601, 23), Valuations: (496606, 5), Appearances: (1706806, 13)


In [None]:
# Basic cleaning: coerce numeric columns and attach helper references.
def _coerce_numeric(series):
    """Convert a column to numbers, turning unparseable entries into NaN."""
    return pd.to_numeric(series, errors='coerce')


# Per-appearance counting stats: a missing or unparseable entry counts as 0.
numeric_cols = ['goals', 'assists', 'minutes_played', 'yellow_cards', 'red_cards']
for col in numeric_cols:
    appearances[col] = _coerce_numeric(appearances[col]).fillna(0)
appearances['game_id'] = _coerce_numeric(appearances['game_id'])

# Attach each game's date to the club-level rows so they can be windowed.
club_games = club_games.merge(games[['game_id', 'date']], on='game_id', how='left')
club_games['date'] = pd.to_datetime(club_games['date'])
for col in ['own_goals', 'opponent_goals']:
    club_games[col] = _coerce_numeric(club_games[col]).fillna(0)
club_games['is_win'] = _coerce_numeric(club_games['is_win']).fillna(0).astype(int)

# Club money columns are formatted strings; the remaining ones are numeric.
clubs['club_total_market_value'] = clubs['total_market_value'].apply(parse_euro_value)
clubs['club_net_transfer_record'] = clubs['net_transfer_record'].apply(parse_euro_value)
for col in ['squad_size', 'average_age', 'foreigners_percentage', 'national_team_players']:
    clubs[col] = _coerce_numeric(clubs[col])

# Normalize the flag to a real boolean regardless of how it was parsed.
competitions['is_major_national_league'] = (
    competitions['is_major_national_league'].astype(str).str.lower().eq('true')
)

# Identifier and money columns used later for joins and comparisons.
for col in ['player_id', 'player_assist_id']:
    game_events[col] = _coerce_numeric(game_events[col])

game_lineups['player_id'] = _coerce_numeric(game_lineups['player_id'])

for col in ['transfer_fee', 'market_value_in_eur', 'from_club_id', 'to_club_id']:
    transfers[col] = _coerce_numeric(transfers[col])


In [None]:
# Assemble the valuation-driven snapshot table: one row per
# (player_id, snapshot_date) taken from the valuation history.
snapshots = valuations.rename(
    columns={'date': 'snapshot_date', 'market_value_in_eur': 'current_market_value'}
).copy()
# The grouped computations below are written back positionally (.values), so
# they rely on this (player_id, snapshot_date) ordering being kept.
snapshots = snapshots.sort_values(['player_id', 'snapshot_date']).reset_index(drop=True)

player_cols = [
    'player_id',
    'name',
    'date_of_birth',
    'sub_position',
    'position',
    'foot',
    'height_in_cm',
    'contract_expiration_date',
    'country_of_birth',
    'country_of_citizenship',
    'current_club_name',
]
# Rename the player's 'name' before merging: competitions also has a 'name'
# column, and leaving both in place would make the later merge emit
# 'name_x'/'name_y' so that 'competition_name' would never be created.
snapshots = snapshots.merge(
    players[player_cols].rename(columns={'name': 'player_name'}),
    on='player_id',
    how='left',
)
snapshots['age'] = (
    (snapshots['snapshot_date'] - snapshots['date_of_birth']).dt.days / 365.25
)
snapshots['years_to_contract_end'] = (
    (snapshots['contract_expiration_date'] - snapshots['snapshot_date']).dt.days / 365.25
)
# Calendar year of the snapshot.
snapshots['season'] = snapshots['snapshot_date'].dt.year

club_cols = [
    'club_id',
    'club_total_market_value',
    'squad_size',
    'average_age',
    'foreigners_percentage',
    'national_team_players',
]
snapshots = snapshots.merge(
    clubs[club_cols],
    left_on='current_club_id',
    right_on='club_id',
    how='left',
).drop(columns=['club_id'])

competition_cols = ['competition_id', 'name', 'type', 'country_name', 'is_major_national_league']
# Give the competition attributes their final names before the merge so they
# cannot collide with columns already present in snapshots.
competition_lookup = competitions[competition_cols].rename(
    columns={
        'name': 'competition_name',
        'type': 'competition_type',
        'country_name': 'competition_country',
    }
)
snapshots = snapshots.merge(
    competition_lookup,
    left_on='player_club_domestic_competition_id',
    right_on='competition_id',
    how='left',
)

# League tiers: 3 = one of the listed top-5 competition codes,
# 2 = other major national league, 1 = everything else.
top5_codes = {'ES1', 'GB1', 'IT1', 'L1', 'FR1'}
snapshots['is_top5_league'] = snapshots['player_club_domestic_competition_id'].isin(top5_codes).astype(int)
snapshots['league_strength'] = np.select(
    [snapshots['is_top5_league'] == 1, snapshots['is_major_national_league'] == True],
    [3, 2],
    default=1,
)

# Running peak value up to and including each snapshot.
snapshots['highest_market_value_to_date'] = snapshots.groupby('player_id')['current_market_value'].cummax()
snapshots['mv_ratio_to_peak'] = np.where(
    snapshots['highest_market_value_to_date'] > 0,
    snapshots['current_market_value'] / snapshots['highest_market_value_to_date'],
    np.nan,
)

# Number of valuations in the trailing 365 days (current one included).
rolling_counts = (
    snapshots.set_index('snapshot_date')
    .groupby('player_id')['current_market_value']
    .rolling('365D')
    .count()
    .reset_index(level=0, drop=True)
)
snapshots['num_valuations_365'] = rolling_counts.values

# Market-value trend (slope per day) over the trailing 6 and 12 months.
for window_days, col_name in [(180, 'mv_momentum_6m'), (365, 'mv_momentum_12m')]:
    snapshots[col_name] = (
        snapshots.groupby('player_id', group_keys=False)
        .apply(lambda grp: compute_linear_momentum(grp, window_days))
        .values
    )

snapshots.head()


In [None]:
# Performance features from appearances: the trailing 365 days and the
# 365 days before that, plus per-90 rates and year-over-year deltas.
appearance_aggs = {
    'minutes_total': ('minutes_played', 'sum'),
    'goals_total': ('goals', 'sum'),
    'assists_total': ('assists', 'sum'),
    'yellow_cards_total': ('yellow_cards', 'sum'),
    'red_cards_total': ('red_cards', 'sum'),
    'games_played': ('game_id', 'count'),
}
appearance_source_cols = [
    'player_id',
    'date',
    'minutes_played',
    'goals',
    'assists',
    'yellow_cards',
    'red_cards',
    'game_id',
]
appearance_metrics = appearances[appearance_source_cols].dropna(subset=['player_id'])
appearance_cum, appearance_cols = compute_cumulative_table(
    appearance_metrics, 'player_id', 'date', appearance_aggs
)

# Both windows share every argument except the prefix and the offset.
appearance_window_args = (
    snapshots,
    appearance_cum,
    'player_id',
    'snapshot_date',
    'player_id',
    'date',
    appearance_cols,
    365,
)
last365 = window_aggregate(*appearance_window_args, 'last365_')
prev365 = window_aggregate(*appearance_window_args, 'prev365_', offset_days=365)
snapshots = snapshots.join(last365).join(prev365)

# last365_<metric> -> <metric>_365 and prev365_<metric> -> prev_<metric>_365.
rename_map = {f'last365_{metric}': f'{metric}_365' for metric in appearance_aggs}
rename_map.update(
    {f'prev365_{metric}': f'prev_{metric}_365' for metric in appearance_aggs}
)
snapshots = snapshots.rename(columns=rename_map)

minutes_365 = snapshots['minutes_total_365']
games_365 = snapshots['games_played_365']
goals_365 = snapshots['goals_total_365']
assists_365 = snapshots['assists_total_365']
cards_365 = snapshots['yellow_cards_total_365'] + snapshots['red_cards_total_365']
prev_minutes_365 = snapshots['prev_minutes_total_365']

# Undefined (NaN) when the player did not appear in any game in the window.
snapshots['minutes_per_game_365'] = np.where(
    games_365 > 0, minutes_365 / games_365, np.nan
)
snapshots['goals_per_90_365'] = safe_per_90(goals_365, minutes_365)
snapshots['assists_per_90_365'] = safe_per_90(assists_365, minutes_365)
snapshots['goal_contributions_per_90_365'] = safe_per_90(
    goals_365 + assists_365, minutes_365
)
snapshots['discipline_cards_per_90_365'] = safe_per_90(cards_365, minutes_365)
snapshots['prev_goals_per_90_365'] = safe_per_90(
    snapshots['prev_goals_total_365'], prev_minutes_365
)
snapshots['prev_assists_per_90_365'] = safe_per_90(
    snapshots['prev_assists_total_365'], prev_minutes_365
)

# Change between the latest window and the one before it.
snapshots['delta_minutes_total'] = minutes_365 - prev_minutes_365
snapshots['delta_goals_per_90'] = (
    snapshots['goals_per_90_365'] - snapshots['prev_goals_per_90_365']
)
snapshots['delta_assists_per_90'] = (
    snapshots['assists_per_90_365'] - snapshots['prev_assists_per_90_365']
)


In [None]:
# Availability & starter rate features using game_lineups.
lineup_aggs = {
    'starts_total': ('is_start', 'sum'),
    'squad_inclusions': ('in_match_squad', 'sum'),
}
# Every lineup row counts as one squad inclusion; rows of type
# 'starting_lineup' additionally count as a start.
lineups = (
    game_lineups[['player_id', 'date', 'type']]
    .dropna(subset=['player_id'])
    .copy()
)
lineups['is_start'] = lineups['type'].eq('starting_lineup').astype(int)
lineups['in_match_squad'] = 1

lineup_cum, lineup_cols = compute_cumulative_table(
    lineups, 'player_id', 'date', lineup_aggs
)
lineup_window = window_aggregate(
    base_df=snapshots,
    cumulative_df=lineup_cum,
    base_entity_col='player_id',
    base_date_col='snapshot_date',
    entity_col='player_id',
    date_col='date',
    cumulative_cols=lineup_cols,
    window_days=365,
    prefix='lineup365_',
)
lineup_renames = {
    'lineup365_starts_total': 'starts_365',
    'lineup365_squad_inclusions': 'squad_inclusions_365',
}
snapshots = snapshots.join(lineup_window).rename(columns=lineup_renames)

# Share of squad inclusions that were starts; NaN without any inclusion.
in_squad_365 = snapshots['squad_inclusions_365']
snapshots['starter_rate_365'] = np.where(
    in_squad_365 > 0,
    snapshots['starts_365'] / in_squad_365,
    np.nan,
)


In [None]:
# Event-level context: goals, cards, substitutions and assists.
event_flag_cols = ['is_goal_event', 'is_card_event', 'is_sub_event', 'is_assist_event']

# Events credited to the acting player (the row's player_id).
events = game_events.dropna(subset=['player_id', 'date']).copy()
events['is_goal_event'] = (events['type'] == 'Goals').astype(int)
events['is_card_event'] = (events['type'] == 'Cards').astype(int)
events['is_sub_event'] = (events['type'] == 'Substitutions').astype(int)
events['is_assist_event'] = 0

# Assists belong to the player named in player_assist_id, not to the row's
# player_id. Comparing the two ids on the same row (as was done before) is
# true only if a player assists their own event, so the feature was
# effectively always zero. Re-key those rows by the assisting player instead.
# NOTE(review): every event with a non-null player_assist_id is counted as an
# assist; confirm the column is only populated for goal events.
assist_events = (
    game_events.dropna(subset=['player_assist_id', 'date'])[['player_assist_id', 'date']]
    .rename(columns={'player_assist_id': 'player_id'})
    .assign(is_goal_event=0, is_card_event=0, is_sub_event=0, is_assist_event=1)
)
events = pd.concat(
    [
        events[['player_id', 'date'] + event_flag_cols],
        assist_events[['player_id', 'date'] + event_flag_cols],
    ],
    ignore_index=True,
)

event_aggs = {
    'goal_events': ('is_goal_event', 'sum'),
    'card_events': ('is_card_event', 'sum'),
    'substitution_events': ('is_sub_event', 'sum'),
    'assist_events': ('is_assist_event', 'sum'),
}
event_cum, event_cols = compute_cumulative_table(
    events,
    'player_id',
    'date',
    event_aggs,
)
event_window = window_aggregate(
    snapshots,
    event_cum,
    'player_id',
    'snapshot_date',
    'player_id',
    'date',
    event_cols,
    365,
    'events365_',
)
snapshots = snapshots.join(event_window)
snapshots = snapshots.rename(
    columns={
        'events365_goal_events': 'goal_events_365',
        'events365_card_events': 'card_events_365',
        'events365_substitution_events': 'substitution_events_365',
        'events365_assist_events': 'assist_events_365',
    }
)


In [None]:
# Club form context (wins, goal difference, etc.) over the trailing 365 days,
# looked up through each snapshot's current_club_id.
club_aggs = {
    'club_games_total': ('game_id', 'count'),
    'club_wins_total': ('is_win', 'sum'),
    'club_goals_for_total': ('own_goals', 'sum'),
    'club_goals_against_total': ('opponent_goals', 'sum'),
}
club_form_cols = ['club_id', 'date', 'game_id', 'is_win', 'own_goals', 'opponent_goals']
club_games_subset = club_games[club_form_cols].dropna(subset=['club_id', 'date']).copy()

club_cum, club_cols = compute_cumulative_table(
    club_games_subset, 'club_id', 'date', club_aggs
)
club_window = window_aggregate(
    base_df=snapshots,
    cumulative_df=club_cum,
    base_entity_col='current_club_id',
    base_date_col='snapshot_date',
    entity_col='club_id',
    date_col='date',
    cumulative_cols=club_cols,
    window_days=365,
    prefix='club365_',
)
club_renames = {
    'club365_club_games_total': 'club_games_365',
    'club365_club_wins_total': 'club_wins_365',
    'club365_club_goals_for_total': 'club_goals_for_365',
    'club365_club_goals_against_total': 'club_goals_against_365',
}
snapshots = snapshots.join(club_window).rename(columns=club_renames)

# Win rate is undefined (NaN) when the club has no games in the window.
club_games_365 = snapshots['club_games_365']
snapshots['club_win_rate_365'] = np.where(
    club_games_365 > 0,
    snapshots['club_wins_365'] / club_games_365,
    np.nan,
)
snapshots['club_goal_diff_365'] = (
    snapshots['club_goals_for_365'] - snapshots['club_goals_against_365']
)


In [None]:
# Transfer history signals (recent transfer, fee delta, etc.): each snapshot
# is matched with the player's latest transfer on or before the snapshot date.
transfer_context = transfers.dropna(subset=['player_id', 'transfer_date']).copy()
# Per-player chronological order is required by the searchsorted lookup below.
transfer_context = transfer_context.sort_values(['player_id', 'transfer_date'])
transfer_context = transfer_context.merge(
    clubs[['club_id', 'club_total_market_value']].rename(
        columns={'club_id': 'from_club_id', 'club_total_market_value': 'from_club_value'}
    ),
    on='from_club_id',
    how='left',
)
transfer_context = transfer_context.merge(
    clubs[['club_id', 'club_total_market_value']].rename(
        columns={'club_id': 'to_club_id', 'club_total_market_value': 'to_club_value'}
    ),
    on='to_club_id',
    how='left',
)
transfer_value_cols = [
    'transfer_fee',
    'market_value_in_eur',
    'from_club_value',
    'to_club_value',
]
transfer_subset = transfer_context[['player_id', 'transfer_date'] + transfer_value_cols]

# Split the transfers by player once, instead of boolean-filtering the whole
# table again for every player in the loop.
transfers_by_player = {
    player_id: player_rows
    for player_id, player_rows in transfer_subset.groupby('player_id', sort=False)
}

# reset_index() keeps the original row labels in an 'index' column so the
# result can be joined back onto snapshots.
snapshots_sorted = snapshots.sort_values(['player_id', 'snapshot_date']).reset_index()
transfer_chunks = []
for player_id, snap_group in snapshots_sorted.groupby('player_id'):
    chunk = snap_group[['index', 'snapshot_date']].copy()
    player_transfers = transfers_by_player.get(player_id)
    if player_transfers is None:
        chunk['transfer_date'] = pd.NaT
        for value_col in transfer_value_cols:
            chunk[value_col] = np.nan
    else:
        trans_dates = player_transfers['transfer_date'].to_numpy()
        snap_dates = snap_group['snapshot_date'].to_numpy()
        # Index of the last transfer dated on or before each snapshot;
        # -1 means the player had not transferred yet.
        idx = np.searchsorted(trans_dates, snap_dates, side='right') - 1
        mask = idx >= 0
        # Clamp to a valid position for indexing; masked entries are
        # overwritten with NaT/NaN right below.
        idx_safe = np.where(mask, idx, 0)
        chunk['transfer_date'] = pd.to_datetime(
            np.where(mask, trans_dates[idx_safe], np.datetime64('NaT'))
        )
        for value_col in transfer_value_cols:
            values = player_transfers[value_col].to_numpy()
            chunk[value_col] = np.where(mask, values[idx_safe], np.nan)
    transfer_chunks.append(chunk)

transfer_features = pd.concat(transfer_chunks, ignore_index=True)
transfer_features['days_since_last_transfer'] = (
    transfer_features['snapshot_date'] - transfer_features['transfer_date']
).dt.days
transfer_features['has_recent_transfer'] = transfer_features['days_since_last_transfer'].between(0, 365, inclusive='both')
# "Bigger" compares the total market values of the destination and origin clubs.
transfer_features['moved_to_bigger_club'] = np.where(
    transfer_features['has_recent_transfer']
    & (transfer_features['to_club_value'] > transfer_features['from_club_value']),
    1,
    0,
)
transfer_features['transfer_fee_vs_mv'] = (
    transfer_features['transfer_fee'] - transfer_features['market_value_in_eur']
)
transfer_df = transfer_features.set_index('index')[[
    'has_recent_transfer',
    'moved_to_bigger_club',
    'transfer_fee_vs_mv',
    'days_since_last_transfer',
]]
snapshots = snapshots.join(transfer_df, how='left')
# Snapshots that received no transfer row count as "no recent transfer".
snapshots['has_recent_transfer'] = snapshots['has_recent_transfer'].fillna(0).astype(int)
snapshots['moved_to_bigger_club'] = snapshots['moved_to_bigger_club'].fillna(0).astype(int)


In [None]:
# Final feature table export. The columns are listed by theme; their
# concatenation defines the column order of the exported CSV.
identity_features = [
    'player_id',
    'player_name',
    'snapshot_date',
    'season',
    'age',
]
profile_features = [
    'position',
    'sub_position',
    'foot',
    'height_in_cm',
    'country_of_birth',
    'country_of_citizenship',
]
club_and_league_features = [
    'current_club_id',
    'current_club_name',
    'player_club_domestic_competition_id',
    'competition_name',
    'competition_country',
    'competition_type',
    'league_strength',
    'is_top5_league',
    'club_total_market_value',
    'squad_size',
    'average_age',
    'foreigners_percentage',
    'national_team_players',
]
valuation_features = [
    'current_market_value',
    'highest_market_value_to_date',
    'mv_ratio_to_peak',
    'mv_momentum_6m',
    'mv_momentum_12m',
    'num_valuations_365',
    'years_to_contract_end',
]
performance_features = [
    'minutes_total_365',
    'games_played_365',
    'goals_total_365',
    'assists_total_365',
    'yellow_cards_total_365',
    'red_cards_total_365',
    'minutes_per_game_365',
    'goals_per_90_365',
    'assists_per_90_365',
    'goal_contributions_per_90_365',
    'discipline_cards_per_90_365',
]
previous_year_features = [
    'prev_minutes_total_365',
    'prev_goals_total_365',
    'prev_assists_total_365',
    'prev_goals_per_90_365',
    'prev_assists_per_90_365',
    'delta_minutes_total',
    'delta_goals_per_90',
    'delta_assists_per_90',
]
lineup_features = [
    'starts_365',
    'squad_inclusions_365',
    'starter_rate_365',
]
event_features = [
    'goal_events_365',
    'card_events_365',
    'substitution_events_365',
    'assist_events_365',
]
club_form_features = [
    'club_games_365',
    'club_wins_365',
    'club_goals_for_365',
    'club_goals_against_365',
    'club_win_rate_365',
    'club_goal_diff_365',
]
transfer_history_features = [
    'has_recent_transfer',
    'moved_to_bigger_club',
    'transfer_fee_vs_mv',
    'days_since_last_transfer',
]
feature_columns = [
    *identity_features,
    *profile_features,
    *club_and_league_features,
    *valuation_features,
    *performance_features,
    *previous_year_features,
    *lineup_features,
    *event_features,
    *club_form_features,
    *transfer_history_features,
]

# Rows are written in chronological order, ties broken by player id.
final_df = snapshots[feature_columns]
final_df = final_df.sort_values(['snapshot_date', 'player_id']).reset_index(drop=True)
final_df.to_csv(OUTPUT_PATH, index=False)
print(f'Saved {len(final_df):,} rows with {final_df.shape[1]} columns to {OUTPUT_PATH}')
final_df.head()
