My Work:
1. I replace the GRU model with a simple Transformer model.
2. I use a new loss function with a speed-limiting (smoothness) penalty.
3. I apply TTA to the best-seed model and use multi-seed SWA.
4. I add team-based features to improve model performance.

STEP 2 — Better Features Pipeline (cuDF-ready)
- Correct kinematics & angles
- Unify play direction (and invert at submission time)
- Fast, modular feature engineering (works with pandas or cuDF pandas-API)
- Same GRU architecture + GroupKFold CV
- Safe targets (dx, dy) built in the unified coordinate frame

In [None]:
# -------------------------------
# Global imports + cuDF accelerator
# -------------------------------
import os
USE_CUDF = False
try:
    # zero/low-code GPU acceleration for DataFrame ops
    os.environ["CUDF_PANDAS_BACKEND"] = "cudf"
    import pandas as pd
    import numpy as np
    import cupy as cp  # optional (not strictly required below)
    USE_CUDF = True
    print("using cuda_backend pandas for faster parallel data processing")
except Exception:
    print("cuda df not used")
    import pandas as pd
    import numpy as np

import torch
import torch.nn as nn
from pathlib import Path
from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GroupKFold
import warnings
warnings.filterwarnings("ignore")

# -------------------------------
# Constants & helpers
# -------------------------------
YARDS_TO_METERS = 0.9144  # length of one yard expressed in metres
FPS = 10.0  # multiplier that turns per-frame differences into per-second rates (presumably the tracking frame rate -- confirm)
# Extents used to mirror x / y coordinates when unifying play direction
# (presumably yards, including end zones -- TODO confirm against the data dictionary).
FIELD_LENGTH, FIELD_WIDTH = 120.0, 53.3

def set_seed(seed=42):
    """Seed every random number generator used in this file.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), and stores the seed in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)
print("environment set up!")
def wrap_angle_deg(s):
    """Wrap an angle in degrees into the half-open interval [-180, 180).

    Works on anything supporting ``+``, ``%`` and ``-`` with floats
    (scalars, NumPy arrays, pandas Series). Note that +180 maps to -180.
    """
    shifted = (s + 180.0) % 360.0
    return shifted - 180.0

def unify_left_direction(df: pd.DataFrame) -> pd.DataFrame:
    """Mirror rows whose ``play_direction`` is 'right' into the 'left' frame.

    For those rows, ``x``/``ball_land_x`` become ``FIELD_LENGTH - value``,
    ``y``/``ball_land_y`` become ``FIELD_WIDTH - value`` and the angle columns
    ``dir``/``o`` are rotated by 180 degrees (modulo 360). Columns that are
    absent are skipped.

    Args:
        df: Frame that may contain a ``play_direction`` column.

    Returns:
        A mirrored copy of ``df``; the input object itself (unchanged) when
        it has no ``play_direction`` column.
    """
    if 'play_direction' not in df.columns:
        return df

    mirrored = df.copy()
    rightward = mirrored['play_direction'].eq('right')

    def _reflect(col, extent):
        # Reflect one coordinate column about the middle of the field.
        if col in mirrored.columns:
            mirrored.loc[rightward, col] = extent - mirrored.loc[rightward, col]

    # Player positions.
    _reflect('x', FIELD_LENGTH)
    _reflect('y', FIELD_WIDTH)

    # Angles (degrees): mirroring both axes is a half-turn.
    for angle_col in ('dir', 'o'):
        if angle_col in mirrored.columns:
            rotated = mirrored.loc[rightward, angle_col] + 180.0
            mirrored.loc[rightward, angle_col] = rotated % 360.0

    # Ball landing point.
    _reflect('ball_land_x', FIELD_LENGTH)
    _reflect('ball_land_y', FIELD_WIDTH)
    return mirrored

def invert_to_original_direction(x_u, y_u, play_dir_right: bool):
    """Map unified ('left') coordinates back to the play's original frame.

    Args:
        x_u: x coordinate in the unified frame.
        y_u: y coordinate in the unified frame.
        play_dir_right: True when the original play direction was 'right',
            in which case both coordinates are mirrored back.

    Returns:
        Tuple ``(x, y)`` of Python floats in the original frame.
    """
    if play_dir_right:
        x_u = FIELD_LENGTH - x_u
        y_u = FIELD_WIDTH - y_u
    return float(x_u), float(y_u)

# -------------------------------
# Config
# -------------------------------
class Config:
    """Central experiment settings: paths, run constants and compute device."""
    # Input data directory and local output directory. The output directory
    # is created as a side effect when this class body is executed.
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")
    OUTPUT_DIR = Path("./outputs"); OUTPUT_DIR.mkdir(exist_ok=True)

    # Presumably cross-validation / optimisation settings consumed by the
    # training code (not visible in this section) -- confirm against callers.
    SEED = 42
    N_FOLDS = 5
    BATCH_SIZE = 256
    EPOCHS = 200
    PATIENCE = 30
    LEARNING_RATE = 1e-3

    # Presumably input window length, model width and number of predicted
    # future steps -- confirm against callers.
    WINDOW_SIZE = 10
    HIDDEN_DIM = 128
    MAX_FUTURE_HORIZON = 94  # do not change this!!!

    # Use the GPU when PyTorch can see one, otherwise fall back to CPU.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

set_seed(Config.SEED)

In [None]:
# -------------------------------
# Sequence builder (unified frame + safe targets)
# -------------------------------
def build_play_direction_map(df_in: pd.DataFrame) -> pd.Series:
    """Build a lookup of play direction per play.

    Args:
        df_in: Frame containing ``game_id``, ``play_id`` and
            ``play_direction`` columns.

    Returns:
        Series of ``play_direction`` values indexed by a
        ``(game_id, play_id)`` MultiIndex, with duplicate rows removed.
    """
    key_cols = ['game_id', 'play_id']
    unique_rows = df_in[key_cols + ['play_direction']].drop_duplicates()
    indexed = unique_rows.set_index(key_cols)
    return indexed['play_direction']


def apply_direction_to_df(df: pd.DataFrame, dir_map: pd.Series) -> pd.DataFrame:
    """Ensure ``df`` carries ``play_direction`` and mirror it to the 'left' frame.

    Args:
        df: Frame keyed by ``game_id`` / ``play_id``.
        dir_map: Series indexed by ``(game_id, play_id)`` as produced by
            ``build_play_direction_map``; only used when ``df`` lacks a
            ``play_direction`` column.

    Returns:
        The result of ``unify_left_direction`` on ``df`` (after a left merge
        of the direction lookup when the column was missing).
    """
    if 'play_direction' in df.columns:
        return unify_left_direction(df)

    # reset_index yields the columns game_id, play_id, play_direction.
    lookup = dir_map.reset_index()
    with_direction = df.merge(
        lookup, on=['game_id', 'play_id'], how='left', validate='many_to_one'
    )
    return unify_left_direction(with_direction)

def prepare_sequences_with_advanced_features(
        input_df, output_df=None, test_template=None,
        is_training=True, window_size=10, feature_groups=None):
    """Build fixed-length feature windows (and training targets) per player.

    The input frames are mirrored into the unified 'left' frame, passed
    through ``FeatureEngineer`` and then cut into the last ``window_size``
    rows per ``(game_id, play_id, nfl_id)``. In training mode the targets are
    the displacements of the (unified) output positions relative to the last
    input position.

    A sequence, its targets and its metadata are always appended together, so
    the returned lists stay index-aligned.

    Args:
        input_df: Tracking rows; must contain ``game_id``, ``play_id``,
            ``nfl_id``, ``play_direction`` and the columns needed by
            ``FeatureEngineer``.
        output_df: Future positions (``x``, ``y``, ``frame_id``) per player;
            required when ``is_training`` is True.
        test_template: Rows identifying the players to predict; required when
            ``is_training`` is False.
        is_training: Selects training (targets built) or inference mode.
        window_size: Number of trailing input rows per sequence. In training
            mode shorter groups are skipped; in inference mode they are
            front-padded.
        feature_groups: Feature group names forwarded to ``FeatureEngineer``;
            a default list is used when None.

    Returns:
        Training: ``(sequences, targets_dx, targets_dy, targets_fids,
        seq_meta, feature_cols, dir_map)``.
        Inference: ``(sequences, seq_meta, feature_cols, dir_map)``.

    Raises:
        ValueError: If the frame required for the selected mode is missing.
        AssertionError: If ``play_direction`` could not be resolved for some
            target rows.
    """
    if is_training and output_df is None:
        raise ValueError("output_df is required when is_training=True")
    if not is_training and test_template is None:
        raise ValueError("test_template is required when is_training=False")

    print(f"\n{'='*80}")
    print(f"PREPARING SEQUENCES WITH ADVANCED FEATURES (UNIFIED FRAME)")
    print(f"{'='*80}")
    print(f"Window size: {window_size}")

    if feature_groups is None:
        feature_groups = [
            'distance_rate','target_alignment','multi_window_rolling','extended_lags',
            'velocity_changes','field_position','role_specific','time_features','jerk_features',
            'player_interaction_distance',#"curvature_land_features"
        ]

    # Direction map and unified ('left') input frame.
    dir_map = build_play_direction_map(input_df)
    input_df_u = unify_left_direction(input_df)

    key_cols = ['game_id', 'play_id', 'nfl_id']
    if is_training:
        # Outputs are mirrored with the same direction map as the inputs.
        target_rows = apply_direction_to_df(output_df, dir_map)
        target_groups = target_rows[key_cols].drop_duplicates()
    else:
        # Ensure test_template has play_direction via a safe merge.
        if 'play_direction' not in test_template.columns:
            dir_df = dir_map.reset_index()
            test_template = test_template.merge(
                dir_df, on=['game_id','play_id'], how='left', validate='many_to_one')
        target_rows = test_template
        target_groups = target_rows[key_cols + ['play_direction']].drop_duplicates()

    # After merging play_direction into outputs / test_template:
    assert target_rows[['game_id','play_id','play_direction']].isna().sum().sum() == 0, \
        "play_direction merge failed; check (game_id, play_id) coverage"
    print("play_direction merge OK:", target_rows['play_direction'].value_counts(dropna=False).to_dict())

    # --- FE ---
    fe = FeatureEngineer(feature_groups)
    processed_df, feature_cols = fe.transform(input_df_u)

    # --- Build sequences ---
    print("\nStep 3/3: Creating sequences...")
    processed_df = processed_df.set_index(key_cols).sort_index()
    grouped = processed_df.groupby(level=key_cols)

    # Group the targets once instead of scanning target_rows with a boolean
    # mask for every player.
    targets_by_player = (
        target_rows.groupby(key_cols, sort=False) if is_training else None
    )

    # Column positions of the last x, y in the unified frame.
    idx_x = feature_cols.index('x')
    idx_y = feature_cols.index('y')

    sequences, targets_dx, targets_dy, targets_fids, seq_meta = [], [], [], [], []

    progress = tqdm(
        target_groups.itertuples(index=False),
        total=len(target_groups),
        desc="Creating sequences",
    )

    for row in progress:
        gid, pid, nid = row[0], row[1], row[2]
        play_dir = row[3] if (not is_training and len(row) >= 4) else None
        key = (gid, pid, nid)

        try:
            group_df = grouped.get_group(key)
        except KeyError:
            continue

        # Resolve the targets before anything is appended so that a player
        # without targets can never leave the output lists misaligned.
        out_grp = None
        if is_training:
            try:
                out_grp = targets_by_player.get_group(key).sort_values('frame_id')
            except KeyError:
                continue
            if len(out_grp) == 0:
                continue

        input_window = group_df.tail(window_size)
        if len(input_window) < window_size:
            if is_training:
                continue
            # Inference: front-pad short histories with NaN rows.
            pad_len = window_size - len(input_window)
            pad_df = pd.DataFrame(np.nan, index=range(pad_len), columns=input_window.columns)
            input_window = pd.concat([pad_df, input_window], ignore_index=True)

        # Simple imputation with the per-player column means.
        input_window = input_window.fillna(group_df.mean(numeric_only=True))
        seq = input_window[feature_cols].values

        if np.isnan(seq).any():
            if is_training:
                continue
            seq = np.nan_to_num(seq, nan=0.0)

        sequences.append(seq)

        # Training targets: displacement from the last unified input x, y.
        if is_training:
            last_x = seq[-1, idx_x]
            last_y = seq[-1, idx_y]
            dx = out_grp['x'].values - last_x
            dy = out_grp['y'].values - last_y

            targets_dx.append(dx.astype(np.float32))
            targets_dy.append(dy.astype(np.float32))
            targets_fids.append(out_grp['frame_id'].values.astype(np.int32))

        seq_meta.append({
            'game_id': gid,
            'play_id': pid,
            'nfl_id': nid,
            'frame_id': int(input_window.iloc[-1]['frame_id']),
            'play_direction': (None if is_training else play_dir),
        })

    print(f"Created {len(sequences)} sequences with {len(feature_cols)} features each")

    if is_training:
        return sequences, targets_dx, targets_dy, targets_fids, seq_meta, feature_cols, dir_map
    return sequences, seq_meta, feature_cols, dir_map

In [None]:
# -------------------------------
# Feature Engineering
# -------------------------------
class FeatureEngineer:
    """
    Modular, ablation-friendly feature builder (pandas or cuDF pandas-API).
    """
    def __init__(self, feature_groups_to_create):
        self.gcols = ['game_id', 'play_id', 'nfl_id']
        self.active_groups = feature_groups_to_create
        self.feature_creators = {
            'distance_rate': self._create_distance_rate_features,
            'target_alignment': self._create_target_alignment_features,
            'multi_window_rolling': self._create_multi_window_rolling_features,
            'extended_lags': self._create_extended_lag_features,
            'velocity_changes': self._create_velocity_change_features,
            'field_position': self._create_field_position_features,
            'role_specific': self._create_role_specific_features,
            'time_features': self._create_time_features,
            'jerk_features': self._create_jerk_features,
            'curvature_land_features': self._create_curvature_land_features,
            'player_interaction_distance': self._create_player_interaction_distance_features,
        }
        self.created_feature_cols = []

    def _height_to_feet(self, height_str):
        try:
            ft, inches = map(int, str(height_str).split('-'))
            return ft + inches / 12
        except Exception:
            return 6.0

    def _create_basic_features(self, df):
        """Add the always-on base features to a copy of ``df``.

        Adds player height in feet, velocity/acceleration components, binary
        side/role flags, momentum and kinetic energy and, when the ball
        landing columns are present, distance/direction/closing speed towards
        the landing point. Names of the base columns that exist afterwards
        are appended to ``self.created_feature_cols``.

        Args:
            df: Frame with at least ``player_height``, ``dir``, ``s``, ``a``,
                ``player_side``, ``player_role``, ``player_weight``, ``x``
                and ``y``.

        Returns:
            The augmented copy of ``df``.
        """
        print("Step 1/3: Adding basic features...")
        out = df.copy()
        out['player_height_feet'] = out['player_height'].apply(self._height_to_feet)

        # Components use cos for x and sin for y, i.e. `dir` is treated as an
        # angle measured counter-clockwise from +x.
        # NOTE(review): confirm this matches the data's `dir` convention.
        heading = np.deg2rad(out['dir'].fillna(0.0).astype('float32'))
        cos_h, sin_h = np.cos(heading), np.sin(heading)
        for prefix, magnitude in (('velocity', 's'), ('acceleration', 'a')):
            out[f'{prefix}_x'] = out[magnitude] * cos_h
            out[f'{prefix}_y'] = out[magnitude] * sin_h

        # Binary side / role indicators.
        role_flags = (
            ('is_offense', 'player_side', 'Offense'),
            ('is_defense', 'player_side', 'Defense'),
            ('is_receiver', 'player_role', 'Targeted Receiver'),
            ('is_coverage', 'player_role', 'Defensive Coverage'),
            ('is_passer', 'player_role', 'Passer'),
        )
        for flag, source, label in role_flags:
            out[flag] = (out[source] == label).astype(np.int8)

        # Energetics: weight is divided by 2.20462 (lb -> kg, assuming the
        # weight column is in pounds) and speeds are scaled by YARDS_TO_METERS.
        mass_kg = out['player_weight'].fillna(200.0) / 2.20462
        speed_scaled = out['s'] * YARDS_TO_METERS
        out['momentum_x'] = mass_kg * out['velocity_x'] * YARDS_TO_METERS
        out['momentum_y'] = mass_kg * out['velocity_y'] * YARDS_TO_METERS
        out['kinetic_energy'] = 0.5 * mass_kg * (speed_scaled ** 2)

        # Geometry relative to the ball landing point.
        if {'ball_land_x', 'ball_land_y'}.issubset(out.columns):
            to_ball_x = out['ball_land_x'] - out['x']
            to_ball_y = out['ball_land_y'] - out['y']
            separation = np.hypot(to_ball_x, to_ball_y)
            out['distance_to_ball'] = separation
            # Small epsilon keeps the unit vector finite at zero distance.
            inv_separation = 1.0 / (separation + 1e-6)
            out['ball_direction_x'] = to_ball_x * inv_separation
            out['ball_direction_y'] = to_ball_y * inv_separation
            out['closing_speed'] = (
                out['velocity_x'] * out['ball_direction_x'] +
                out['velocity_y'] * out['ball_direction_y']
            )

        base_candidates = [
            'x','y','s','a','o','dir','frame_id','ball_land_x','ball_land_y',
            'player_height_feet','player_weight',
            'velocity_x','velocity_y','acceleration_x','acceleration_y',
            'momentum_x','momentum_y','kinetic_energy',
            'is_offense','is_defense','is_receiver','is_coverage','is_passer',
            'distance_to_ball','ball_direction_x','ball_direction_y','closing_speed'
        ]
        present = [name for name in base_candidates if name in out.columns]
        self.created_feature_cols.extend(present)
        return out

    # ---- feature groups ----
    def _create_distance_rate_features(self, df):
        new_cols = []
        if 'distance_to_ball' in df.columns:
            d = df.groupby(self.gcols)['distance_to_ball'].diff()
            df['d2ball_dt']  = d.fillna(0.0) * FPS
            df['d2ball_ddt'] = df.groupby(self.gcols)['d2ball_dt'].diff().fillna(0.0) * FPS
            df['time_to_intercept'] = (df['distance_to_ball'] /
                                       (df['d2ball_dt'].abs() + 1e-3)).clip(0, 10)
            new_cols = ['d2ball_dt','d2ball_ddt','time_to_intercept']
        return df, new_cols

    def _create_target_alignment_features(self, df):
        new_cols = []
        if {'ball_direction_x','ball_direction_y','velocity_x','velocity_y'}.issubset(df.columns):
            df['velocity_alignment'] = df['velocity_x']*df['ball_direction_x'] + df['velocity_y']*df['ball_direction_y']
            df['velocity_perpendicular'] = df['velocity_x']*(-df['ball_direction_y']) + df['velocity_y']*df['ball_direction_x']
            new_cols.extend(['velocity_alignment','velocity_perpendicular'])
            if {'acceleration_x','acceleration_y'}.issubset(df.columns):
                df['accel_alignment'] = df['acceleration_x']*df['ball_direction_x'] + df['acceleration_y']*df['ball_direction_y']
                new_cols.append('accel_alignment')
        return df, new_cols

    def _create_multi_window_rolling_features(self, df):
        # keep it simple & compatible (works with cuDF pandas-API); vectorized rolling per group
        new_cols = []
        for window in (3, 5, 10):
            for col in ('velocity_x','velocity_y','s','a'):
                if col in df.columns:
                    r_mean = df.groupby(self.gcols)[col].rolling(window, min_periods=1).mean()
                    r_std  = df.groupby(self.gcols)[col].rolling(window, min_periods=1).std()
                    # align indices
                    r_mean = r_mean.reset_index(level=list(range(len(self.gcols))), drop=True)
                    r_std  = r_std.reset_index(level=list(range(len(self.gcols))), drop=True)
                    df[f'{col}_roll{window}'] = r_mean
                    df[f'{col}_std{window}']  = r_std.fillna(0.0)
                    new_cols.extend([f'{col}_roll{window}', f'{col}_std{window}'])
        return df, new_cols

    def _create_extended_lag_features(self, df):
        new_cols = []
        for lag in (1,2,3,4,5):
            for col in ('x','y','velocity_x','velocity_y'):
                if col in df.columns:
                    g = df.groupby(self.gcols)[col]
                    lagv = g.shift(lag)
                    # safe fill for first frames (no "future" leakage)
                    df[f'{col}_lag{lag}'] = lagv.fillna(g.transform('first'))
                    new_cols.append(f'{col}_lag{lag}')
        return df, new_cols

    def _create_velocity_change_features(self, df):
        new_cols = []
        if 'velocity_x' in df.columns:
            df['velocity_x_change'] = df.groupby(self.gcols)['velocity_x'].diff().fillna(0.0)
            df['velocity_y_change'] = df.groupby(self.gcols)['velocity_y'].diff().fillna(0.0)
            df['speed_change']      = df.groupby(self.gcols)['s'].diff().fillna(0.0)
            d = df.groupby(self.gcols)['dir'].diff().fillna(0.0)
            df['direction_change']  = wrap_angle_deg(d)
            new_cols = ['velocity_x_change','velocity_y_change','speed_change','direction_change']
        return df, new_cols

    def _create_field_position_features(self, df):
        """Add distances to the nearest sideline and field end.

        The helper columns ``dist_from_left`` / ``dist_from_right`` are also
        written to ``df`` but are not reported as features. Mutates ``df``
        in place.

        Returns:
            ``(df, ['dist_from_sideline', 'dist_from_endzone'])``.
        """
        lateral = df['y']
        df['dist_from_left'] = lateral
        df['dist_from_right'] = FIELD_WIDTH - lateral
        df['dist_from_sideline'] = np.minimum(df['dist_from_left'], df['dist_from_right'])

        downfield = df['x']
        df['dist_from_endzone'] = np.minimum(downfield, FIELD_LENGTH - downfield)

        created = ['dist_from_sideline', 'dist_from_endzone']
        return df, created

    def _create_role_specific_features(self, df):
        new_cols = []
        if {'is_receiver','velocity_alignment'}.issubset(df.columns):
            df['receiver_optimality'] = df['is_receiver'] * df['velocity_alignment']
            df['receiver_deviation']  = df['is_receiver'] * np.abs(df.get('velocity_perpendicular', 0.0))
            new_cols.extend(['receiver_optimality','receiver_deviation'])
        if {'is_coverage','closing_speed'}.issubset(df.columns):
            df['defender_closing_speed'] = df['is_coverage'] * df['closing_speed']
            new_cols.append('defender_closing_speed')
        return df, new_cols

    def _create_time_features(self, df):
        df['frames_elapsed']  = df.groupby(self.gcols).cumcount()
        df['normalized_time'] = df.groupby(self.gcols)['frames_elapsed'].transform(
            lambda x: x / (x.max() + 1e-9)
        )
        return df, ['frames_elapsed','normalized_time']

    def _create_jerk_features(self, df):
        new_cols = []
        if 'a' in df.columns:
            df['jerk'] = df.groupby(self.gcols)['a'].diff().fillna(0.0) * FPS
            new_cols.append('jerk')
        if {'acceleration_x','acceleration_y'}.issubset(df.columns):
            df['jerk_x'] = df.groupby(self.gcols)['acceleration_x'].diff().fillna(0.0) * FPS
            df['jerk_y'] = df.groupby(self.gcols)['acceleration_y'].diff().fillna(0.0) * FPS
            new_cols.extend(['jerk_x','jerk_y'])
        return df, new_cols
    def _create_curvature_land_features(self, df):
        """
        -落点侧向偏差（符号）：landing_point 相对“当前运动方向”的左右偏离
          lateral = cross(u_dir, vector_to_land)（>0 表示落点在运动方向左侧）
        -bearing_to_land_signed: 运动方向 vs 落点方位角
        -速度归一化曲率： wrap(Δdir)/ (s*Δt) ，窗口化(3/5) 的均值/绝对值
        """
        import numpy as np
        # 侧向偏差 & bearing_to_land
        if {'ball_land_x','ball_land_y'}.issubset(df.columns):
            dx = df['ball_land_x'] - df['x']
            dy = df['ball_land_y'] - df['y']
            bearing = np.arctan2(dy, dx)
            a_dir = np.deg2rad(df['dir'].fillna(0.0).values)
            # 有符号方位差
            df['bearing_to_land_signed'] = np.rad2deg(np.arctan2(np.sin(bearing - a_dir), np.cos(bearing - a_dir)))
            # 侧向偏差：d × u (2D cross, z 分量)
            ux, uy = np.cos(a_dir), np.sin(a_dir)
            df['land_lateral_offset'] = dy*ux - dx*uy  # >0 落点在左侧
    
        # 曲率（按序列）
        ddir = df.groupby(self.gcols)['dir'].diff().fillna(0.0)
        ddir = ((ddir + 180.0) % 360.0) - 180.0
        curvature = np.deg2rad(ddir).astype('float32') / (df['s'].replace(0, np.nan).astype('float32') * 0.1 + 1e-6)
        df['curvature_signed'] = curvature.fillna(0.0)
        df['curvature_abs'] = df['curvature_signed'].abs()
    
        # 窗口均值（3/5）
        for w in (3,5):
            r = df.groupby(self.gcols)['curvature_signed'].rolling(w, min_periods=1).mean().reset_index(level=[0,1,2], drop=True)
            df[f'curv_signed_roll{w}'] = r
            r2 = df.groupby(self.gcols)['curvature_abs'].rolling(w, min_periods=1).mean().reset_index(level=[0,1,2], drop=True)
            df[f'curv_abs_roll{w}'] = r2
    
        new_cols = ['bearing_to_land_signed','land_lateral_offset',
                    'curvature_signed','curvature_abs','curv_signed_roll3','curv_abs_roll3',
                    'curv_signed_roll5','curv_abs_roll5']
        return df, [c for c in new_cols if c in df.columns]
        
    def _create_player_interaction_distance_features(self, df):
        new_cols = []
        
        if not {'x', 'y', 'velocity_x', 'velocity_y', 'is_offense'}.issubset(df.columns):
            return df, new_cols
        
        # 按play分组，确保时间连续性
        all_frames = []
        
        for (gid, pid), play_df in df.groupby(['game_id', 'play_id']):
            play_df = play_df.sort_values('frame_id').copy()
            
            # 每帧的交互特征
            frame_features = []
            
            for fid, frame_df in play_df.groupby('frame_id', sort=True):
                frame_df = frame_df.copy()
                positions = frame_df[['x', 'y']].values
                velocities = frame_df[['velocity_x', 'velocity_y']].values
                is_offense = frame_df['is_offense'].values
                nfl_ids = frame_df['nfl_id'].values  # 用于跟踪球员
                
                n_players = len(frame_df)
                
                # === 原有特征 ===
                min_opponent_dist = np.full(n_players, 999.0, dtype=np.float32)
                min_teammate_dist = np.full(n_players, 999.0, dtype=np.float32)
                relative_velocity_to_opponent = np.zeros(n_players, dtype=np.float32)
                opponent_density = np.zeros(n_players, dtype=np.int8)
                teammate_density = np.zeros(n_players, dtype=np.int8)
                
                # === 新增特征 ===
                time_to_closest_opponent = np.full(n_players, 999.0, dtype=np.float32)  # TTC
                nearest_opponent_acceleration = np.zeros(n_players, dtype=np.float32)
                directional_threat_score = np.zeros(n_players, dtype=np.float32)  # 方向威胁
                voronoi_area = np.full(n_players, 100.0, dtype=np.float32)  # 控制区域
                
                # 计算交互特征
                for i in range(n_players):
                    pos_i = positions[i]
                    vel_i = velocities[i]
                    side_i = is_offense[i]
                    speed_i = np.linalg.norm(vel_i)
                    
                    opponent_dists = []
                    teammate_dists = []
                    closest_opponent_idx = -1
                    min_opp_dist = 999.0
                    
                    for j in range(n_players):
                        if i == j:
                            continue
                        
                        pos_j = positions[j]
                        vel_j = velocities[j]
                        side_j = is_offense[j]
                        
                        dist = np.sqrt((pos_i[0] - pos_j[0])**2 + (pos_i[1] - pos_j[1])**2)
                        is_opponent = (side_i != side_j)
                        
                        if is_opponent:
                            opponent_dists.append(dist)
                            
                            # 跟踪最近对手
                            if dist < min_opp_dist:
                                min_opp_dist = dist
                                closest_opponent_idx = j
                            
                            # === 相对速度（改进版） ===
                            if dist < 20.0:
                                rel_vel = vel_i - vel_j
                                pos_diff = pos_j - pos_i
                                if dist > 1e-6:
                                    closing_vel = np.dot(rel_vel, pos_diff) / dist
                                    if dist == min(opponent_dists):
                                        relative_velocity_to_opponent[i] = closing_vel
                                        
                                        # === TTC (Time To Closest) ===
                                        if closing_vel > 0.1:  # 正在接近
                                            time_to_closest_opponent[i] = dist / closing_vel
                                        
                                        # === 方向威胁分数 ===
                                        # 考虑对手速度方向与我方位置的夹角
                                        speed_j = np.linalg.norm(vel_j)
                                        if speed_j > 0.5:
                                            vel_j_norm = vel_j / speed_j
                                            pos_diff_norm = pos_diff / (dist + 1e-6)
                                            alignment = np.dot(vel_j_norm, pos_diff_norm)
                                            # alignment接近1表示对手直冲我方
                                            threat = max(0, alignment) * speed_j / (dist + 1.0)
                                            directional_threat_score[i] = threat
                            
                            # 密度统计
                            if dist < 5.0:
                                opponent_density[i] += 1
                        else:
                            teammate_dists.append(dist)
                            if dist < 5.0:
                                teammate_density[i] += 1
                    
                    # 记录最小距离
                    if opponent_dists:
                        min_opponent_dist[i] = min(opponent_dists)
                    if teammate_dists:
                        min_teammate_dist[i] = min(teammate_dists)
                    
                    # === Voronoi 近似（简化版）===
                    # 控制区域 ≈ 到最近对手和队友距离的加权和
                    avg_neighbor_dist = 0.0
                    count = 0
                    for dist in opponent_dists[:3] + teammate_dists[:3]:  # 最近3个
                        if dist < 50:
                            avg_neighbor_dist += dist
                            count += 1
                    if count > 0:
                        voronoi_area[i] = (avg_neighbor_dist / count) ** 2 * 3.14159
                
                # 添加到DataFrame
                frame_df['min_opponent_distance'] = min_opponent_dist
                frame_df['min_teammate_distance'] = min_teammate_dist
                frame_df['relative_velocity_to_nearest_opponent'] = relative_velocity_to_opponent
                frame_df['opponent_density_5yd'] = opponent_density
                frame_df['teammate_density_5yd'] = teammate_density
                frame_df['time_to_closest_opponent'] = np.clip(time_to_closest_opponent, 0, 10)
                frame_df['directional_threat_score'] = directional_threat_score
                frame_df['voronoi_control_area'] = np.clip(voronoi_area, 0, 500)
                
                frame_features.append(frame_df)
            
            # 合并该play的所有帧
            play_combined = pd.concat(frame_features, ignore_index=True)
            
            # === 时序特征（跨帧计算）===
            play_combined = play_combined.sort_values(['nfl_id', 'frame_id'])
            
            # 对每个球员计算时序变化
            temporal_cols = ['opponent_density_change', 'threat_acceleration']
            for col in temporal_cols:
                play_combined[col] = 0.0
            
            for nfl_id in play_combined['nfl_id'].unique():
                mask = play_combined['nfl_id'] == nfl_id
                player_data = play_combined[mask].copy()
                
                if len(player_data) > 1:
                    # 对手密度变化率
                    density_diff = player_data['opponent_density_5yd'].diff().fillna(0)
                    play_combined.loc[mask, 'opponent_density_change'] = density_diff.values
                    
                    # 威胁加速度
                    threat_diff = player_data['directional_threat_score'].diff().fillna(0)
                    play_combined.loc[mask, 'threat_acceleration'] = threat_diff.values
            
            all_frames.append(play_combined)
        
        df = pd.concat(all_frames, ignore_index=True)
        
        new_cols = [
            'min_opponent_distance',
            'min_teammate_distance',
            'relative_velocity_to_nearest_opponent',
            'opponent_density_5yd',
            'teammate_density_5yd',
            'time_to_closest_opponent',           # 新增
            'directional_threat_score',           # 新增
            'voronoi_control_area',               # 新增
            'opponent_density_change',            # 新增（时序）
            'threat_acceleration'                 # 新增（时序）
        ]
        
        return df, new_cols

    def transform(self, df):
        df = df.copy().sort_values(['game_id','play_id','nfl_id','frame_id'])
        df = self._create_basic_features(df)

        print("\nStep 2/3: Adding selected advanced features...")
        for group_name in self.active_groups:
            if group_name in self.feature_creators:
                creator = self.feature_creators[group_name]
                df, new_cols = creator(df)
                self.created_feature_cols.extend(new_cols)
                print(f"  [+] Added '{group_name}' ({len(new_cols)} cols)")
            else:
                print(f"  [!] Unknown feature group: {group_name}")

        final_cols = sorted(set(self.created_feature_cols))
        print(f"\nTotal features created: {len(final_cols)}")
        return df, final_cols

In [None]:
# -------------------------------
# Model & training (same spirit as your version)
# -------------------------------
class TemporalHuber(nn.Module):
    """Masked, time-decayed Huber loss with optional smoothness penalties.

    The main term is a Huber loss between ``pred`` and ``target`` averaged
    over valid positions with weights ``exp(-time_decay * t)``. Optional
    penalties apply the same weighted average to the first and second
    differences of ``pred`` along the time axis.

    Each term is a proper weighted mean: the decay weight enters numerator
    and denominator exactly once.
    """

    def __init__(self, delta=0.5, time_decay=0.03, velocity_penalty_weight=0.01,
                 acceleration_penalty_weight=0.0, use_huber_for_penalty=True):
        """
        Args:
            delta: Threshold of the Huber loss.
            time_decay: Time-decay coefficient; larger values give later
                time steps less weight. Values <= 0 disable the decay.
            velocity_penalty_weight: Weight of the first-difference
                (velocity change) penalty.
            acceleration_penalty_weight: Weight of the second-difference
                (acceleration) penalty.
            use_huber_for_penalty: Use the Huber loss for the penalty terms
                as well; otherwise a plain squared penalty is used.
        """
        super().__init__()
        self.delta = delta
        self.time_decay = time_decay
        self.velocity_penalty_weight = velocity_penalty_weight
        self.acceleration_penalty_weight = acceleration_penalty_weight
        self.use_huber_for_penalty = use_huber_for_penalty

    def _huber(self, err):
        """Element-wise Huber loss of ``err`` with threshold ``self.delta``."""
        abs_err = torch.abs(err)
        return torch.where(abs_err <= self.delta,
                           0.5 * err * err,
                           self.delta * (abs_err - 0.5 * self.delta))

    def _penalty(self, diff):
        """Element-wise penalty for a difference tensor (Huber or squared)."""
        return self._huber(diff) if self.use_huber_for_penalty else diff * diff

    def _weighted_mean(self, loss, mask):
        """Mean of ``loss`` over ``mask``, weighted by the time decay.

        Position ``t`` along dim 1 gets weight ``exp(-time_decay * t)``
        (all ones when the decay is disabled).
        """
        weights = mask
        if self.time_decay > 0:
            steps = torch.arange(loss.size(1), device=loss.device).float()
            weights = mask * torch.exp(-self.time_decay * steps).view(1, -1)
        return (loss * weights).sum() / (weights.sum() + 1e-8)

    def forward(self, pred, target, mask):
        """Return the scalar loss.

        Args:
            pred: Predictions; dim 1 is treated as the time axis.
            target: Targets, same shape as ``pred``.
            mask: Validity weights, same shape as ``pred`` (presumably 1 for
                real steps and 0 for padding -- confirm against the caller).
        """
        # Main Huber term.
        total_loss = self._weighted_mean(self._huber(pred - target), mask)

        # Velocity smoothness: first difference along time.
        if self.velocity_penalty_weight > 0 and pred.size(1) > 1:
            velocity_diff = pred[:, 1:] - pred[:, :-1]
            velocity_penalty = self._weighted_mean(
                self._penalty(velocity_diff), mask[:, 1:])
            total_loss = total_loss + self.velocity_penalty_weight * velocity_penalty

        # Acceleration smoothness: second difference along time.
        if self.acceleration_penalty_weight > 0 and pred.size(1) > 2:
            velocity_diff = pred[:, 1:] - pred[:, :-1]
            acceleration = velocity_diff[:, 1:] - velocity_diff[:, :-1]
            acceleration_penalty = self._weighted_mean(
                self._penalty(acceleration), mask[:, 2:])
            total_loss = total_loss + self.acceleration_penalty_weight * acceleration_penalty

        return total_loss

class SeqModel(nn.Module):
    """Transformer encoder that maps an input window to cumulative per-step offsets.

    Pipeline: linear projection -> learned positional encoding -> 2-layer
    pre-LN Transformer encoder -> single-query attention pooling -> MLP head
    producing ``horizon`` per-step increments, which are cumulatively summed.

    Args:
        input_dim: Number of features per time step.
        horizon: Number of future steps predicted.
        max_len: Longest input window the learned positional table supports.
            Defaults to 10, the value that was previously hard-coded.
    """

    def __init__(self, input_dim, horizon, max_len=10):
        super().__init__()
        # Project to a width divisible by the number of attention heads.
        self.hidden_dim = 128
        self.max_len = max_len
        self.input_proj = nn.Linear(input_dim, self.hidden_dim)

        # Learned positional encoding covering up to ``max_len`` time steps.
        self.pos_encoding = nn.Parameter(torch.randn(1, max_len, self.hidden_dim) * 0.02)

        # Transformer encoder.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.hidden_dim,
            nhead=4,
            dim_feedforward=256,
            dropout=0.1,
            activation='gelu',
            batch_first=True,
            norm_first=True  # pre-LN variant
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)

        # Attention pooling: one learned query attends over the encoded steps.
        self.pool_ln = nn.LayerNorm(self.hidden_dim)
        self.pool_attn = nn.MultiheadAttention(
            self.hidden_dim,
            num_heads=4,
            batch_first=True,
            dropout=0.1
        )
        self.pool_query = nn.Parameter(torch.randn(1, 1, self.hidden_dim))

        # Output head with a per-channel PReLU activation.
        self.head = nn.Sequential(
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.PReLU(num_parameters=self.hidden_dim),  # one learned negative slope per channel
            nn.Dropout(0.2),
            nn.Linear(self.hidden_dim, horizon)
        )

    def forward(self, x):
        """Return cumulative offsets of shape (B, horizon) for x of shape (B, seq_len, input_dim).

        Raises:
            ValueError: If ``seq_len`` exceeds the positional table length.
        """
        B, seq_len, _ = x.shape
        table_len = self.pos_encoding.size(1)
        if seq_len > table_len:
            # Without this check the addition below fails with an opaque
            # broadcasting error.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len={table_len}; "
                "construct SeqModel with a larger max_len"
            )

        # Project the input features.
        x = self.input_proj(x)  # (B, seq_len, hidden_dim)

        # Add the positional encoding for the steps actually present.
        x = x + self.pos_encoding[:, :seq_len, :]

        # Transformer encoding.
        h = self.transformer(x)  # (B, seq_len, hidden_dim)

        # Attention pooling down to one vector per sample.
        q = self.pool_query.expand(B, -1, -1)  # (B, 1, hidden_dim)
        h_norm = self.pool_ln(h)  # (B, seq_len, hidden_dim)
        ctx, _ = self.pool_attn(q, h_norm, h_norm)  # (B, 1, hidden_dim)

        # Per-step increments.
        out = self.head(ctx.squeeze(1))  # (B, horizon)

        # Cumulative sum turns increments into offsets from the start.
        return torch.cumsum(out, dim=1)  # (B, horizon)

def prepare_targets(batch_axis, max_h):
    """Pad (or truncate) variable-length 1-D targets to ``max_h`` and build masks.

    Args:
        batch_axis: Sequence of 1-D array-likes, one per sample.
        max_h: Fixed output length.

    Returns:
        ``(targets, masks)``: two float32 tensors of shape
        ``(len(batch_axis), max_h)``. ``targets`` is zero-padded on the right;
        ``masks`` is 1.0 where a real target value is present and 0.0 on padding.

    Targets longer than ``max_h`` are truncated to their first ``max_h``
    values; previously they made ``np.pad`` fail on a negative pad width.
    """
    n = len(batch_axis)
    # Fill one preallocated array rather than building a tensor per row.
    padded = np.zeros((n, max_h), dtype=np.float32)
    mask = np.zeros((n, max_h), dtype=np.float32)
    for row, arr in enumerate(batch_axis):
        values = np.asarray(arr, dtype=np.float32)[:max_h]
        length = len(values)
        padded[row, :length] = values
        mask[row, :length] = 1.0
    return torch.from_numpy(padded), torch.from_numpy(mask)

def train_model(X_train, y_train, X_val, y_val, input_dim, horizon, config):
    """Train one SeqModel with early stopping and return the best checkpoint.

    Args:
        X_train, X_val: Indexable collections of equally-shaped per-sample
            feature arrays (sliced and stacked into batches).
        y_train, y_val: Per-sample 1-D target arrays; padded to ``horizon``
            by ``prepare_targets``.
        input_dim: Feature count per time step, passed to ``SeqModel``.
        horizon: Number of predicted steps.
        config: Object providing ``DEVICE``, ``LEARNING_RATE``, ``BATCH_SIZE``,
            ``EPOCHS`` and ``PATIENCE``.

    Returns:
        ``(model, best_loss)``: the model restored to the weights with the
        lowest mean validation loss, and that loss.
    """
    device = config.DEVICE
    model = SeqModel(input_dim, horizon).to(device)
    criterion = TemporalHuber(delta=0.5, time_decay=0.03)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE, weight_decay=1e-5)
    # ``verbose`` is deliberately not passed: False was already the default,
    # and the argument is deprecated (newer PyTorch releases may reject it).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

    # build batches (keep numpy → torch)
    def build_batches(X, Y):
        batches = []
        B = config.BATCH_SIZE
        for i in range(0, len(X), B):
            end = min(i + B, len(X))
            xs = torch.tensor(np.stack(X[i:end]).astype(np.float32))
            ys, ms = prepare_targets([Y[j] for j in range(i, end)], horizon)
            batches.append((xs, ys, ms))
        return batches

    # NOTE: batches are built once, so every epoch visits the samples in the
    # same fixed order (no shuffling).
    tr_batches = build_batches(X_train, y_train)
    va_batches = build_batches(X_val,   y_val)

    best_loss, best_state, bad = float('inf'), None, 0
    for epoch in range(1, config.EPOCHS + 1):
        model.train()
        train_losses = []
        for bx, by, bm in tr_batches:
            bx, by, bm = bx.to(device), by.to(device), bm.to(device)
            pred = model(bx)
            loss = criterion(pred, by, bm)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        val_losses = []
        with torch.no_grad():
            for bx, by, bm in va_batches:
                bx, by, bm = bx.to(device), by.to(device), bm.to(device)
                pred = model(bx)
                val_losses.append(criterion(pred, by, bm).item())

        trl, val = float(np.mean(train_losses)), float(np.mean(val_losses))
        scheduler.step(val)
        if epoch % 10 == 0:
            print(f"  Epoch {epoch}: train={trl:.4f}, val={val:.4f}")

        if val < best_loss:
            best_loss, bad = val, 0
            # Snapshot on CPU so the checkpoint does not hold device memory.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        else:
            bad += 1
            if bad >= config.PATIENCE:
                print(f"  Early stop at epoch {epoch}")
                break

    # best_state stays None only if no epoch ever improved on +inf
    # (e.g. the validation loss was NaN throughout).
    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_loss

In [None]:
# -------------------------------
# Main pipeline (modified for a multi-seed ensemble)
# -------------------------------
class CFG(Config):
    """Run configuration: everything from ``Config`` plus the ensemble seed list."""
    # Seeds for the ensemble; main() runs one full cross-validation per seed.
    SEEDS = [42, 19, 89,64] # Change or extend this list to use different/more seeds.

def _scale_sequences(scaler, seq_array):
    """Standardize an (N, T, F) array with a fitted scaler in one call; returns float32."""
    n_feat = seq_array.shape[-1]
    flat = scaler.transform(seq_array.reshape(-1, n_feat))
    return flat.reshape(seq_array.shape).astype(np.float32)


def _predict_batched(model, X, device, batch_size=4096):
    """Run ``model`` in eval mode over numpy array ``X`` in chunks; returns a numpy array."""
    model.eval()
    chunks = []
    with torch.no_grad():
        for start in range(0, len(X), batch_size):
            xb = torch.tensor(X[start:start + batch_size]).to(device)
            chunks.append(model(xb).cpu().numpy())
    return np.concatenate(chunks, axis=0)


def _masked_rmse(pred_dx, pred_dy, true_dx, true_dy, horizon):
    """RMSE of sqrt(mean(dx_err^2 + dy_err^2)) over the valid (unpadded) target steps."""
    tgt_dx, valid = prepare_targets(true_dx, horizon)
    tgt_dy, _ = prepare_targets(true_dy, horizon)
    keep = valid.cpu().numpy() > 0  # validity is taken from the dx targets
    sq_err = (pred_dx - tgt_dx.cpu().numpy()) ** 2 + (pred_dy - tgt_dy.cpu().numpy()) ** 2
    return np.sqrt(np.mean(sq_err[keep]))


def _augment_sequences(seq_array, aug_name, aug_param, feat_cols):
    """Apply one test-time augmentation to an (N, T, F) array of raw sequences."""
    if aug_name == 'noise':
        # Small Gaussian noise on every feature.
        return seq_array + np.random.randn(*seq_array.shape) * aug_param
    if aug_name == 'temporal_shift':
        # Drop the first ``shift`` frames and repeat the last frame to keep length T.
        shift = int(aug_param)
        tail = np.repeat(seq_array[:, -1:], shift, axis=1)
        return np.concatenate([seq_array[:, shift:], tail], axis=1)
    if aug_name == 'speed_scale':
        # Scale velocity and acceleration components when those columns exist.
        scaled = seq_array.copy()
        for col_a, col_b in (('vx', 'vy'), ('ax', 'ay')):
            if col_a in feat_cols and col_b in feat_cols:
                scaled[:, :, feat_cols.index(col_a)] *= aug_param
                scaled[:, :, feat_cols.index(col_b)] *= aug_param
        return scaled
    return seq_array


def main():
    """Train a multi-seed GroupKFold ensemble and write ``submission.csv``.

    Steps: load the weekly CSVs, build feature sequences, train one ΔX and
    one ΔY model per (seed, fold), report validation RMSE, then predict the
    test set with every model plus down-weighted test-time augmentation on
    the best seed's models.

    Raises:
        FileNotFoundError: If no weekly training CSV is found.
        ValueError: If train and test feature columns differ.
    """
    cfg = CFG()
    print("="*80)
    print(f"PASO 2: PIPELINE MEJORADO CON ENSEMBLE DE {len(cfg.SEEDS)} SEMILLAS + TTA SELECTIVO")
    print("="*80)
    print(f"Semillas a utilizar: {cfg.SEEDS}")
    print(f"cuDF backend activo? {USE_CUDF}")

    # [1/4] Load data (done once)
    print("\n[1/4] Cargando datos...")
    train_input_files  = [cfg.DATA_DIR / f"train/input_2023_w{w:02d}.csv"  for w in range(1, 19)]
    train_output_files = [cfg.DATA_DIR / f"train/output_2023_w{w:02d}.csv" for w in range(1, 19)]
    input_paths = [f for f in train_input_files if f.exists()]
    output_paths = [f for f in train_output_files if f.exists()]
    if not input_paths or not output_paths:
        # pd.concat on an empty list would fail with a far less helpful message.
        raise FileNotFoundError(f"No weekly training CSVs found under {cfg.DATA_DIR / 'train'}")
    train_input  = pd.concat([pd.read_csv(f) for f in input_paths], ignore_index=True)
    train_output = pd.concat([pd.read_csv(f) for f in output_paths], ignore_index=True)
    test_input     = pd.read_csv(cfg.DATA_DIR / "test_input.csv")
    test_template  = pd.read_csv(cfg.DATA_DIR / "test.csv")

    # [2/4] Build sequences (done once)
    print("\n[2/4] Construyendo secuencias con características AVANZADAS...")
    seqs, tdx, tdy, _tfids, seq_meta, feat_cols, _dir_map = prepare_sequences_with_advanced_features(
        train_input, output_df=train_output, is_training=True,
        window_size=cfg.WINDOW_SIZE
    )

    # Stack once so each fold can slice by index instead of rebuilding lists.
    X_all = np.stack(list(seqs))
    n_features = X_all.shape[-1]
    targets_dx = list(tdx)
    targets_dy = list(tdy)

    # [3/4] GroupKFold training over several seeds
    print("\n[3/4] Iniciando entrenamiento de ensemble...")

    # Containers for the models and scalers of every run
    all_models_x, all_models_y, all_scalers = [], [], []
    fold_rmse_list = []  # RMSE of every (seed, fold)

    # Per-seed bookkeeping:
    # {seed: {'models_x': [], 'models_y': [], 'scalers': [], 'rmse_list': []}}
    seed_models = {}

    groups = np.array([d['game_id'] for d in seq_meta])

    for seed in cfg.SEEDS:
        print(f"\n{'='*70}\n   Entrenando con Semilla (Seed): {seed}\n{'='*70}")
        set_seed(seed)

        seed_models[seed] = {
            'models_x': [],
            'models_y': [],
            'scalers': [],
            'rmse_list': []
        }

        gkf = GroupKFold(n_splits=cfg.N_FOLDS)

        for fold, (tr, va) in enumerate(gkf.split(X_all, groups=groups), 1):
            print(f"\n{'-'*60}\nFold {fold}/{cfg.N_FOLDS} para la semilla {seed}\n{'-'*60}")

            X_tr_raw = X_all[tr]
            X_va_raw = X_all[va]
            y_tr_dx = [targets_dx[i] for i in tr]
            y_tr_dy = [targets_dy[i] for i in tr]
            y_va_dx = [targets_dx[i] for i in va]
            y_va_dy = [targets_dy[i] for i in va]

            # Per-fold standardization, fitted on training rows only
            scaler = StandardScaler()
            scaler.fit(X_tr_raw.reshape(-1, n_features))

            X_tr_sc = _scale_sequences(scaler, X_tr_raw)
            X_va_sc = _scale_sequences(scaler, X_va_raw)

            # Train the ΔX model
            print("Entrenando modelo ΔX...")
            mx, loss_x = train_model(
                X_tr_sc, y_tr_dx,
                X_va_sc, y_va_dx,
                X_tr_sc.shape[-1], cfg.MAX_FUTURE_HORIZON, cfg
            )

            # Train the ΔY model
            print("Entrenando modelo ΔY...")
            my, loss_y = train_model(
                X_tr_sc, y_tr_dy,
                X_va_sc, y_va_dy,
                X_tr_sc.shape[-1], cfg.MAX_FUTURE_HORIZON, cfg
            )

            # Keep this fold's models and scaler, globally and per seed
            all_models_x.append(mx)
            all_models_y.append(my)
            all_scalers.append(scaler)
            seed_models[seed]['models_x'].append(mx)
            seed_models[seed]['models_y'].append(my)
            seed_models[seed]['scalers'].append(scaler)

            # Validation RMSE: sqrt(mean((x_pred - x_true)^2 + (y_pred - y_true)^2))
            pred_dx = _predict_batched(mx, X_va_sc, cfg.DEVICE)
            pred_dy = _predict_batched(my, X_va_sc, cfg.DEVICE)
            fold_rmse = _masked_rmse(pred_dx, pred_dy, y_va_dx, y_va_dy, cfg.MAX_FUTURE_HORIZON)
            fold_rmse_list.append(fold_rmse)
            seed_models[seed]['rmse_list'].append(fold_rmse)

            print(f"Fold {fold} (semilla {seed}) — val loss: dx={loss_x:.5f}, dy={loss_y:.5f} | RMSE={fold_rmse:.5f}")

    # Mean RMSE per seed; the best seed is the one that receives TTA
    print("\n" + "="*80)
    print("ESTADÍSTICAS DE RMSE POR SEMILLA")
    print("="*80)
    seed_avg_rmse = {}
    for seed in cfg.SEEDS:
        avg_rmse = np.mean(seed_models[seed]['rmse_list'])
        std_rmse = np.std(seed_models[seed]['rmse_list'])
        seed_avg_rmse[seed] = avg_rmse
        print(f"Semilla {seed}: RMSE Promedio = {avg_rmse:.5f} ± {std_rmse:.5f}")

    best_seed = min(seed_avg_rmse, key=seed_avg_rmse.get)
    print("-"*80)
    print(f"✓ MEJOR SEMILLA: {best_seed} con RMSE = {seed_avg_rmse[best_seed]:.5f}")
    print("  (Se aplicará TTA solo a los modelos de esta semilla)")
    print("="*80)

    # Overall RMSE statistics
    rmse_mean = np.mean(fold_rmse_list)
    rmse_std = np.std(fold_rmse_list)

    print("\n" + "="*80)
    print("ESTADÍSTICAS DE RMSE POR FOLD (TODOS)")
    print("="*80)
    for i, rmse in enumerate(fold_rmse_list, 1):
        print(f"Fold {i}: RMSE = {rmse:.5f}")
    print("-"*80)
    print(f"RMSE Promedio Global: {rmse_mean:.5f}")
    print(f"RMSE Desviación Estándar Global: {rmse_std:.5f}")
    print("="*80)

    # [4/4] Test inference with every trained model + selective TTA
    print(f"\n[4/4] Inferencia y submission con ensemble de {len(all_models_x)} modelos + TTA en mejor semilla...")
    test_seqs, test_meta, feat_cols_t, _dir_map_test = prepare_sequences_with_advanced_features(
        test_input, test_template=test_template, is_training=False,
        window_size=cfg.WINDOW_SIZE
    )
    if feat_cols_t != feat_cols:
        # Explicit raise: an ``assert`` would be stripped under ``python -O``.
        raise ValueError("¡Las columnas de características de Train/Test no coinciden!")

    idx_x = feat_cols.index('x')
    idx_y = feat_cols.index('y')

    X_test_raw = np.stack(list(test_seqs))
    # Last observed position of each sequence (unified frame), taken from the
    # un-augmented features.
    x_last_uni = X_test_raw[:, -1, idx_x].astype(np.float32)
    y_last_uni = X_test_raw[:, -1, idx_y].astype(np.float32)

    # Plain predictions from every model (weight 1.0 each)
    all_preds_dx, all_preds_dy = [], []

    print(f"\n--- Generando predicciones normales de {len(all_models_x)} modelos...")
    for mx, my, sc in zip(all_models_x, all_models_y, all_scalers):
        X_sc = _scale_sequences(sc, X_test_raw)
        all_preds_dx.append(_predict_batched(mx, X_sc, cfg.DEVICE))
        all_preds_dy.append(_predict_batched(my, X_sc, cfg.DEVICE))

    # TTA only for the best seed (reduced weight)
    tta_weight = 0.15  # weight of each TTA prediction (tunable)
    tta_augmentations = [
        ('noise', 0.01),      # small Gaussian noise
        ('temporal_shift', 1), # shift by one frame
        ('speed_scale', 1.05)  # scale speed components by 5%
    ]

    print(f"\n--- Aplicando TTA a modelos de la mejor semilla ({best_seed})...")
    print(f"    Peso de cada aumentación TTA: {tta_weight:.3f}")
    print(f"    Aumentaciones: {len(tta_augmentations)}")

    best_models_x = seed_models[best_seed]['models_x']
    best_models_y = seed_models[best_seed]['models_y']
    best_scalers = seed_models[best_seed]['scalers']

    tta_preds_dx, tta_preds_dy = [], []

    for aug_name, aug_param in tta_augmentations:
        print(f"    Procesando aumentación: {aug_name} (param={aug_param})...")

        # Augment the raw test sequences once, then score with each best-seed model
        X_test_aug = _augment_sequences(X_test_raw, aug_name, aug_param, feat_cols)

        for mx, my, sc in zip(best_models_x, best_models_y, best_scalers):
            X_sc = _scale_sequences(sc, X_test_aug)
            tta_preds_dx.append(_predict_batched(mx, X_sc, cfg.DEVICE))
            tta_preds_dy.append(_predict_batched(my, X_sc, cfg.DEVICE))

    # Weighted ensemble: plain predictions (weight 1.0) + TTA (reduced weight)
    print(f"\n--- Combinando predicciones...")
    print(f"    Predicciones normales: {len(all_preds_dx)} modelos × peso 1.0")
    print(f"    Predicciones TTA: {len(tta_preds_dx)} aumentaciones × peso {tta_weight:.3f}")

    total_weight = len(all_preds_dx) * 1.0 + len(tta_preds_dx) * tta_weight

    ens_dx = np.sum(all_preds_dx, axis=0) * 1.0
    ens_dy = np.sum(all_preds_dy, axis=0) * 1.0

    if len(tta_preds_dx) > 0:
        ens_dx += np.sum(tta_preds_dx, axis=0) * tta_weight
        ens_dy += np.sum(tta_preds_dy, axis=0) * tta_weight

    # Normalize by the total weight
    ens_dx /= total_weight
    ens_dy /= total_weight

    H = ens_dx.shape[1]

    # Build submission rows, mapping right-direction plays back to the original frame
    rows = []
    skipped = 0
    tt_idx = test_template.set_index(['game_id','play_id','nfl_id']).sort_index()

    for i, meta in enumerate(test_meta):
        gid = meta['game_id']; pid = meta['play_id']; nid = meta['nfl_id']
        play_is_right = (meta['play_direction'] == 'right')

        try:
            fids = tt_idx.loc[(gid,pid,nid),'frame_id']
        except KeyError:
            # Sequence has no rows in the template: count it instead of dropping silently.
            skipped += 1
            continue

        if isinstance(fids, pd.Series):
            fids = fids.sort_values().tolist()
        else:
            fids = [int(fids)]

        for t, fid in enumerate(fids):
            # Frames beyond the model horizon reuse the last predicted offset.
            tt = min(t, H - 1)
            x_uni = np.clip(x_last_uni[i] + ens_dx[i, tt], 0, FIELD_LENGTH)
            y_uni = np.clip(y_last_uni[i] + ens_dy[i, tt], 0, FIELD_WIDTH)
            x_out, y_out = invert_to_original_direction(x_uni, y_uni, play_is_right)

            rows.append({
                'id': f"{gid}_{pid}_{nid}_{int(fid)}",
                'x': x_out,
                'y': y_out
            })

    if skipped:
        # print rather than warnings.warn: the file silences warnings globally.
        print(f"WARNING: {skipped} test sequences had no matching rows in test.csv and were skipped")

    submission = pd.DataFrame(rows)
    submission.to_csv("submission.csv", index=False)
    print("\n" + "="*80)
    print("¡PASO 2 COMPLETO CON TTA SELECTIVO!")
    print("="*80)
    print(f"✓ Submission guardada en submission.csv  |  Filas: {len(submission)}")
    print(f"Total de modelos base en ensemble: {len(all_models_x)}")
    print(f"Modelos con TTA (semilla {best_seed}): {len(best_models_x)} × {len(tta_augmentations)} aumentaciones")
    print(f"Peso efectivo TTA vs normal: {tta_weight:.3f} : 1.0")
    print(f"Características utilizadas: {len(feat_cols)}  (cuDF activo: {USE_CUDF})")

# Run the full train + inference pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
