# In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
import warnings
warnings.filterwarnings('ignore')

from multiprocessing import Pool, cpu_count
from tqdm.auto import tqdm
import pickle

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
# CatBoost has been removed from the ensemble.
# from catboost import CatBoostRegressor


# Root directory of the competition dataset; the training CSVs live under
# BASE_DIR / 'train' and the test CSVs directly under BASE_DIR.
BASE_DIR = Path('/kaggle/input/nfl-big-data-bowl-2026-prediction')
# Upper bounds used when clipping predicted x / y coordinates (lower bound is 0).
# NOTE(review): presumably field length / width in yards -- confirm.
FIELD_X_MAX, FIELD_Y_MAX = 120.0, 53.3
# Fraction of rows (taken in row order) used for training; the rest is the
# hold-out validation slice.
TRAIN_VAL_SPLIT = 0.90
# Default number of K-fold splits used by the stacking ensemble.
N_FOLDS = 5
# Seed shared by the K-fold splitter and the base models.
RANDOM_STATE = 42


def load_weekly_data(week_num):
    """Read the input and output CSV files of a single 2023 training week.

    Args:
        week_num: Week number; formatted zero-padded to two digits in the
            file names.

    Returns:
        A ``(input_df, output_df)`` tuple of DataFrames read from
        ``BASE_DIR / 'train'``.
    """
    paths = (
        BASE_DIR / f'train/input_2023_w{week_num:02d}.csv',
        BASE_DIR / f'train/output_2023_w{week_num:02d}.csv',
    )
    week_input, week_output = (pd.read_csv(csv_path) for csv_path in paths)
    return week_input, week_output


def load_all_training_data():
    """Load every available training week plus the test files.

    Weeks 1-18 are probed on disk; only weeks for which BOTH the input and
    the output CSV exist are loaded (in parallel, one task per week), so a
    partially populated dataset no longer crashes inside a worker process.

    Returns:
        A tuple ``(input_data, output_data, test_input, test_template)``:
        the concatenated weekly input frames, the concatenated weekly output
        frames, ``test_input.csv`` and ``test.csv``.

    Raises:
        FileNotFoundError: If no complete training week is found.
    """
    weeks = [
        w for w in range(1, 19)
        if (BASE_DIR / f'train/input_2023_w{w:02d}.csv').exists()
        and (BASE_DIR / f'train/output_2023_w{w:02d}.csv').exists()
    ]

    print(f'Found {len(weeks)} weeks of data')
    if not weeks:
        raise FileNotFoundError(
            f'No training weeks found under {BASE_DIR / "train"}'
        )

    # Size the pool and the progress bar from the weeks that actually exist.
    with Pool(min(cpu_count(), len(weeks))) as pool:
        results = list(tqdm(pool.imap(load_weekly_data, weeks),
                            total=len(weeks), desc='Loading data'))

    input_data = pd.concat([r[0] for r in results], ignore_index=True)
    output_data = pd.concat([r[1] for r in results], ignore_index=True)

    test_input = pd.read_csv(BASE_DIR / 'test_input.csv')
    test_template = pd.read_csv(BASE_DIR / 'test.csv')

    print(f'Input: {input_data.shape}, Output: {output_data.shape}')
    return input_data, output_data, test_input, test_template


def create_physics_features(df):
    """Add velocity, acceleration and ball-relative kinematic columns.

    Reads ``x``, ``y``, ``s``, ``a``, ``dir``, ``ball_land_x``,
    ``ball_land_y`` and ``num_frames_output``; the input frame is not
    modified.

    NOTE(review): ``dir`` is decomposed as if it were measured
    counter-clockwise from the +x axis (cos -> x, sin -> y). NFL tracking
    data is commonly documented as clockwise from +y -- confirm against the
    competition data dictionary.

    Args:
        df: Frame-level tracking data.

    Returns:
        A copy of ``df`` with the additional feature columns appended.
    """
    out = df.copy()

    heading = np.radians(out['dir'])
    cos_heading = np.cos(heading)
    sin_heading = np.sin(heading)

    # Speed and acceleration decomposed along the heading.
    out['velocity_x'] = out['s'] * cos_heading
    out['velocity_y'] = out['s'] * sin_heading
    out['acceleration_x'] = out['a'] * cos_heading
    out['acceleration_y'] = out['a'] * sin_heading

    # Vector from the player to the ball landing spot.
    to_ball_x = out['ball_land_x'] - out['x']
    to_ball_y = out['ball_land_y'] - out['y']
    out['dist_to_ball'] = np.sqrt(to_ball_x**2 + to_ball_y**2)
    out['angle_to_ball'] = np.arctan2(to_ball_y, to_ball_x)

    # Component of the velocity that points at the landing spot.
    out['velocity_toward_ball'] = (
        out['velocity_x'] * np.cos(out['angle_to_ball'])
        + out['velocity_y'] * np.sin(out['angle_to_ball'])
    )

    # Frames -> time; presumably 10 frames per second -- TODO confirm.
    out['time_to_ball'] = out['num_frames_output'] / 10.0

    # Constant-velocity extrapolation over time_to_ball.
    out['expected_x_at_ball'] = out['x'] + out['velocity_x'] * out['time_to_ball']
    out['expected_y_at_ball'] = out['y'] + out['velocity_y'] * out['time_to_ball']

    # Miss distance between the extrapolated position and the landing spot.
    out['error_from_ball_x'] = out['expected_x_at_ball'] - out['ball_land_x']
    out['error_from_ball_y'] = out['expected_y_at_ball'] - out['ball_land_y']
    out['error_from_ball'] = np.sqrt(
        out['error_from_ball_x']**2 + out['error_from_ball_y']**2
    )

    return out


def _angular_difference_deg(a, b):
    """Return the smallest absolute angle between ``a`` and ``b`` in [0, 180]."""
    # Reduce modulo 360 first: the raw difference can exceed 360 when the two
    # angles use different ranges (e.g. [0, 360] vs [-180, 180]), in which
    # case ``360 - diff`` would go negative.
    diff = np.abs(a - b) % 360.0
    return np.minimum(diff, 360.0 - diff)


def create_player_features(df):
    """Add orientation, body-size, momentum and role indicator columns.

    Reads ``o``, ``dir``, ``angle_to_ball`` (radians), ``player_height``
    (``'<feet>-<inches>'`` strings), ``player_weight``, ``velocity_x``,
    ``velocity_y``, ``s``, ``player_role`` and ``player_side``; the input
    frame is not modified.

    Args:
        df: Frame-level data that already carries the physics features.

    Returns:
        A copy of ``df`` with the additional feature columns appended.
        ``orientation_diff`` and ``angle_diff`` are always within [0, 180].
    """
    df = df.copy()

    df['orientation_diff'] = _angular_difference_deg(df['o'], df['dir'])

    # angle_to_ball comes from arctan2 and so lies in [-180, 180] degrees,
    # whereas o is compared as-is; the helper handles the wrap-around.
    df['angle_diff'] = _angular_difference_deg(
        df['o'], np.degrees(df['angle_to_ball'])
    )

    height_parts = df['player_height'].str.split('-', expand=True)
    df['height_inches'] = (
        height_parts[0].astype(float) * 12 + height_parts[1].astype(float)
    )
    # weight / height^2 * 703; presumably pounds and inches -- TODO confirm.
    df['bmi'] = (df['player_weight'] / (df['height_inches']**2)) * 703

    df['momentum_x'] = df['player_weight'] * df['velocity_x']
    df['momentum_y'] = df['player_weight'] * df['velocity_y']
    df['kinetic_energy'] = 0.5 * df['player_weight'] * df['s']**2

    # One-hot style indicators for the roles / side the model cares about.
    df['role_targeted_receiver'] = (df['player_role'] == 'Targeted Receiver').astype(int)
    df['role_defensive_coverage'] = (df['player_role'] == 'Defensive Coverage').astype(int)
    df['role_passer'] = (df['player_role'] == 'Passer').astype(int)
    df['side_offense'] = (df['player_side'] == 'Offense').astype(int)

    return df


def create_derived_features(df):
    """Add simple algebraic combinations of the existing feature columns.

    Reads ``x``, ``y``, ``s``, ``dir``, ``ball_land_x``, ``ball_land_y``,
    ``acceleration_x``, ``acceleration_y``, ``angle_to_ball``,
    ``time_to_ball`` and ``dist_to_ball``; the input frame is not modified.

    Args:
        df: Frame-level data that already carries the physics features.

    Returns:
        A copy of ``df`` with the derived columns appended.
    """
    out = df.copy()

    # None of the new columns feeds another, so they can be computed up
    # front and attached in a fixed order.
    derived = {
        'distance_to_target_x': out['ball_land_x'] - out['x'],
        'distance_to_target_y': out['ball_land_y'] - out['y'],
        'speed_squared': out['s'] ** 2,
        'accel_magnitude': np.sqrt(
            out['acceleration_x']**2 + out['acceleration_y']**2
        ),
        # Cosine of the angle between the heading and the bearing to the ball.
        'velocity_alignment': np.cos(out['angle_to_ball'] - np.radians(out['dir'])),
        'time_squared': out['time_to_ball'] ** 2,
        'dist_squared': out['dist_to_ball'] ** 2,
        # The +0.1 keeps the ratio finite when time_to_ball is zero.
        'weighted_dist_by_time': out['dist_to_ball'] / (out['time_to_ball'] + 0.1),
    }
    for column_name, values in derived.items():
        out[column_name] = values

    return out


def create_sequence_features(df):
    """Add per-player lag, rolling-window and delta features.

    Rows are sorted by ``game_id``, ``play_id``, ``nfl_id``, ``frame_id``
    and every feature is computed within each (game, play, player) group:

    * ``<col>_lag1`` .. ``<col>_lag5`` for x, y, velocity_x, velocity_y, s, a
    * ``<col>_rolling_mean_<w>`` / ``<col>_rolling_std_<w>`` for windows 3
      and 5 (``min_periods=1``) over x, y, velocity_x, velocity_y, s
    * ``velocity_x_delta`` / ``velocity_y_delta`` (first difference)

    Columns missing from ``df`` are skipped silently. The input frame is not
    modified.

    Args:
        df: Frame-level tracking data.

    Returns:
        The sorted frame (original index labels preserved) with the
        sequence features appended.
    """
    group_cols = ['game_id', 'play_id', 'nfl_id']
    df = df.sort_values(group_cols + ['frame_id'])

    # Build the grouping once; every feature below selects a column from it
    # instead of re-grouping the whole frame.
    grouped = df.groupby(group_cols)

    lag_features = [
        col for col in ['x', 'y', 'velocity_x', 'velocity_y', 's', 'a']
        if col in df.columns
    ]
    for lag in range(1, 6):
        for col in lag_features:
            df[f'{col}_lag{lag}'] = grouped[col].shift(lag)

    rolling_features = [
        col for col in ['x', 'y', 'velocity_x', 'velocity_y', 's']
        if col in df.columns
    ]
    for window in [3, 5]:
        for col in rolling_features:
            roller = grouped[col].rolling(window, min_periods=1)
            # Dropping the three group-key index levels leaves the original
            # row labels, so the assignment aligns back onto df.
            df[f'{col}_rolling_mean_{window}'] = roller.mean().reset_index(
                level=[0, 1, 2], drop=True
            )
            df[f'{col}_rolling_std_{window}'] = roller.std().reset_index(
                level=[0, 1, 2], drop=True
            )

    for col in ['velocity_x', 'velocity_y']:
        if col in df.columns:
            df[f'{col}_delta'] = grouped[col].diff()

    return df


def create_training_dataset(input_df, output_df):
    """Join each output (target) row with its player's latest input features.

    Every row of ``output_df`` gets an ``id`` of the form
    ``<game_id>_<play_id>_<nfl_id>_<frame_id>`` and its ``x`` / ``y``
    columns are renamed to ``target_x`` / ``target_y``. The input frames are
    reduced to one row per (game, play, player) and left-joined onto the
    targets; the input ``frame_id`` is dropped so the merged ``frame_id`` is
    the output frame.

    The input rows are sorted by ``frame_id`` within each player here, so
    the result does not depend on the row order the caller passes in.
    Note that ``GroupBy.last`` takes the last non-null value per column, not
    necessarily one whole row.

    Args:
        input_df: Frame-level input features.
        output_df: Target positions with ``game_id``, ``play_id``,
            ``nfl_id``, ``frame_id``, ``x`` and ``y`` columns.

    Returns:
        The merged DataFrame, one row per row of ``output_df``.
    """
    key_cols = ['game_id', 'play_id', 'nfl_id']

    output_df = output_df.copy()
    output_df['id'] = (
        output_df['game_id'].astype(str) + '_' +
        output_df['play_id'].astype(str) + '_' +
        output_df['nfl_id'].astype(str) + '_' +
        output_df['frame_id'].astype(str)
    )
    output_df = output_df.rename(columns={'x': 'target_x', 'y': 'target_y'})

    # Make "last" mean "highest frame_id" regardless of incoming row order.
    if 'frame_id' in input_df.columns:
        input_df = input_df.sort_values(key_cols + ['frame_id'])

    input_agg = input_df.groupby(key_cols).last().reset_index()
    if 'frame_id' in input_agg.columns:
        input_agg = input_agg.drop('frame_id', axis=1)

    merged = output_df.merge(
        input_agg,
        on=key_cols,
        how='left',
        suffixes=('', '_input')
    )

    return merged


# CHANGE 1: CatBoost removed from get_base_models
def get_base_models():
    """Build fresh, unfitted base regressors for the stacking ensemble.

    Returns:
        A dict mapping ``'lgbm'`` to an ``LGBMRegressor`` and ``'xgb'`` to an
        ``XGBRegressor`` (in that order), both seeded with ``RANDOM_STATE``.
        CatBoost is no longer part of the ensemble.
    """
    # Hyper-parameters both libraries share.
    # NOTE(review): LightGBM may ignore ``subsample`` unless ``subsample_freq``
    # is also set -- confirm against the LightGBM parameter docs.
    shared_params = dict(
        n_estimators=1500,
        learning_rate=0.03,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.1,
        reg_lambda=0.1,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )

    return {
        'lgbm': LGBMRegressor(
            max_depth=12, num_leaves=150, min_child_samples=50,
            verbose=-1, **shared_params
        ),
        'xgb': XGBRegressor(
            max_depth=10, min_child_weight=50, tree_method='hist',
            verbosity=0, **shared_params
        ),
    }


def train_stacking_ensemble(X, y, X_val, n_folds=N_FOLDS):
    """Produce level-1 (meta) features from K-fold trained base models.

    For every fold each base model is fitted on the in-fold rows, predicts
    the out-of-fold rows (collected into an OOF vector) and predicts
    ``X_val``; the per-fold ``X_val`` predictions are averaged per model.
    Per-fold and overall OOF RMSE values are printed.

    Args:
        X: Training feature matrix (indexable by integer arrays).
        y: Training targets aligned with ``X``.
        X_val: Feature matrix to generate averaged predictions for.
        n_folds: Number of shuffled K-fold splits.

    Returns:
        ``(meta_X_train, meta_X_val)``: one column per base model holding the
        OOF predictions for ``X`` and the fold-averaged predictions for
        ``X_val`` respectively.
    """
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    base_models = get_base_models()

    oof_train = {name: np.zeros(len(X)) for name in base_models}
    test_preds = {name: [] for name in base_models}

    print(f'Training {n_folds}-fold stacking ensemble')

    for fold, (fit_idx, holdout_idx) in enumerate(splitter.split(X), start=1):
        X_fit, X_holdout = X[fit_idx], X[holdout_idx]
        y_fit, y_holdout = y[fit_idx], y[holdout_idx]

        for name, model in base_models.items():
            # The same estimator object is refitted on every fold.
            model.fit(X_fit, y_fit)
            oof_train[name][holdout_idx] = model.predict(X_holdout)
            test_preds[name].append(model.predict(X_val))

            fold_mse = mean_squared_error(y_holdout, oof_train[name][holdout_idx])
            fold_rmse = np.sqrt(fold_mse)
            print(f'  Fold {fold} {name.upper()}: {fold_rmse:.4f}')

    # Collapse the per-fold X_val predictions into one averaged vector per model.
    for name in base_models:
        test_preds[name] = np.mean(test_preds[name], axis=0)

    meta_X_train = np.column_stack([oof_train[name] for name in base_models])
    meta_X_val = np.column_stack([test_preds[name] for name in base_models])

    oof_rmse = {
        name: np.sqrt(mean_squared_error(y, oof_train[name]))
        for name in base_models
    }
    print(f'OOF RMSE - LGBM: {oof_rmse["lgbm"]:.4f}, XGB: {oof_rmse["xgb"]:.4f}')

    return meta_X_train, meta_X_val


def _engineer_features(df):
    """Run the four feature builders in dependency order on ``df``."""
    df = create_physics_features(df)
    df = create_player_features(df)
    df = create_derived_features(df)
    return create_sequence_features(df)


def _candidate_feature_columns():
    """Return the ordered list of feature names the models are trained on."""
    feature_cols = [
        'x', 'y', 's', 'a', 'o', 'dir', 'velocity_x', 'velocity_y',
        'dist_to_ball', 'angle_to_ball', 'velocity_toward_ball', 'time_to_ball',
        'orientation_diff', 'role_targeted_receiver', 'role_defensive_coverage',
        'role_passer', 'side_offense', 'height_inches', 'player_weight', 'bmi',
        'ball_land_x', 'ball_land_y', 'num_frames_output', 'frame_id',
        'acceleration_x', 'acceleration_y', 'distance_to_target_x', 'distance_to_target_y',
        'speed_squared', 'accel_magnitude', 'velocity_alignment',
        'expected_x_at_ball', 'expected_y_at_ball',
        'error_from_ball_x', 'error_from_ball_y', 'error_from_ball',
        'momentum_x', 'momentum_y', 'kinetic_energy',
        'angle_diff', 'time_squared', 'dist_squared', 'weighted_dist_by_time'
    ]

    for lag in range(1, 6):
        for col in ['x', 'y', 'velocity_x', 'velocity_y', 's', 'a']:
            feature_cols.append(f'{col}_lag{lag}')

    for window in [3, 5]:
        for col in ['x', 'y', 'velocity_x', 'velocity_y', 's']:
            feature_cols.extend([
                f'{col}_rolling_mean_{window}',
                f'{col}_rolling_std_{window}'
            ])

    feature_cols.extend(['velocity_x_delta', 'velocity_y_delta'])
    return feature_cols


def _fit_base_models_and_predict(X_fit, y_fit, X_new):
    """Fit fresh base models on ``(X_fit, y_fit)``; return stacked ``X_new`` predictions."""
    models = get_base_models()
    for model in models.values():
        model.fit(X_fit, y_fit)
    return np.column_stack([model.predict(X_new) for model in models.values()])


def main():
    """Run the full pipeline: load, featurize, validate, and write a submission.

    Steps:
        1. Load training and test data and engineer features.
        2. Split rows 90/10 in row order, build K-fold stacking meta-features
           on the training slice and fit one Ridge meta-model per coordinate.
        3. Report hold-out RMSE and pickle the meta-models to
           ``stacking_models.pkl``.
        4. Refit the base models on ALL labelled rows (training + hold-out),
           feed their test predictions through the meta-models and write
           ``submission.csv``.

    Returns:
        The combined hold-out RMSE, ``sqrt(0.5 * (rmse_x**2 + rmse_y**2))``.
    """
    print('NFL Big Data Bowl 2026 - Stacking Ensemble Pipeline (Fixed)')
    print(f'CPU cores: {cpu_count()}')

    input_data, output_data, test_input, test_template = load_all_training_data()

    print('\nFeature engineering')
    input_features = _engineer_features(input_data)

    print(f'Total features: {input_features.shape[1]}')

    train_df = create_training_dataset(input_features, output_data)
    print(f'Training dataset: {train_df.shape}')

    available_features = [
        col for col in _candidate_feature_columns() if col in train_df.columns
    ]
    print(f'Available features: {len(available_features)}')

    train_df = train_df.dropna(subset=available_features + ['target_x', 'target_y'])
    print(f'Training samples: {train_df.shape[0]:,}')

    X = train_df[available_features].values
    y_x = train_df['target_x'].values
    y_y = train_df['target_y'].values

    # Row-order split: the trailing rows form the hold-out set.
    split_idx = int(len(train_df) * TRAIN_VAL_SPLIT)
    X_train, X_val = X[:split_idx], X[split_idx:]
    y_x_train, y_x_val = y_x[:split_idx], y_x[split_idx:]
    y_y_train, y_y_val = y_y[:split_idx], y_y[split_idx:]

    print(f'\nTrain: {X_train.shape[0]:,}, Validation: {X_val.shape[0]:,}')

    print('\nTraining X coordinate ensemble')
    meta_X_train_x, meta_X_val_x = train_stacking_ensemble(X_train, y_x_train, X_val)

    meta_model_x = Ridge(alpha=1.0)
    meta_model_x.fit(meta_X_train_x, y_x_train)

    pred_x_val = np.clip(meta_model_x.predict(meta_X_val_x), 0, FIELD_X_MAX)
    stacking_rmse_x = np.sqrt(mean_squared_error(y_x_val, pred_x_val))
    print(f'Stacking X RMSE: {stacking_rmse_x:.4f}')

    print('\nTraining Y coordinate ensemble')
    meta_X_train_y, meta_X_val_y = train_stacking_ensemble(X_train, y_y_train, X_val)

    meta_model_y = Ridge(alpha=1.0)
    meta_model_y.fit(meta_X_train_y, y_y_train)

    pred_y_val = np.clip(meta_model_y.predict(meta_X_val_y), 0, FIELD_Y_MAX)
    stacking_rmse_y = np.sqrt(mean_squared_error(y_y_val, pred_y_val))
    print(f'Stacking Y RMSE: {stacking_rmse_y:.4f}')

    final_rmse = np.sqrt(0.5 * (stacking_rmse_x**2 + stacking_rmse_y**2))

    print('\nFinal Results')
    print(f'X Coordinate RMSE: {stacking_rmse_x:.4f}')
    print(f'Y Coordinate RMSE: {stacking_rmse_y:.4f}')
    print(f'Combined RMSE: {final_rmse:.4f}')
    print('Baseline LGBM: 0.7570')
    print(f'Improvement: {((0.7570 - final_rmse) / 0.7570 * 100):.2f}%')

    # NOTE: only the Ridge meta-models are persisted; the base models are not
    # part of this pickle.
    with open('stacking_models.pkl', 'wb') as f:
        pickle.dump({
            'meta_model_x': meta_model_x,
            'meta_model_y': meta_model_y,
            'features': available_features,
            'rmse': final_rmse
        }, f)

    print('\nGenerating submission')
    test_features = _engineer_features(test_input)

    test_agg = test_features.groupby(['game_id', 'play_id', 'nfl_id']).last().reset_index()
    if 'frame_id' in test_agg.columns:
        test_agg = test_agg.drop('frame_id', axis=1)

    test_merged = test_template.merge(
        test_agg, on=['game_id', 'play_id', 'nfl_id'], how='left'
    )
    test_merged['id'] = (
        test_merged['game_id'].astype(str) + '_' +
        test_merged['play_id'].astype(str) + '_' +
        test_merged['nfl_id'].astype(str) + '_' +
        test_merged['frame_id'].astype(str)
    )

    # Features absent from the test data are zero-filled, as are NaNs.
    for col in available_features:
        if col not in test_merged.columns:
            test_merged[col] = 0

    X_test = test_merged[available_features].fillna(0).values

    # CHANGE 2: fixed test-prediction logic. The final base models are fitted
    # on every labelled row (X, not just X_train), matching the log message;
    # the hold-out slice has already served its purpose for the RMSE above.
    print('\nRetraining base models for X coordinate on full training data')
    meta_X_test_x = _fit_base_models_and_predict(X, y_x, X_test)

    print('\nRetraining base models for Y coordinate on full training data')
    meta_X_test_y = _fit_base_models_and_predict(X, y_y, X_test)

    pred_x_test = np.clip(meta_model_x.predict(meta_X_test_x), 0, FIELD_X_MAX)
    pred_y_test = np.clip(meta_model_y.predict(meta_X_test_y), 0, FIELD_Y_MAX)

    submission = pd.DataFrame({
        'id': test_merged['id'],
        'x': pred_x_test,
        'y': pred_y_test
    })

    submission.to_csv('submission.csv', index=False)
    print(f'Submission saved: {submission.shape}')

    return final_rmse


# Script entry point: run the pipeline only when executed directly. The guard
# also matters for multiprocessing.Pool (used by the data loader), whose worker
# processes may import this module and must not re-run main().
if __name__ == '__main__':
    final_rmse = main()
    print(f'\nFinal Validation RMSE: {final_rmse:.4f}')