In [None]:
# ======================================================
# NFL Big Data Bowl 2026: ML-enhanced PTMIS
# Script lengkap dari input hingga ptmis_ml_output.csv
# ======================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# -------------------------------
# 1. Load cleaned dataset
# -------------------------------
df = pd.read_csv("/kaggle/input/nfl-big-data-bowl-2026-analytics-1/merged_cleaned_features.csv", low_memory=False)

# -------------------------------
# 2. Feature and target selection
# -------------------------------
# Model inputs; every name must be a column of the CSV loaded above.
features = [
    'x_input','y_input','s','s_rolling_3','a','dir','o',
    'absolute_yardline_number','player_height','player_weight',
    'score_difference','distance_to_ball','week'
]

# Regression targets: one model is trained per coordinate.
target_x = 'x_output'
target_y = 'y_output'

# -------------------------------
# 3. Fix data types and missing values
# -------------------------------
# Convert player_height/weight to numeric; anything unparsable becomes NaN.
# NOTE(review): those NaNs are filled with 0 below. If player_height is stored
# as text such as feet-inches, the whole feature collapses to 0 - confirm
# against the CSV.
df['player_height'] = pd.to_numeric(df['player_height'], errors='coerce')
df['player_weight'] = pd.to_numeric(df['player_weight'], errors='coerce')

# Remember which rows have BOTH targets before any filling happens.
# Filling first would turn a missing target into a fake label of 0 and make
# the subsequent row filter a no-op.
has_targets = df[[target_x, target_y]].notna().all(axis=1)

# Fill remaining missing values (features and other columns) with 0.
df = df.fillna(0)

# Keep only rows with genuine targets. The explicit copy makes df_ml an
# independent frame, so adding columns to it later is safe.
df_ml = df.loc[has_targets].copy()
if df_ml.empty:
    raise ValueError(
        f"No rows with both '{target_x}' and '{target_y}' present; nothing to train on."
    )

X = df_ml[features]
y_x = df_ml[target_x]
y_y = df_ml[target_y]

# -------------------------------
# 4. Train-test split
# -------------------------------
# One call splits the features and both targets with the same row assignment.
# NOTE(review): this is a random row-level split. If the file holds several
# rows per play, rows of one play can land in both train and test - consider
# a group-wise split by game_id/play_id (TODO confirm).
X_train, X_test, yx_train, yx_test, yy_train, yy_test = train_test_split(
    X, y_x, y_y, test_size=0.2, random_state=42
)

# -------------------------------
# 5. Train LightGBM Regressor
# -------------------------------
def train_lgb_regressor(X_train, y_train):
    """Fit a LightGBM regressor with fixed hyperparameters and return it.

    Args:
        X_train: Training features, passed straight to ``LGBMRegressor.fit``.
        y_train: Training target values aligned with ``X_train``.

    Returns:
        The fitted ``lgb.LGBMRegressor`` instance.
    """
    # Hyperparameters are hard-coded; random_state is pinned so repeated runs
    # use the same seed.
    regressor = lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        learning_rate=0.05,
        n_estimators=500,
        num_leaves=31,
        random_state=42,
        n_jobs=-1,
    )
    regressor.fit(X_train, y_train)
    return regressor

# One independent model per output coordinate, both trained on the same
# training rows.
model_x = train_lgb_regressor(X_train, yx_train)
model_y = train_lgb_regressor(X_train, yy_train)

# -------------------------------
# 5b. Hold-out evaluation
# -------------------------------
# Report RMSE on the held-out 20% so model quality is visible before the
# scores are written out. np.sqrt(MSE) is used instead of the `squared=`
# keyword so this works across scikit-learn versions.
rmse_x = float(np.sqrt(mean_squared_error(yx_test, model_x.predict(X_test))))
rmse_y = float(np.sqrt(mean_squared_error(yy_test, model_y.predict(X_test))))
print(f"Hold-out RMSE -> x: {rmse_x:.4f}, y: {rmse_y:.4f}")

# -------------------------------
# 6. Predict on entire dataset
# -------------------------------
# NOTE(review): X contains the training rows too, so for those rows the
# predictions (and the scores derived from them) are in-sample. Use
# out-of-fold predictions if unbiased scores are required.
# `assign` returns a new frame, which avoids SettingWithCopyWarning when
# df_ml was derived from another DataFrame.
df_ml = df_ml.assign(
    pred_x=model_x.predict(X),
    pred_y=model_y.predict(X),
)

# -------------------------------
# 7. Compute PTMIS_score
# -------------------------------
# Euclidean distance between the predicted and the actual (x, y) output.
df_ml['PTMIS_score'] = np.sqrt(
    (df_ml['pred_x'] - df_ml['x_output'])**2 +
    (df_ml['pred_y'] - df_ml['y_output'])**2
)

# -------------------------------
# 8. Keep only necessary columns for ptmis_ml_output.csv
# -------------------------------
ptmis_ml_output = df_ml[['game_id','play_id','nfl_id','PTMIS_score']].copy()

# -------------------------------
# 9. Save CSV
# -------------------------------
ptmis_ml_output.to_csv("ptmis_ml_output.csv", index=False)
# The check mark was mojibake ("âœ…", UTF-8 bytes decoded as cp1252); restored.
print("✅ ptmis_ml_output.csv berhasil dibuat dengan kolom:")
print(ptmis_ml_output.columns.tolist())
print(f"Jumlah baris: {ptmis_ml_output.shape[0]}")
