# In [None]:
# ----------------------------------------
# 1. Import libraries
# ----------------------------------------
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ----------------------------------------
# 2. Load dataset
# ----------------------------------------
# Only the columns used further down are read: identifiers / player metadata,
# the six model inputs, and the two regression targets.
usecols = [
    'game_id', 'play_id', 'nfl_id',
    'player_name', 'player_position', 'possession_team',
    'x_input', 'y_input', 's', 's_rolling_3', 'a', 'dir',
    'x_output', 'y_output',
]

df = pd.read_csv(
    "/kaggle/input/nfl-big-data-bowl-2026-analytics-1/merged_cleaned_features.csv",
    usecols=usecols,
    low_memory=False,
)

# ----------------------------------------
# 3. Sampling
# ----------------------------------------
# Restrict the frame to the first 100 distinct game_id values
# (unique() returns them in order of first appearance).
sample_games = pd.unique(df['game_id'])[:100]
df = df.loc[df['game_id'].isin(sample_games)]

# ----------------------------------------
# 4. Type data
# ----------------------------------------
# Down-cast every numeric input/target column to float32.
float_cols = ['x_input', 'y_input', 's', 's_rolling_3', 'a', 'dir', 'x_output', 'y_output']
df = df.astype({col: 'float32' for col in float_cols})

# ----------------------------------------
# 5. Feature & target
# ----------------------------------------
# One shared feature matrix; x and y outputs are modelled as separate targets.
features = ['x_input', 'y_input', 's', 's_rolling_3', 'a', 'dir']
target_x, target_y = 'x_output', 'y_output'

X = df.loc[:, features]
y_x, y_y = df[target_x], df[target_y]

# ----------------------------------------
# 6. Train-test split (80/20, grouped by game)
# ----------------------------------------
# The rows appear to be per-frame tracking records (the scoring step below is
# labelled "per frame"), so a row-level random split would place rows of the
# same game/play in both partitions and make any hold-out evaluation
# optimistic. Splitting on game_id keeps every game entirely on one side.
# NOTE: test_size is the fraction of *games* held out, so the row ratio is
# only approximately 80/20.
# train_test_split stays imported in case a later cell still uses it.
from sklearn.model_selection import GroupShuffleSplit, train_test_split

_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional indices, hence .iloc (not .loc) below.
_train_idx, _test_idx = next(_splitter.split(X, groups=df['game_id']))

X_train, X_test = X.iloc[_train_idx], X.iloc[_test_idx]
yx_train, yx_test = y_x.iloc[_train_idx], y_x.iloc[_test_idx]
yy_train, yy_test = y_y.iloc[_train_idx], y_y.iloc[_test_idx]

# ----------------------------------------
# 7. Train LightGBM Regressor
# ----------------------------------------
def train_lgb_regressor(X_train, y_train):
    """Fit a LightGBM regressor with this script's fixed hyper-parameters.

    Args:
        X_train: Feature matrix handed to ``LGBMRegressor.fit``.
        y_train: Target values aligned with the rows of ``X_train``.

    Returns:
        The fitted ``lgb.LGBMRegressor`` instance.
    """
    # Settings are fixed (including the seed) so both coordinate models are
    # built identically and reproducibly.
    regressor = lgb.LGBMRegressor(
        objective='regression',
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
    )
    regressor.fit(X_train, y_train)
    return regressor

# One independent model per output coordinate, both fitted on the same
# training features (x model first, then y model).
model_x, model_y = (
    train_lgb_regressor(X_train, target) for target in (yx_train, yy_train)
)

# ----------------------------------------
# 8. Predict PTMIS per frame
# ----------------------------------------
# NOTE(review): predictions cover every row of X, including the rows the
# models were fitted on, so most scores are in-sample residuals — confirm
# this is intended.
pred_x, pred_y = model_x.predict(X), model_y.predict(X)

# Score = Euclidean distance between the predicted and the recorded
# (x_output, y_output) position of each row.
err_x = pred_x - df['x_output']
err_y = pred_y - df['y_output']
df['PTMIS_score'] = np.sqrt(err_x ** 2 + err_y ** 2)

# ----------------------------------------
# 9. Save ML-Enhanced PTMIS
# ----------------------------------------
# Write the identifier columns plus the score to a CSV in the working
# directory (the DataFrame index is not written).
# NOTE(review): no frame identifier is loaded or written, so several rows may
# share the same (game_id, play_id, nfl_id) key — verify downstream usage.
output_file = "ptmis_ml_output.csv"
df_ml_output = df.loc[:, ['game_id', 'play_id', 'nfl_id', 'PTMIS_score']]
df_ml_output.to_csv(output_file, index=False)
print(f"ML-Enhanced PTMIS saved to {output_file}")

# ----------------------------------------
# 10. Interpretable Dashboard
# ----------------------------------------
# All figures are written below ./plots; create it if it is missing.
os.makedirs("plots", exist_ok=True)

# Histogram PTMIS: overall distribution of the score in 50 bins.
fig_hist, ax_hist = plt.subplots(figsize=(8, 5))
sns.histplot(df_ml_output['PTMIS_score'], bins=50, ax=ax_hist)
ax_hist.set_title("Distribution of PTMIS Score")
ax_hist.set_xlabel("PTMIS Score")
ax_hist.set_ylabel("Count")
fig_hist.tight_layout()
fig_hist.savefig("plots/ptmis_histogram.png")
plt.show()

# Boxplot PTMIS by Position: one box per player_position value.
df_sample_positions = df[['player_position', 'PTMIS_score']].copy()
fig_box, ax_box = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df_sample_positions,
    x='player_position',
    y='PTMIS_score',
    ax=ax_box,
)
# Tilt the position labels (applies to the current axes, i.e. ax_box).
plt.xticks(rotation=45)
ax_box.set_title("PTMIS Score by Player Position")
fig_box.tight_layout()
fig_box.savefig("plots/ptmis_by_position.png")
plt.show()

# Trajectory Plot
# The sample play is the one on the first row of df. play_id is presumably
# only unique within a game (the saved output is keyed on game_id + play_id +
# nfl_id), so the play is selected by BOTH ids; filtering on play_id alone
# would draw plays from other games that reuse the same number.
sample_game = df['game_id'].iloc[0]
sample_play = df['play_id'].iloc[0]
traj = df[(df['game_id'] == sample_game) & (df['play_id'] == sample_play)]
plt.figure(figsize=(8,6))
# One line per player; points are connected in the row order of df.
for _, group in traj.groupby('nfl_id'):
    plt.plot(group['x_input'], group['y_input'], alpha=0.7)
plt.title(f"Player Trajectories (Play ID={sample_play})")
plt.xlabel("X Position")
plt.ylabel("Y Position")
plt.tight_layout()
plt.savefig("plots/trajectory_play.png")
plt.show()

print("Plots saved in folder: plots")
