# In [None]:
import pandas as pd
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt

# -------------------------------
# 1. Paths (adjust according to Kaggle)
# -------------------------------
# Both inputs live under the same competition folder, so spell the shared
# prefix once and derive the two locations from it.
_competition_root = (
    "/kaggle/input/nfl-big-data-bowl-2026-analytics/"
    "114239_nfl_competition_files_published_analytics_final"
)
train_path = _competition_root + "/train"
supp_path = _competition_root + "/supplementary_data.csv"

# Figures are written here; create the folder up front (no error if present).
output_plot_dir = "/kaggle/working/plots"
os.makedirs(output_plot_dir, exist_ok=True)

# -------------------------------
# 2. Helper Function
# -------------------------------
def extract_week(file_path):
    """Extract the week number from a weekly file name.

    The week is the run of digits after the *last* ``_w`` in the file name,
    with the extension removed, e.g. ``input_2023_w01.csv`` -> ``1``.

    Args:
        file_path: Path (or bare file name) of a weekly CSV file.

    Returns:
        The week number as an ``int``.

    Raises:
        ValueError: If the file name has no ``_w<digits>`` suffix.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    # rpartition anchors on the last "_w", so an earlier "_w" in the name
    # (e.g. "my_week_w03.csv") cannot be mistaken for the week marker.
    _prefix, marker, week_token = stem.rpartition('_w')
    if not marker:
        raise ValueError(f"No '_w<week>' marker in file name: {file_path!r}")
    try:
        return int(week_token)
    except ValueError:
        raise ValueError(
            f"Week marker is not a number in file name: {file_path!r}"
        ) from None

# -------------------------------
# 3. Combine all weekly input/output files
# -------------------------------
def _weekly_csvs(kind):
    """Return the sorted ``<kind>_2023_w*.csv`` paths found in ``train_path``.

    Raises ValueError when nothing matches, so a wrong path fails early.
    """
    matches = sorted(glob.glob(os.path.join(train_path, f"{kind}_2023_w*.csv")))
    if not matches:
        raise ValueError(f"No {kind} files found in the train folder!")
    return matches


def _stack_weeks(csv_paths):
    """Read each CSV, tag its rows with the week taken from the file name,
    and concatenate everything into one frame with a fresh index."""
    frames = []
    for csv_path in csv_paths:
        frames.append(pd.read_csv(csv_path).assign(week=extract_week(csv_path)))
    return pd.concat(frames, ignore_index=True)


# Input
input_files = _weekly_csvs("input")
df_input = _stack_weeks(input_files)

# Output
output_files = _weekly_csvs("output")
df_output = _stack_weeks(output_files)

# Supplementary Data
df_supp = pd.read_csv(supp_path, low_memory=False)

# -------------------------------
# 4. Merge Input + Output + Supplementary
# -------------------------------
# Attach output rows to input rows on the full per-frame key. Non-key columns
# present in both frames get the _input / _output suffixes (the x/y columns
# are used later as x_input / y_input).
frame_keys = ['game_id', 'play_id', 'nfl_id', 'frame_id', 'week']
df = df_input.merge(
    df_output,
    on=frame_keys,
    how='left',
    suffixes=('_input', '_output'),
)

# Add the supplementary columns by game/play. The explicit suffixes keep every
# left-hand column name unchanged: if the supplementary table also carries a
# column such as 'week', its copy becomes 'week_supp' instead of pandas
# renaming both sides to week_x / week_y. This guarantees 'week' is present
# and row-accurate, without re-deriving it from df_input by index alignment
# (which is only correct when the merges leave the row count untouched).
play_keys = ['game_id', 'play_id']
df = df.merge(df_supp, on=play_keys, how='left', suffixes=('', '_supp'))

print("Merged data shape:", df.shape)

# -------------------------------
# 5. Data Cleaning
# -------------------------------
# Drop rows that are exact duplicates across every column.
df = df.drop_duplicates()

# Missing values in numeric columns become 0.
# NOTE(review): this also zero-fills coordinates that had no match in the
# left merges — confirm that 0 is the intended placeholder for those.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(0)

# Replace inf/-inf with NaN to avoid seaborn warnings. A float NaN is used
# rather than pd.NA so affected columns stay numeric (replacing with pd.NA
# turns a float column into object dtype). This runs after the zero-fill, so
# infinite values end up missing, not 0.
df = df.replace([float('inf'), -float('inf')], float('nan'))

# -------------------------------
# 6. Feature Engineering (rolling speed)
# -------------------------------
# One group per player within a play; frames must be in order inside each
# group so the rolling window covers consecutive frames.
player_play_keys = ['game_id', 'play_id', 'nfl_id']
df.sort_values(player_play_keys + ['frame_id'], inplace=True)

# Ensure 's' column is numeric (unparseable values become 0)
df['s'] = pd.to_numeric(df['s'], errors='coerce').fillna(0)

# Rolling mean speed per player (3-frame window; shorter at the start of each
# group because min_periods=1). groupby().rolling() runs the window natively
# instead of calling a Python lambda once per group. The result is indexed by
# (group keys..., original row label); dropping the key levels leaves the row
# labels so the assignment aligns back onto df.
df['s_rolling_3'] = (
    df.groupby(player_play_keys)['s']
    .rolling(3, min_periods=1)
    .mean()
    .droplevel(player_play_keys)
)

# Distance to ball landing point: Euclidean distance between
# (x_input, y_input) and (ball_land_x, ball_land_y)
df['distance_to_ball'] = (
    (df['x_input'] - df['ball_land_x']) ** 2
    + (df['y_input'] - df['ball_land_y']) ** 2
) ** 0.5

# Score difference before play (home minus visitor)
df['score_difference'] = df['pre_snap_home_score'] - df['pre_snap_visitor_score']

# -------------------------------
# 7. EDA / Visualization & Save Plots
# -------------------------------

# 7a. Histogram of player speed
# Draw on an explicit figure/axes pair, save it, then release the figure.
fig_speed, ax_speed = plt.subplots(figsize=(10, 6))
speed_values = df['s'].dropna()
sns.histplot(speed_values, bins=30, kde=False, ax=ax_speed)
ax_speed.set_title("Player Speed Distribution")
ax_speed.set_xlabel("Speed (yards/frame)")
ax_speed.set_ylabel("Frequency")
fig_speed.tight_layout()
hist_speed_file = os.path.join(output_plot_dir, "hist_speed.png")
fig_speed.savefig(hist_speed_file, dpi=150)
plt.close(fig_speed)

# 7b. Scatter plot: player position vs distance to ball landing
# Keep only rows with everything the plot needs, then draw at most 5000 of
# them. The cap is taken from the filtered frame: sizing it from len(df)
# would make .sample() raise whenever dropna() leaves fewer rows than that.
plottable = df.dropna(subset=['x_input', 'y_input', 'distance_to_ball'])
sample_df = plottable.sample(min(5000, len(plottable)), random_state=42)
del plottable  # only the sample is needed; release the filtered copy

plt.figure(figsize=(10,8))
sns.scatterplot(x='x_input', y='y_input', hue='distance_to_ball', palette='viridis', data=sample_df, alpha=0.6, legend=False)
plt.title("Player Position vs Distance to Ball Landing (Sample 5000)")
plt.xlabel("X Position")
plt.ylabel("Y Position")
plt.tight_layout()
plt.savefig(os.path.join(output_plot_dir, "scatter_position_distance.png"), dpi=150)
plt.close()

# 7c. Average distance to ball per week
# Mean of distance_to_ball for each week, ignoring rows where it is missing.
has_distance = df['distance_to_ball'].notna()
mean_by_week = df.loc[has_distance].groupby('week')['distance_to_ball'].mean()
weekly_distance = mean_by_week.reset_index()

fig_week, ax_week = plt.subplots(figsize=(10, 6))
sns.lineplot(x='week', y='distance_to_ball', data=weekly_distance, marker='o', ax=ax_week)
ax_week.set_title("Average Player Distance to Ball Landing per Week")
ax_week.set_xlabel("Week")
ax_week.set_ylabel("Average Distance")
fig_week.tight_layout()
line_plot_file = os.path.join(output_plot_dir, "line_avg_distance_week.png")
fig_week.savefig(line_plot_file, dpi=150)
plt.close(fig_week)

print(f"Plots saved in folder: {output_plot_dir}")

# -------------------------------
# 8. Save cleaned dataset ready for modeling
# -------------------------------
# Written relative to the current working directory, without the index column.
cleaned_csv_name = "merged_cleaned_features.csv"
df.to_csv(cleaned_csv_name, index=False)
print(f"Dataset saved to '{cleaned_csv_name}'")
