# Import Libraries & Load Data

In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Step 2: Read the competition CSV files into DataFrames.
# NOTE(review): the training frame is read from "test_input.csv" -- confirm
# this is really the file you want to train on.
train = pd.read_csv(
    "/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv"
)   # rename your train file accordingly
test = pd.read_csv(
    "/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv"
)     # rename your test file accordingly

# Report (rows, columns) for both frames.
for label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(label, frame.shape)

# Preview the first rows of the training frame (rendered as the cell output).
train.head()

# Identify Target and ID Columns

In [None]:
# The target column is chosen by hand; only the ID column is auto-detected.
import re

target_col = "game_id"   # change if your target is different
if target_col not in train.columns:
    # Fail here with a clear message instead of later, when the target is sliced out.
    raise KeyError(f"Target column {target_col!r} not found in the training data")


def _is_id_like(name):
    """Return True when "id" appears as a whole word in a column name.

    The name is split on non-alphanumeric characters and camelCase
    boundaries, so "game_id", "PassengerId" and "userID" match while
    "width", "player_side" or "VALID" (which merely contain the letters
    "id") do not.
    """
    words = re.findall(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|\d+", str(name))
    return any(word.lower() == "id" for word in words)


# First ID-like column that is NOT the target. Without the exclusion the
# target itself (e.g. "game_id") would be reported as the ID column and no
# real ID column would ever be dropped from the features.
id_col = next(
    (c for c in train.columns if c != target_col and _is_id_like(c)),
    None,
)

print("Target column:", target_col)
print("ID column:", id_col)

# Separate Features and Target

In [None]:
# Pull the target out as y; everything else (minus the ID column, if one was
# found and is distinct from the target) becomes the feature matrix X.
y = train[target_col]

columns_to_remove = [target_col]
if id_col and id_col != target_col and id_col in train.columns:
    columns_to_remove.append(id_col)
X = train.drop(columns=columns_to_remove)

for label, obj in (("X shape:", X), ("y shape:", y)):
    print(label, obj.shape)

# Handle Missing Values and Encode Categorical Columns

In [None]:
# Combine train & test for consistent preprocessing.
# The test frame is first aligned to the training feature columns: columns
# that exist only in test (e.g. the target or an ID column) are discarded,
# and feature columns missing from test are created as NaN and imputed below.
# Without this, concat() would take the union of columns and the model would
# be trained on columns that are not real training features.
test_features = test.reindex(columns=X.columns)
combined = pd.concat([X, test_features], axis=0, ignore_index=True)

# Impute missing values and label-encode every non-numeric column.
for col in combined.columns:
    if pd.api.types.is_numeric_dtype(combined[col]):
        fill_value = combined[col].median()
        if pd.isna(fill_value):
            # Column is entirely missing: the median is NaN, so fall back to 0
            # rather than leaving NaNs in the model input.
            fill_value = 0
        combined[col] = combined[col].fillna(fill_value)
    else:
        # Cast to object first so the placeholder can be inserted even for
        # categorical columns, whose fillna() rejects unseen values.
        filled = combined[col].astype(object).fillna("___MISSING___")
        le = LabelEncoder()
        combined[col] = le.fit_transform(filled.astype(str))

# Split processed data back: the first len(X) rows are train, the rest test.
X_processed = combined.iloc[:len(X)]
X_test_processed = combined.iloc[len(X):]

# Train/Test Split for Validation

In [None]:
# Split training data into train and validation sets (80% / 20%).
# Stratification is requested only when scikit-learn can honour it: there must
# be more than one class, every class needs at least 2 samples, and both
# partitions must be able to hold at least one sample per class. Otherwise
# train_test_split raises ValueError, so we fall back to a plain random split.
VAL_FRACTION = 0.2
class_counts = y.value_counts(dropna=False)
n_val = int(np.ceil(VAL_FRACTION * len(y)))   # validation size as sklearn computes it
n_fit = len(y) - n_val
can_stratify = (
    len(class_counts) > 1
    and class_counts.min() >= 2
    and len(class_counts) <= min(n_val, n_fit)
)
if len(class_counts) > 1 and not can_stratify:
    print("Note: target classes are too sparse to stratify; using a random split.")

X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y,
    test_size=VAL_FRACTION,
    random_state=42,
    stratify=y if can_stratify else None,
)

print("Train size:", X_train.shape)
print("Validation size:", X_val.shape)

# Train Simple Random Forest Model

In [None]:
# Fit a random forest on the training split, then score it on the held-out
# validation split.
rf_params = {
    "n_estimators": 200,
    "random_state": 42,   # fixed seed for reproducible results
    "n_jobs": -1,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Validation accuracy: fraction of validation rows predicted correctly.
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", acc)

# Final Model Prediction on Test Data

In [None]:
# Run the fitted model on the processed test rows.
preds = model.predict(X_test_processed)

# Build the submission table: a running 0..n-1 row id next to each prediction.
row_ids = np.arange(len(preds))   # or replace with your real test IDs if available
submission = pd.DataFrame({"id": row_ids, "target": preds})

# Write the table without the DataFrame index, then preview it.
submission.to_csv("submission.csv", index=False)
print("Submission file saved successfully!")
submission.head()