In [1]:
# STEP 5 — Submission File Creation
import pandas as pd
from pathlib import Path
import joblib

# --- 1) Define paths ---
# Paths are relative to the notebook's working directory.
PROC_DIR = Path("../data/processed")
OUT_DIR = Path("../outputs/submissions")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- 2) Load processed test set and saved model ---
test_df = pd.read_csv(PROC_DIR / "test_processed.csv")
model_path = Path("../outputs/random_forest_model.pkl")

if not model_path.exists():
    raise FileNotFoundError(
        f"❌ Model not found at {model_path}. Please run Step 3 (training) first."
    )

# NOTE: joblib.load unpickles arbitrary objects; only load model files that
# were produced by this project's own training step.
model = joblib.load(model_path)
print(f"✅ Loaded model from: {model_path}")

# --- 3) Predict on test data ---
# Align the test columns with the columns the model was fitted on.
# If the loaded model exposes `feature_names_in_` (scikit-learn estimators
# fitted on a DataFrame do), select exactly those columns in that order so
# extra columns or a different column order cannot corrupt the predictions.
# Otherwise fall back to passing the processed frame through unchanged.
test_features = test_df.copy()

expected_features = getattr(model, "feature_names_in_", None)
if expected_features is not None:
    expected_features = list(expected_features)
    missing_features = [
        col for col in expected_features if col not in test_features.columns
    ]
    if missing_features:
        raise ValueError(
            f"❌ Processed test set is missing features the model was trained on: "
            f"{missing_features}. Re-run preprocessing so train and test share "
            f"the same columns."
        )
    test_features = test_features[expected_features]

predictions = model.predict(test_features)
print(f"✅ Predictions generated for {len(predictions)} passengers")

# --- 4) Prepare submission DataFrame ---
# Always pull PassengerId from the *raw* test file to preserve order
raw_test = pd.read_csv("../data/raw/test.csv")

# The pairing below is positional: row i of the raw file is matched with
# prediction i. Fail early with a clear message if the row counts differ,
# which would mean preprocessing dropped or duplicated rows.
if len(raw_test) != len(predictions):
    raise ValueError(
        f"❌ Row count mismatch: raw test set has {len(raw_test)} rows but "
        f"{len(predictions)} predictions were generated."
    )

submission = pd.DataFrame({
    "PassengerId": raw_test["PassengerId"],
    "Survived": predictions.astype(int)
})

# --- 5) Save submission file ---
submission_path = OUT_DIR / "titanic_submission.csv"
submission.to_csv(submission_path, index=False)

print(f"🚀 Submission file created: {submission_path.resolve()}")
print(submission.head())


✅ Loaded model from: ..\outputs\random_forest_model.pkl
✅ Predictions generated for 418 passengers
🚀 Submission file created: C:\Users\nicho\Documents\Kaggle\titanic\outputs\submissions\titanic_submission.csv
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         0
