# Kaggle Titanic Baseline

This notebook mirrors the baseline pipeline in `src/train.py` and produces a `submission.csv` file for the Kaggle Titanic competition.

**Before you start:** place `train.csv` and `test.csv` inside the `data/` directory.

In [None]:
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

## Load the data

In [None]:
# Both Kaggle CSVs are expected under data/, relative to the working directory.
data_dir = Path("data")
train_path, test_path = (data_dir / name for name in ("train.csv", "test.csv"))

# Check for the files before reading them so that a missing one is reported
# with a hint about where to get it. The training file is checked first.
for _csv_path, _hint in (
    (train_path, "Missing data/train.csv. Download it from Kaggle."),
    (test_path, "Missing data/test.csv. Download it from Kaggle."),
):
    if not _csv_path.exists():
        raise FileNotFoundError(_hint)

train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# Preview the first rows of the training data as the cell output.
train_df.head()

## Build the preprocessing + model pipeline

In [None]:
# Columns selected from the raw CSVs and handed to the pipeline.
feature_cols = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# The same columns, grouped by the preprocessing each group receives.
categorical_features = ["Sex", "Embarked"]
numeric_features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" keeps prediction from failing on a
# category that was not present when the encoder was fitted.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Baseline classifier; max_iter is set explicitly to 1000.
model = LogisticRegression(max_iter=1000)

# Full estimator: preprocessing followed by the classifier.
pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

## Train the model

In [None]:
# Take the same feature columns from the training and the test frame.
x_train, x_test = (frame[feature_cols] for frame in (train_df, test_df))

# Target column, present only in the training data.
y_train = train_df["Survived"]

# Fit the preprocessing steps and the classifier together on the training set.
pipeline.fit(x_train, y_train)

## Generate a submission file

In [None]:
# Predict a Survived label for every row of the test features.
predictions = pipeline.predict(x_test)

# Two-column frame: PassengerId from the test data plus the predicted label.
submission = test_df[["PassengerId"]].assign(Survived=predictions)

# Write the result to output/submission.csv, creating the directory if needed.
output_dir = Path("output")
output_path = output_dir / "submission.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)

# Show where the file was written as the cell output.
output_path