# Titanic – Clean End‑to‑End ML Workflow  
  
This notebook rebuilds the **entire** Titanic Kaggle pipeline from scratch:

1. **Load data**  
2. **Pre‑processing / feature engineering** (wrapped in a reusable class)  
3. **Model training + 5‑fold cross‑validation**  
4. **Fit final model and generate `submission.csv`**  

All steps are fully reproducible and aligned between train & test – no missing columns, no dtype errors.

In [None]:
import pandas as pd
import numpy as np
import re, warnings, os
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

# File names of the training and test CSVs.
TRAIN_CSV = 'train.csv'
TEST_CSV  = 'test.csv'

# Read both splits in one pass, in (train, test) order.
train_raw, test_raw = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV)
)

print(f'train shape: {train_raw.shape} | test shape: {test_raw.shape}')


## Pre‑processing helper

In [None]:
class TitanicPreprocessor:
    """Fit once on train, apply to train & test, ensures aligned features.

    Every data-dependent encoding (medians, bin edges, frequent decks, top
    ticket prefixes, embarked categories, title codes) is learned in ``fit``
    and only *applied* in ``transform``, so the same raw value always maps to
    the same feature value regardless of which frame it appears in.

    Parameters
    ----------
    n_ticket_prefix : int, default 3
        Number of most frequent ticket prefixes that get their own one-hot
        column; all remaining prefixes share ``TktPre_Other``.

    Attributes set by ``fit``
    -------------------------
    age_median, fare_median : fill values for missing Age / Fare.
    age_bins, fare_bins     : bin edges; only the interior edges split values,
                              so out-of-range inputs land in the first/last bin.
    ticket_num_bins_        : decile edges of the numeric ticket part.
    top_prefixes            : most frequent ticket prefixes in the fit data.
    frequent_decks_         : decks with at least ``MIN_DECK_COUNT`` rows.
    embarked_mode           : most frequent port, used to fill missing values.
    embarked_categories_    : ports seen during fit (sorted).
    title_map_              : title -> integer code, always contains 'Rare'.
    feature_columns_        : locked output column order.
    """

    # Decks seen fewer times than this in the fit data are pooled into 'O'.
    MIN_DECK_COUNT = 10

    def __init__(self, n_ticket_prefix=3):
        self.age_median = None
        self.age_bins = None
        self.fare_median = None
        self.fare_bins = None
        self.top_prefixes = None
        self.deck_categories = list('ABCDEF') + ['O', 'U']
        self.embarked_mode = None
        self.n_ticket_prefix = n_ticket_prefix
        self.feature_columns_ = None
        # Learned encodings (populated by fit).
        self.ticket_num_bins_ = None
        self.frequent_decks_ = None
        self.embarked_categories_ = None
        self.title_map_ = None

    # ------------------------------------------------------------------
    def _extract_title(self, s):
        """Return the honorific between the comma and the first period, else 'Rare'."""
        if not isinstance(s, str):
            # Missing names arrive as NaN (a float); treat them as unknown.
            return 'Rare'
        match = re.search(r',\s*([^\.]+)\.', s)
        return match.group(1).strip() if match else 'Rare'

    # ------------------------------------------------------------------
    @staticmethod
    def _split_ticket(ticket):
        """Split raw ticket strings into (prefix, numeric part) Series."""
        parts = ticket.fillna('').str.split()
        prefix = parts.str[:-1].str.join(' ')
        last = parts.str[-1].fillna('')
        # isdecimal() only accepts characters that int() can parse.
        number = last.where(last.str.isdecimal(), '0').astype(int)
        return prefix, number

    # ------------------------------------------------------------------
    @staticmethod
    def _bin(values, edges):
        """Return the 0-based, left-closed bin index of each value.

        Only the interior edges are used, so values below the first or above
        the last edge fall into the outermost bins instead of becoming NaN,
        and duplicated edges are harmless.
        """
        interior = np.asarray(list(edges)[1:-1], dtype=float)
        return np.searchsorted(interior, values.to_numpy(dtype=float), side='right')

    # ------------------------------------------------------------------
    def fit(self, df: pd.DataFrame):
        """Learn all encodings from ``df`` and lock the output column order.

        Returns ``self`` so the call can be chained.
        """
        # --- Age ---
        self.age_median = df['Age'].median()
        self.age_bins = [0, 12, 20, 40, 60, 80, np.inf]

        # --- Fare: quintile edges, open-ended at both sides ---
        self.fare_median = df['Fare'].median()
        fare_inner = df['Fare'].quantile([.2, .4, .6, .8]).tolist()
        self.fare_bins = [-np.inf, *fare_inner, np.inf]

        # --- Ticket prefixes and ticket-number deciles ---
        prefix, number = self._split_ticket(df['Ticket'])
        self.top_prefixes = (
            prefix.value_counts().nlargest(self.n_ticket_prefix).index.tolist()
        )
        number_inner = number.quantile(np.linspace(0.1, 0.9, 9)).tolist()
        self.ticket_num_bins_ = [-np.inf, *number_inner, np.inf]

        # --- Decks frequent enough to keep their own column ---
        deck_counts = df['Cabin'].fillna('U').str[0].value_counts()
        self.frequent_decks_ = [
            d for d in self.deck_categories
            if d not in ('O', 'U') and deck_counts.get(d, 0) >= self.MIN_DECK_COUNT
        ]

        # --- Embarked ---
        modes = df['Embarked'].mode()
        self.embarked_mode = modes.iloc[0] if not modes.empty else None
        self.embarked_categories_ = sorted(df['Embarked'].dropna().unique())

        # --- Title codes: order of first appearance, 'Rare' is the fallback ---
        titles = df['Name'].apply(self._extract_title)
        self.title_map_ = {t: i for i, t in enumerate(titles.unique())}
        self.title_map_.setdefault('Rare', len(self.title_map_))

        # generate train feature matrix to lock column order
        X = self.transform(df, fit_stage=True)
        self.feature_columns_ = X.columns.tolist()
        return self

    # ------------------------------------------------------------------
    def transform(self, df: pd.DataFrame, fit_stage=False):
        """Build the feature matrix for ``df`` using the fitted encodings.

        Parameters
        ----------
        df : raw passenger frame with the Kaggle Titanic columns.
        fit_stage : when True (used internally by ``fit``) the final column
            alignment against ``feature_columns_`` is skipped.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.title_map_ is None:
            raise RuntimeError('TitanicPreprocessor.transform called before fit')

        out = pd.DataFrame(index=df.index)

        # -------- Basic numeric/boolean ----------
        out['Pclass'] = df['Pclass']
        out['Sex']    = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
        out['SibSp']  = df['SibSp']
        out['Parch']  = df['Parch']

        # -------- Family features -------------
        fam_size = df['SibSp'] + df['Parch'] + 1
        out['FamilySize'] = fam_size
        out['IsAlone']    = (fam_size == 1).astype(int)

        # -------- Age -------------
        age = df['Age'].fillna(self.age_median)
        out['AgeBin'] = self._bin(age, self.age_bins)

        # -------- Fare ------------
        fare = df['Fare'].fillna(self.fare_median)
        out['FareBin'] = self._bin(fare, self.fare_bins)

        # -------- Cabin / Deck ------------
        out['HasCabin'] = df['Cabin'].notna().astype(int)
        deck = df['Cabin'].fillna('U').str[0]
        # Anything that was not frequent in the fit data is pooled into 'O'.
        keep = set(self.frequent_decks_) | {'U'}
        deck = deck.where(deck.isin(keep), 'O')

        deck_dummies = (
            pd.get_dummies(deck)
            .reindex(columns=self.deck_categories, fill_value=0)
            .astype(int)
        )
        deck_dummies.columns = [f'Deck_{c}' for c in deck_dummies.columns]
        out = pd.concat([out, deck_dummies], axis=1)

        # -------- Ticket features -----------
        ticket = df['Ticket'].fillna('')
        prefix, number = self._split_ticket(df['Ticket'])

        out['HasTicketPrefix'] = (prefix != '').astype(int)
        out['TicketNumber']    = number
        # Ticket num bin, using the decile edges learned in fit
        out['TicketNum_qbin']  = self._bin(number, self.ticket_num_bins_)
        # Group size. NOTE: counted within the frame passed in, not across
        # train and test together.
        group_size = ticket.map(ticket.value_counts())
        out['TicketGroupSize'] = group_size
        out['IsGroupTicket']   = (group_size > 1).astype(int)

        # top-N prefix one-hots, with a fixed column set
        prefix_reduced = prefix.where(prefix.isin(self.top_prefixes), 'Other')
        prefix_categories = list(dict.fromkeys([*self.top_prefixes, 'Other']))
        prefix_dummies = (
            pd.get_dummies(prefix_reduced, prefix='TktPre')
            .reindex(columns=[f'TktPre_{p}' for p in prefix_categories], fill_value=0)
            .astype(int)
        )
        out = pd.concat([out, prefix_dummies], axis=1)

        # -------- Embarked -----------
        embarked = df['Embarked']
        if self.embarked_mode is not None:
            embarked = embarked.fillna(self.embarked_mode)
        embarked_dum = (
            pd.get_dummies(embarked, prefix='Emb')
            .reindex(columns=[f'Emb_{c}' for c in self.embarked_categories_], fill_value=0)
            .astype(int)
        )
        out = pd.concat([out, embarked_dum], axis=1)

        # -------- Title ----------
        title = df['Name'].apply(self._extract_title)
        # Titles never seen during fit share the 'Rare' code.
        out['Title'] = (
            title.map(self.title_map_).fillna(self.title_map_['Rare']).astype(int)
        )

        # -------- Align columns order during inference -------------
        if not fit_stage:
            out = out.reindex(columns=self.feature_columns_, fill_value=0)
        return out


## Fit pre‑processor on train and transform train/test

In [None]:
# Learn the preprocessing state from the training frame (fit returns the instance).
prep = TitanicPreprocessor().fit(train_raw)

# Target column, then the two feature matrices built by the same fitted object.
y_train = train_raw['Survived']
X_train, X_test = (prep.transform(frame) for frame in (train_raw, test_raw))

print('Feature matrix shape:', X_train.shape)


## 5‑fold Cross‑Validation accuracy

In [None]:
# Hyper-parameters passed to the XGBoost classifier.
xgb_params = {
    'random_state': 42,
    'eval_metric': 'logloss',
    'n_estimators': 300,
    'max_depth': 4,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
model = XGBClassifier(**xgb_params)

# Shuffled, stratified 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
print(f'5‑fold CV accuracy: {scores.mean():.4f} ± {scores.std():.4f}')


## Train final model on full train data

In [None]:
# Fit the classifier on the complete training feature matrix and labels.
model.fit(X_train, y_train)


## Predict test set & build `submission.csv`

In [None]:
# Store the integer class predictions on the test frame, next to PassengerId.
test_raw['Survived'] = model.predict(X_test).astype(int)

# Keep only the two columns written to the submission file.
submission = test_raw.loc[:, ['PassengerId', 'Survived']]
submission.to_csv(path_or_buf='submission.csv', index=False)
print('Saved submission.csv with shape:', submission.shape)
