# Assignment 1 — Titanic (Binary Classification)

- **Dataset**: Kaggle *Titanic — Machine Learning from Disaster*
- **Target**: `Survived` (0 = did not survive, 1 = survived)
- **Goal metrics**: Accuracy, F1 (macro), ROC-AUC

Tips:
- For the baseline, drop the identifier-like, high-cardinality columns (`PassengerId`, `Name`, `Ticket`) as well as the sparsely populated `Cabin` column.
- Keep the workflow simple and reproducible; set a fixed `random_state`.


In [None]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
# Single seed passed to every split and model below so runs are repeatable.
RANDOM_STATE = 42

# Output folders (relative to the working directory): figures are saved under
# ARTIFACTS, score tables under RESULTS.
ARTIFACTS = Path('../artifacts')
RESULTS = Path('../results')
for _out_dir in (ARTIFACTS, RESULTS):
    # parents/exist_ok make this cell safe to re-run.
    _out_dir.mkdir(parents=True, exist_ok=True)

print('Setup complete')


## 1) Load Data
**Option A (manual upload)**: In Colab → `Files` panel → Upload `train.csv`.

**Option B (Kaggle API)**:
```
!pip install kaggle -q
from google.colab import files; files.upload()   # upload kaggle.json (do not commit it)
!mkdir -p ~/.kaggle && cp kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c titanic -p /content
!unzip -o /content/titanic.zip -d /content
```


In [None]:
# Location of the training CSV; the default assumes `train.csv` was uploaded
# to /content.
_train_csv = '/content/train.csv'  # adjust path if needed
df = pd.read_csv(_train_csv)

# Preview the first rows (shown as the cell output).
df.head()


## 2) Basic EDA

In [None]:
# Table dimensions and per-column dtypes.
print(df.shape)
print(df.dtypes)

# The ten columns with the most missing values, largest count first.
_missing_counts = df.isna().sum()
print(_missing_counts.sort_values(ascending=False).head(10))

# Relative frequency of each target class, drawn as a bar chart.
_target_share = df['Survived'].value_counts(normalize=True)
_target_share.plot(kind='bar', title='Target Balance')
plt.show()


## 3) Train/Dev/Test Split (with simple cleaning)

In [None]:
from sklearn.model_selection import train_test_split

TARGET = 'Survived'

# Features are every column except the label and the four columns that the
# baseline leaves out.
_excluded = [TARGET, 'PassengerId', 'Name', 'Ticket', 'Cabin']
X = df.drop(columns=_excluded)
y = df[TARGET]

# Step 1: hold out 20% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Step 2: cut the hold-out in half to get dev and test (roughly 80/10/10 overall).
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

# Shapes of the three partitions (shown as the cell output).
X_train.shape, X_dev.shape, X_test.shape


## 4) Preprocessing (ColumnTransformer) + Models

In [None]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Split the feature columns into two complementary groups. Selecting by the
# generic 'number' kind (instead of listing int64/float64 and object/category/
# bool explicitly) guarantees every column lands in exactly one group; a column
# matched by neither list would be dropped by ColumnTransformer's default
# remainder='drop'.
num_cols = X_train.select_dtypes(include='number').columns
cat_cols = X_train.select_dtypes(exclude='number').columns

# Numeric features: fill gaps with the median, then scale to unit variance
# (with_mean=False, so values are not centred).
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Unfitted template; each model pipeline below receives its own copy.
preprocess = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

# Pipeline does not copy its steps, so passing `preprocess` itself to both
# pipelines would make them share one transformer object: fitting one model
# would refit the preprocessing used by the other. clone() gives each pipeline
# an independent, unfitted copy with the same parameters.
log_reg = Pipeline([
    ('prep', clone(preprocess)),
    ('model', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)),
])
rf_clf = Pipeline([
    ('prep', clone(preprocess)),
    ('model', RandomForestClassifier(n_estimators=400, random_state=RANDOM_STATE)),
])

# Show both pipelines as the cell output.
log_reg, rf_clf


## 5) Evaluate on Dev Set

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, ConfusionMatrixDisplay, RocCurveDisplay

# Fit every candidate on the training split and score it on the dev split.
models = {'log_reg': log_reg, 'rf': rf_clf}
scores = {}
for name, mdl in models.items():
    mdl.fit(X_train, y_train)
    dev_pred = mdl.predict(X_dev)

    # ROC-AUC needs positive-class probabilities; record NaN when the model
    # cannot provide them.
    if hasattr(mdl, 'predict_proba'):
        dev_proba = mdl.predict_proba(X_dev)[:, 1]
        roc = roc_auc_score(y_dev, dev_proba)
    else:
        dev_proba = None
        roc = float('nan')

    scores[name] = {
        'accuracy': accuracy_score(y_dev, dev_pred),
        'f1_macro': f1_score(y_dev, dev_pred, average='macro'),
        'roc_auc': roc,
    }
    print(name, scores[name])

# choose best by F1(macro)
best_name, _ = max(scores.items(), key=lambda item: item[1]['f1_macro'])
best_model = models[best_name]
print('Best model (dev):', best_name, scores[best_name])

# Confusion matrix on dev
ConfusionMatrixDisplay.from_estimator(best_model, X_dev, y_dev)
plt.title(f'Confusion Matrix (dev) — {best_name}')
plt.tight_layout()
plt.savefig(ARTIFACTS / 'confusion_matrix_dev.png')
plt.show()

# ROC curve on dev, only when the winner exposes probabilities.
if hasattr(best_model, 'predict_proba'):
    RocCurveDisplay.from_estimator(best_model, X_dev, y_dev)
    plt.title(f'ROC Curve (dev) — {best_name}')
    plt.tight_layout()
    plt.savefig(ARTIFACTS / 'roc_curve_dev.png')
    plt.show()

# One row per model, one column per metric.
dev_scores = pd.DataFrame(scores).T
dev_scores.to_csv(RESULTS / 'dev_scores.csv')


## 6) Retrain on Train+Dev and Evaluate on Test

In [None]:
# Stack train and dev rows so the selected model is refit on all non-test data.
X_trd = pd.concat([X_train, X_dev], axis=0)
y_trd = pd.concat([y_train, y_dev], axis=0)
best_model.fit(X_trd, y_trd)

# Score the refit model on the held-out test split.
test_pred = best_model.predict(X_test)
if hasattr(best_model, 'predict_proba'):
    test_proba = best_model.predict_proba(X_test)[:, 1]
    roc = roc_auc_score(y_test, test_proba)
else:
    # No probabilities available: ROC-AUC is recorded as NaN.
    test_proba = None
    roc = float('nan')
acc = accuracy_score(y_test, test_pred)
f1m = f1_score(y_test, test_pred, average='macro')

# Single-row summary, printed and saved next to the dev scores.
summary = {
    'best_model': best_name,
    'test_accuracy': acc,
    'test_f1_macro': f1m,
    'test_roc_auc': roc,
}
print(summary)
summary_frame = pd.DataFrame([summary])
summary_frame.to_csv(RESULTS / 'test_summary.csv', index=False)


## 7) Wrap-up Notes for README

In [None]:
# Each tuple holds the arguments of one print() call; only the "best model"
# line depends on the run.
_readme_lines = [
    ('Suggested README bullets:',),
    ('- Problem: Predict survival on Titanic using tabular features',),
    ('- Method: Simple preprocessing + baseline Logistic Regression vs RandomForest',),
    ('- Best model (dev):', best_name, '→ see results/dev_scores.csv'),
    ('- Final test metrics: see results/test_summary.csv',),
    ('- Artifacts saved in artifacts/ (confusion matrix, ROC)',),
]
for _parts in _readme_lines:
    print(*_parts)
