In [2]:
# %% [markdown]
# Titanic — Logistic Regression (Notebook)
# 
# Author: Maddy
# Deliverable: Single .ipynb file covering EDA, preprocessing, model training, evaluation, interpretation, and saving artifacts.

# %%
# 1. Imports
import warnings
warnings.filterwarnings('ignore')

import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, roc_curve, confusion_matrix, classification_report)
import joblib

# %%
# 2. File paths (adjust if needed)
# Candidate locations for each CSV, tried in the order listed; the first path
# that exists on disk is used.
POSSIBLE_TRAIN = [
    "/mnt/data/Titanic_train.csv",
    "Titanic_train.csv",
    r"D:\DATA SCIENCE\ASSIGNMENTS\7 logistic regression\files\Logistic Regression\Titanic_train.csv",
]
POSSIBLE_TEST = [
    "/mnt/data/Titanic_test.csv",
    "Titanic_test.csv",
    r"D:\DATA SCIENCE\ASSIGNMENTS\7 logistic regression\files\Logistic Regression\Titanic_test.csv",
]

# Resolve each list to its first existing entry (None when nothing matches).
TRAIN_PATH = next(filter(lambda p: Path(p).exists(), POSSIBLE_TRAIN), None)
TEST_PATH = next(filter(lambda p: Path(p).exists(), POSSIBLE_TEST), None)

# The training file is mandatory; the test file is optional, so TEST_PATH may stay None.
if TRAIN_PATH is None:
    raise FileNotFoundError("Couldn't find Titanic_train.csv. Place it in the working folder or update TRAIN_PATH list.")

print('Using train:', TRAIN_PATH)
print('Using test :', TEST_PATH)

# Output folder for every figure saved by this notebook.
PLOT_DIR = Path("plots")
PLOT_DIR.mkdir(parents=True, exist_ok=True)

# %%
# 3. Load data & quick EDA
# The training frame is always loaded; the test frame stays None when no test CSV was found.
train = pd.read_csv(TRAIN_PATH)
test = None
if TEST_PATH:
    test = pd.read_csv(TEST_PATH)

print('\nTrain shape:', train.shape)
print(train.head(3).to_string(index=False))
print('\nMissing values (train):')
print(train.isna().sum())

# Save one histogram (with KDE overlay) per listed column that is present.
for col in ['Age','Fare']:
    if col not in train.columns:
        continue
    plt.figure(figsize=(6,3))
    observed = train[col].dropna()
    sns.histplot(observed, kde=True)
    plt.title(col)
    plt.tight_layout()
    plt.savefig(PLOT_DIR/f"hist_{col}.png")
    # Close the figure so repeated runs do not accumulate open figures.
    plt.close()

# %%
# 4. Feature engineering & preprocessing helpers

def extract_title(name):
    """Extract the honorific from a ``"Surname, Title. Given names"`` string.

    The title is the text between the first comma and the next period, with
    surrounding whitespace removed (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``).

    Parameters
    ----------
    name : object
        Raw value from the ``Name`` column. Anything that is not a string
        (``None``, ``NaN``, numbers, ...) is treated as missing.

    Returns
    -------
    str
        The title, or ``'Unknown'`` when the value is missing, has no comma,
        has no period after the comma, or the title part is empty.
    """
    # Missing values (None / NaN / pd.NA) and any other non-string input carry no title.
    if not isinstance(name, str):
        return 'Unknown'
    parts = name.split(',')
    if len(parts) < 2:
        return 'Unknown'
    # The title must be terminated by a period; without one the text after the
    # comma is just a given name, not an honorific.
    title, dot, _ = parts[1].partition('.')
    title = title.strip()
    if not dot or not title:
        return 'Unknown'
    return title


def prepare_features(df):
    """Return a copy of ``df`` with the engineered columns added.

    Each step runs only when its source column is present:

    * ``Name``     -> ``Title``: extracted with ``extract_title``; ``Mlle``/``Ms``
      are folded into ``Miss``, ``Mme`` into ``Mrs``, and the listed uncommon
      titles into ``Rare``.
    * ``Cabin``    -> ``HasCabin``: 1 where a cabin value is present, else 0.
    * ``Embarked`` -> missing values replaced with the literal ``'Missing'``.

    The input frame itself is left unmodified.
    """
    out = df.copy()

    if 'Name' in out:
        synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
        uncommon = ['Lady','the Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']
        titles = out['Name'].apply(extract_title).replace(synonyms)
        out['Title'] = titles
        out['Title'] = out['Title'].replace(uncommon, 'Rare')

    if 'Cabin' in out:
        out['HasCabin'] = out['Cabin'].notna().astype(int)

    if 'Embarked' in out:
        out['Embarked'] = out['Embarked'].fillna('Missing')

    return out

# Run the same feature engineering on both frames; the test frame is optional
# and is left as None when it was never loaded.
train = prepare_features(train)
test = prepare_features(test) if test is not None else test

# %%
# 5. Define X, y, feature groups
TARGET = 'Survived'
drop_cols = ['PassengerId','Ticket','Cabin','Name']


def _split_features_target(frame):
    """Split ``frame`` into ``(features, target)``.

    The columns named in ``drop_cols`` that are present are removed from the
    features. When ``TARGET`` is a column it is removed as well and returned as
    the second element; otherwise the second element is ``None``.
    """
    removable = [c for c in drop_cols if c in frame.columns]
    if TARGET in frame.columns:
        return frame.drop(columns=[TARGET] + removable), frame[TARGET]
    return frame.drop(columns=removable), None


# Training features/labels (y is None only if the training CSV has no TARGET column).
X, y = _split_features_target(train)

# Optional test file: labelled (y_test_file set) or unlabelled (y_test_file None).
# The same helper is used so the label column is always identified via TARGET
# rather than a separately hard-coded name.
X_test_file = None
y_test_file = None
if test is not None:
    X_test_file, y_test_file = _split_features_target(test)

# Keep only the expected columns that actually exist in X.
numeric_features = [c for c in ['Age','SibSp','Parch','Fare'] if c in X.columns]
categorical_features = [c for c in ['Pclass','Sex','Embarked','Title','HasCabin'] if c in X.columns]

print('\nNumeric features:', numeric_features)
print('Categorical features:', categorical_features)

# %%
# 6. Build preprocessing + pipeline
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Every categorical column except Pclass is imputed with its most frequent
# value and one-hot encoded; Pclass is forwarded unchanged further down.
ohe_features = list(filter(lambda c: c != 'Pclass', categorical_features))
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Register only the groups that have at least one column, in the order
# numeric -> one-hot -> Pclass passthrough.
transformers = [
    spec
    for spec in (
        ('num', numeric_transformer, numeric_features),
        ('ohe', cat_transformer, ohe_features),
    )
    if spec[2]
]
if 'Pclass' in categorical_features:
    transformers.append(('pclass_passthrough', 'passthrough', ['Pclass']))

preprocessor = ColumnTransformer(transformers=transformers)

# Full model: preprocessing followed by logistic regression.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)),
])

# %%
# 7. Train / validate
# Defaults so later cells can safely reference these names even when no
# labelled evaluation set is available (unlabelled test file).
eval_name = None
eval_y = None
y_pred = None
y_proba = None

if X_test_file is None:
    # No test file: hold out 20% of the training data (stratified) for validation.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    print('\nTrain/Valid shapes:', X_train.shape, X_valid.shape)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    y_proba = clf.predict_proba(X_valid)[:,1]
    eval_name = 'Validation'
    eval_y = y_valid
else:
    # A test file exists: fit on the full training frame.
    clf.fit(X, y)
    if y_test_file is not None:
        y_pred = clf.predict(X_test_file)
        y_proba = clf.predict_proba(X_test_file)[:,1]
        eval_name = 'Provided test file'
        eval_y = y_test_file
    else:
        # unlabeled test -> produce predictions only
        preds = clf.predict(X_test_file)
        out = X_test_file.copy()
        # PassengerId was removed from the feature frame; restore it as the
        # first column so each prediction can be traced back to its passenger.
        if 'PassengerId' in test.columns:
            out.insert(0, 'PassengerId', test['PassengerId'].to_numpy())
        out['Survived'] = preds
        out.to_csv('test_predictions.csv', index=False)
        print('Saved test_predictions.csv')
        eval_name = None

# %%
# 8. Evaluation utilities
from sklearn.metrics import classification_report

def evaluate(name, y_true, y_pred, y_proba):
    """Print classification metrics, save diagnostic plots, and return the metrics.

    Parameters
    ----------
    name : str
        Label for the evaluated split; used in the printed header and in the
        plot file names (``confusion_{name}.png``, ``roc_{name}.png``).
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels.
    y_proba : array-like
        Predicted scores for the positive class, used for ROC / AUC.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``roc_auc``.

    Side effects
    ------------
    Writes a confusion-matrix heatmap and a ROC curve into ``PLOT_DIR``.
    """
    print(f"\n=== Evaluation on {name} ===")
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
    }
    print('Accuracy:', scores['accuracy'])
    print('Precision:', scores['precision'])
    print('Recall:', scores['recall'])
    print('F1:', scores['f1'])
    # AUC is computed once and reused for the printout, the ROC legend and the return value.
    scores['roc_auc'] = roc_auc_score(y_true, y_proba)
    print('ROC-AUC:', scores['roc_auc'])
    print('\nClassification report:')
    print(classification_report(y_true, y_pred))

    # Confusion matrix: rows are true labels, columns are predicted labels.
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title('Confusion matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.tight_layout()
    fig.savefig(PLOT_DIR/f'confusion_{name}.png')
    plt.close(fig)

    # ROC curve with the chance diagonal for reference.
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    fig, ax = plt.subplots(figsize=(5,4))
    ax.plot(fpr, tpr, label=f"AUC={scores['roc_auc']:.3f}")
    ax.plot([0,1], [0,1], '--')
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.legend()
    fig.savefig(PLOT_DIR/f'roc_{name}.png')
    plt.close(fig)

    return scores

# `metrics` is always defined: a dict when a labelled evaluation set exists, else None.
metrics = evaluate(eval_name, eval_y, y_pred, y_proba) if eval_name else None

# %%
# 9. Cross-validation and feature importance
# Stratified, shuffled 5-fold cross-validation of the whole pipeline on the
# full training frame, scored by ROC-AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(estimator=clf, X=X, y=y, cv=cv, scoring='roc_auc')
print('\n5-fold CV ROC-AUC:', cv_scores)
print('Mean CV AUC:', cv_scores.mean())

# feature names and coefficients

def get_feature_names(column_transformer):
    """Collect output feature names from a fitted ColumnTransformer-like object.

    Iterates over ``column_transformer.transformers_`` (``(name, transformer,
    columns)`` triples) and builds the list of output names in that order:

    * ``'drop'`` entries are skipped, since dropped columns produce no output
      feature.
    * ``'passthrough'`` entries contribute their column names unchanged.
    * Pipelines (objects with ``named_steps``) are represented by their last
      step; if that step (or a plain transformer) offers
      ``get_feature_names_out`` it is called with the input columns.
    * When no names can be obtained from the transformer, the input column
      names are used as a best-effort fallback.

    Parameters
    ----------
    column_transformer : object
        Fitted transformer exposing a ``transformers_`` attribute.

    Returns
    -------
    list
        Output feature names, in transformer order.
    """
    def _as_list(cols):
        # A single column may be given as a bare label instead of a list/tuple.
        return list(cols) if isinstance(cols, (list, tuple)) else [cols]

    names = []
    for _step_name, trans, cols in column_transformer.transformers_:
        if isinstance(trans, str):
            if trans == 'drop':
                # Dropped columns never reach the model, so they must not
                # shift the alignment between names and coefficients.
                continue
            names.extend(_as_list(cols))
            continue

        if hasattr(trans, 'named_steps'):
            last = list(trans.named_steps.values())[-1]
        else:
            last = trans

        namer = getattr(last, 'get_feature_names_out', None)
        if namer is None:
            names.extend(_as_list(cols))
            continue

        try:
            out = namer(_as_list(cols))
            names.extend(out.tolist() if hasattr(out, 'tolist') else list(out))
        except Exception:
            # Best effort: fall back to the input column names when the
            # transformer cannot report its own output names.
            names.extend(_as_list(cols))
    return names

# Pair every fitted coefficient with a feature name and rank by absolute size.
feat_names = get_feature_names(clf.named_steps['preprocessor'])
coefs = clf.named_steps['clf'].coef_[0]
coef_df = (
    pd.DataFrame({'feature': feat_names[:len(coefs)], 'coef': coefs})
    .assign(abs_coef=lambda frame: frame['coef'].abs())
    .sort_values('abs_coef', ascending=False)
)
print('\nTop coefficients:')
print(coef_df.head(15).to_string(index=False))

# %%
# 10. Save model and artifacts
# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
joblib.dump(clf, 'titanic_logreg_pipeline.joblib')
print('\nSaved model to titanic_logreg_pipeline.joblib')

# Coefficient table is always written; the metrics summary (mean CV AUC only)
# is written only when a labelled evaluation ran, i.e. eval_name is set.
coef_df.to_csv('logreg_coefficients.csv', index=False)
if eval_name:
    pd.Series([cv_scores.mean()], index=['cv_auc_mean']).to_csv('logreg_metrics_summary.csv')

# %% [markdown]
# End of notebook
# - Files produced: `titanic_logreg_pipeline.joblib`, `logreg_coefficients.csv`, `logreg_metrics_summary.csv` (only if evaluation ran), `test_predictions.csv` (only if the test file has no `Survived` column), and images in `plots/`.
# - To run the app: place `model.joblib` (or `titanic_logreg_pipeline.joblib`) and `app.py` in the same folder, then run `streamlit run app.py`.


Using train: D:\DATA SCIENCE\ASSIGNMENTS\7 logistic regression\files\Logistic Regression\Titanic_train.csv
Using test : D:\DATA SCIENCE\ASSIGNMENTS\7 logistic regression\files\Logistic Regression\Titanic_test.csv

Train shape: (891, 12)
 PassengerId  Survived  Pclass                                                Name    Sex  Age  SibSp  Parch           Ticket    Fare Cabin Embarked
           1         0       3                             Braund, Mr. Owen Harris   male 22.0      1      0        A/5 21171  7.2500   NaN        S
           2         1       1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38.0      1      0         PC 17599 71.2833   C85        C
           3         1       3                              Heikkinen, Miss. Laina female 26.0      0      0 STON/O2. 3101282  7.9250   NaN        S

Missing values (train):
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch    