# 📘 Titanic Logistic Regression (CSV) + EDA — v3
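**Note:** The `Survived` target is expected to have no missing values in the Titanic training data. If this notebook is run on another dataset where some targets are NaN, those rows are dropped before modeling.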


## Setup & Data Loading

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the passenger table from 'titanic.csv'; the path is relative, so the file
# must sit in the kernel's working directory.
df = pd.read_csv('titanic.csv')
# Quick sanity check: (rows, columns) of the loaded frame.
print(df.shape)
# Preview the first rows; as the cell's last expression it is rendered as output.
df.head()


## EDA — Structure, Missingness, Class Balance

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts) to spot
# columns with missing values before modeling.
df.info()

In [None]:
# Missing-value count per column, worst offenders first.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Share of each target class (class balance), labelled 'proportion'.
(
    df['Survived']
    .value_counts(normalize=True)
    .rename('proportion')
)

## EDA — Distributions & Survival Rates

In [None]:

# Histogram of passenger ages (30 bins) drawn on an explicit 6x4 axes.
fig, ax = plt.subplots(figsize=(6, 4))
df['Age'].plot(kind='hist', bins=30, ax=ax)
ax.set_xlabel('Age')
ax.set_title('Age Distribution')
plt.show()


In [None]:

# Mean of the 0/1 target per ticket class, i.e. the survival rate by Pclass.
pclass_rate = df.groupby('Pclass')['Survived'].mean()

# Bar chart with the y-axis fixed to [0, 1] so rates are comparable across plots.
fig, ax = plt.subplots(figsize=(6, 4))
pclass_rate.plot(kind='bar', ax=ax)
ax.set_ylabel('Survival Rate')
ax.set_title('Survival Rate by Pclass')
ax.set_ylim(0, 1)
plt.show()


In [None]:

# Survival rate by sex, ordered from highest to lowest rate.
sex_rate = (
    df.groupby('Sex')['Survived']
    .mean()
    .sort_values(ascending=False)
)

# Bar chart with the y-axis fixed to [0, 1] so rates are comparable across plots.
fig, ax = plt.subplots(figsize=(6, 4))
sex_rate.plot(kind='bar', ax=ax)
ax.set_ylabel('Survival Rate')
ax.set_title('Survival Rate by Sex')
ax.set_ylim(0, 1)
plt.show()


## Target & Features (with target dropna safety)

In [None]:

# Columns carried into modeling: the target plus the raw predictors.
cols = ['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']

# Build the modeling frame in one chain:
#   1. copy the selected columns so `df` itself is never modified,
#   2. add `Alone` = 1 when SibSp + Parch == 0 (missing counts are treated as 0),
#   3. drop rows whose TARGET is NaN (safety; typically not needed for Kaggle Titanic),
#   4. renumber the index after the drop.
data = (
    df[cols]
    .copy()
    .assign(
        Alone=lambda frame: (
            (frame['SibSp'].fillna(0) + frame['Parch'].fillna(0)) == 0
        ).astype(int)
    )
    .dropna(subset=['Survived'])
    .reset_index(drop=True)
)

# Split into target vector and feature matrix.
y = data['Survived']
X = data.drop(columns=['Survived'])

# Column groups consumed by the preprocessing step.
num_features = ['Pclass','Age','SibSp','Parch','Fare']
cat_features = ['Sex','Embarked','Alone']


## Preprocessing + Train/Test

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Numeric columns: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocess = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
])

# 80/20 split, stratified on the target and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


## Train & Evaluate

In [None]:

# Full pipeline: preprocessing followed by logistic regression. Fitting the
# pipeline on the training split keeps imputation/scaling statistics train-only.
model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('clf', LogisticRegression(max_iter=1000)),
])
model.fit(X_train, y_train)

# Hard class predictions and positive-class probabilities on the held-out split.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Label-based metrics share one call shape, so compute them from a mapping;
# ROC AUC is added last because it scores probabilities rather than labels.
label_scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}
metrics = {
    metric_name: round(scorer(y_test, y_pred), 3)
    for metric_name, scorer in label_scorers.items()
}
metrics['roc_auc'] = round(roc_auc_score(y_test, y_proba), 3)
metrics


## Confusion Matrix & ROC Curve

In [None]:

# Confusion matrix on the held-out split, shown as a labelled table
# (rows = actual class, columns = predicted class).
#
# `labels=[0, 1]` pins the class order and guarantees a 2x2 result, so the
# hard-coded captions below always line up, even if one class happens to be
# absent from `y_test`/`y_pred`. Without it the matrix can shrink and the
# DataFrame construction fails.
#
# `pd` comes from the notebook's setup cell; no re-import is needed here.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
pd.DataFrame(cm, index=['Actual 0','Actual 1'], columns=['Pred 0','Pred 1'])


In [None]:

# Plot the ROC curve for the fitted pipeline on the held-out split.
#
# Only the API lookup is guarded: a missing `RocCurveDisplay` (ImportError) or a
# missing `from_estimator` constructor (AttributeError) means the installed
# scikit-learn is too old, and that is reported. Any other failure (bad data,
# unfitted model, plotting error) is a real bug and is allowed to surface
# instead of being mislabelled as a version problem.
try:
    from sklearn.metrics import RocCurveDisplay
    plot_roc = RocCurveDisplay.from_estimator
except (ImportError, AttributeError) as e:
    print("ROC curve not available in this sklearn version:", e)
else:
    plot_roc(model, X_test, y_test)
    plt.title('ROC Curve')
    plt.show()


## Coefficients (Odds Interpretation)

In [None]:

# Pull the fitted preprocessing step and its one-hot encoder out of the pipeline.
ct = model.named_steps['preprocess']
ohe = ct.named_transformers_['cat'].named_steps['onehot']

# Output columns follow the ColumnTransformer order: numeric block first, then
# the expanded one-hot columns.
num_names = num_features
cat_names = ohe.get_feature_names_out(cat_features).tolist()
all_feature_names = [*num_names, *cat_names]

# One coefficient per transformed feature; exp(coef) is the odds ratio.
# Numeric features were standardized in the pipeline, so their odds ratios are
# per one standard deviation, not per raw unit.
coef = model.named_steps['clf'].coef_[0]
coef_df = pd.DataFrame({'feature': all_feature_names, 'coef': coef}).assign(
    odds_ratio=lambda table: np.exp(table['coef'])
)
coef_df.sort_values('odds_ratio', ascending=False).head(12)
