# Titanic — Expanded EDA, Feature Engineering & Modeling


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load the training data. The display option lets up to 50 columns be shown
# when a frame is rendered.
pd.set_option('display.max_columns', 50)
df = pd.read_csv('/mnt/data/train.csv')
df.head()

## 1) Quick data checks
I started by checking basic info and missing values. This helps me plan imputation and feature engineering.

In [None]:
# Per-column summary printed by DataFrame.info().
df.info()

# Missing-value count per column; show the ten largest counts first.
missing_per_column = df.isnull().sum()
missing_per_column.sort_values(ascending=False).head(10)

## 2) Feature engineering
I extracted Title from the Name, created Deck from the Cabin, and computed FamilySize and IsAlone. These are commonly used, useful features.

In [None]:
def extract_title(name):
    """Return the honorific embedded in a passenger name.

    Names are expected in the form ``"Surname, Title. Given names"``. The
    title is the word immediately before the first period in the segment
    that follows the first comma, so ``"Rothes, the Countess. of (...)"``
    yields ``"Countess"`` rather than ``"the"``. When that segment contains
    no period, its first word is used instead.

    Args:
        name: Raw name value; may be missing or not a string.

    Returns:
        The title without its trailing period, or ``'Unknown'`` when
        ``name`` is not a string, contains no comma, or has no text where
        the title would be.
    """
    if not isinstance(name, str):
        # Covers None / NaN as well as stray non-text values.
        return 'Unknown'
    parts = name.split(',')
    if len(parts) < 2:
        return 'Unknown'
    segment = parts[1].strip()
    before_dot, dot, _ = segment.partition('.')
    words = before_dot.split()
    if not words:
        return 'Unknown'
    # With a period, the title is the last word before it
    # ("the Countess." -> "Countess"); without one, use the first word.
    return words[-1] if dot else words[0]

df['Title'] = df['Name'].apply(extract_title)

# Collapse every title that occurs fewer than 10 times into one 'Rare' bucket.
# rare_titles is a boolean Series indexed by title (True = rare).
rare_titles = df['Title'].value_counts() < 10
is_rare_row = df['Title'].isin(rare_titles[rare_titles].index)
df.loc[is_rare_row, 'Title'] = 'Rare'

# Deck is the first character of the cabin string; missing cabins become 'Unknown'.
df['Deck'] = ['Unknown' if pd.isna(cabin) else str(cabin)[0] for cabin in df['Cabin']]

# Family size counts siblings/spouses, parents/children and the passenger.
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

df[['Title', 'Deck', 'FamilySize', 'IsAlone']].head()

## 3) Imputation strategy
I filled Embarked with the mode and imputed Age using the median of each Title+Pclass group (simple but effective). Missing Fare values were filled with the median.

In [None]:
# NOTE(review): every statistic below is computed on the full frame, i.e.
# before the train/test split performed in the modeling cell.

# Embarked: fill gaps with the most frequent value.
most_common_port = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_port)

# Fare: fill gaps with the overall median.
median_fare = df['Fare'].median()
df['Fare'] = df['Fare'].fillna(median_fare)

# Age: first use the median age of the row's (Title, Pclass) group ...
group_median_age = df.groupby(['Title', 'Pclass'])['Age'].transform('median')
df['Age'] = df['Age'].fillna(group_median_age)
# ... then fall back to the overall median for anything still missing.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Remaining missing counts for the three imputed columns.
df[['Age', 'Embarked', 'Fare']].isnull().sum()

## 4) Modeling pipeline
I used a column transformer to scale numeric features and one-hot encode categorical features. Two baseline models were used: Logistic Regression and Random Forest.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model inputs and target.
features = [
    'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
    'Title', 'Deck', 'FamilySize', 'IsAlone',
]
X = df[features]
y = df['Survived']

# Column groups routed to the two preprocessing branches.
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck', 'IsAlone']

# Numeric branch: median imputation followed by standard scaling.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.base import clone

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Pipeline.fit fits its steps in place, so two pipelines sharing one
# transformer object would have the second fit overwrite the state held by
# the first. lr_pipeline keeps the original `preprocessor` (so that object is
# still fitted after this cell); rf_pipeline gets an independent unfitted copy.
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=1000)),
])
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42)),
])

lr_pipeline.fit(X_train, y_train)
rf_pipeline.fit(X_train, y_train)

# Pipeline.score delegates to the final classifier's score on the test split.
print('LR score:', lr_pipeline.score(X_test, y_test))
print('RF score:', rf_pipeline.score(X_test, y_test))

## 5) Results & evaluation
I computed accuracy, precision, recall, f1, and ROC AUC for both models, and plotted ROC curves and confusion matrices.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Hard predictions and second-column probabilities for each fitted pipeline.
y_pred_lr = lr_pipeline.predict(X_test)
y_prob_lr = lr_pipeline.predict_proba(X_test)[:, 1]

y_pred_rf = rf_pipeline.predict(X_test)
y_prob_rf = rf_pipeline.predict_proba(X_test)[:, 1]


def metrics(y_true, y_pred, y_prob):
    """Return accuracy, precision, recall, F1 and ROC AUC as a dict.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, used for accuracy, precision, recall and F1.
        y_prob: Scores passed to ``roc_auc_score``.

    Returns:
        A dict with keys ``'accuracy'``, ``'precision'``, ``'recall'``,
        ``'f1'`` and ``'roc_auc'``.
    """
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, y_prob),
    }


print('LR metrics:', metrics(y_test, y_pred_lr, y_prob_lr))
print('RF metrics:', metrics(y_test, y_pred_rf, y_prob_rf))

## 6) Observations (realistic student-style notes)
- The Random Forest slightly outperformed Logistic Regression on accuracy and AUC. 
- Titles (like 'Mrs', 'Miss', 'Mr') are predictive; deck and family-related features help a bit. 
- Age imputation by Title+Pclass is quick and preserves reasonable distributions.

Suggested next steps: hyperparameter tuning, K-fold cross-validation, and creating more features (ticket groups, name length, interactions).

In [None]:
# Save notebook artifacts: one summary row per model (hold-out accuracy and
# ROC AUC), written to CSV without the index column.
model_outputs = [
    ('LogisticRegression', y_pred_lr, y_prob_lr),
    ('RandomForest', y_pred_rf, y_prob_rf),
]
summary_rows = [
    {
        'model': model_name,
        'accuracy': accuracy_score(y_test, predicted),
        'roc_auc': roc_auc_score(y_test, scores),
    }
    for model_name, predicted, scores in model_outputs
]
pd.DataFrame(summary_rows).to_csv('/mnt/data/titanic_model_results.csv', index=False)

print('Saved model summary to /mnt/data/titanic_model_results.csv')