# Titanic Survival Prediction

## 1. Import Libraries and Load Data

First, let's import the necessary libraries and load our training and testing datasets.

In [26]:
# Imports + data loading
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# meta-learner & scaling for stacking
from sklearn.preprocessing import StandardScaler

train_data = pd.read_csv("train.csv")
test_data  = pd.read_csv("test.csv")

train_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 2. Exploratory Data Analysis (EDA)

Now, let's explore the data to understand its structure, find patterns, and identify missing values.

In [27]:
# Strong Titanic feature engineering (safe, no chained assignment)

def extract_title(name):
    if pd.isnull(name):
        return "Unknown"
    return name.split(",")[1].split(".")[0].strip()

def build_features(df):
    df = df.copy()

    # Titles
    df["Title"] = df["Name"].apply(extract_title)
    df["Title"] = df["Title"].replace(
        ["Lady","Countess","Capt","Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"], "Rare"
    ).replace({"Mlle":"Miss","Ms":"Miss","Mme":"Mrs"})

    # Family
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"]    = (df["FamilySize"] == 1).astype(int)

    # Flags / interactions
    df["CabinKnown"] = df["Cabin"].notna().astype(int)
    df["TicketGroupSize"] = df.groupby("Ticket")["Ticket"].transform("count")

    # Imputations (aligned indices)
    embarked_mode = df["Embarked"].mode().iloc[0]
    df["Embarked"] = df["Embarked"].fillna(embarked_mode)
    df["Fare"] = df["Fare"].fillna(df.groupby("Pclass")["Fare"].transform("median"))
    df["Age"]  = df["Age"].fillna(df.groupby(["Title","Sex","Pclass"])["Age"].transform("median"))

    # Bins helpful for linear models
    df["AgeBin"]  = pd.cut(df["Age"],  bins=[0,12,16,21,30,40,50,60,80], include_lowest=True)
    df["FareBin"] = pd.qcut(df["Fare"], q=8, duplicates="drop")

    # Domain signals
    df["IsChild"]  = (df["Age"] < 16).astype(int)
    df["IsMother"] = ((df["Sex"]=="female") & (df["Parch"]>0) & (df["Title"].isin(["Mrs","Rare"]))).astype(int)
    df["Sex_Pclass"] = df["Sex"].astype(str) + "_" + df["Pclass"].astype(str)

    return df

train_fe = build_features(train_data)
test_fe  = build_features(test_data)

drop_cols = ["PassengerId","Name","Ticket","Cabin"]
X = train_fe.drop(columns=drop_cols + ["Survived"])
y = train_fe["Survived"]
X_test_final = test_fe.drop(columns=drop_cols)

num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object","category"]).columns.tolist()

X.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,CabinKnown,TicketGroupSize,AgeBin,FareBin,IsChild,IsMother,Sex_Pclass
0,3,male,22.0,1,0,7.25,S,Mr,2,0,0,1,"(21.0, 30.0]","(-0.001, 7.75]",0,0,male_3
1,1,female,38.0,1,0,71.2833,C,Mrs,2,0,1,1,"(30.0, 40.0]","(69.488, 512.329]",0,0,female_1
2,3,female,26.0,0,0,7.925,S,Miss,1,1,0,1,"(21.0, 30.0]","(7.91, 9.841]",0,0,female_3
3,1,female,35.0,1,0,53.1,S,Mrs,2,0,1,2,"(30.0, 40.0]","(31.0, 69.488]",0,0,female_1
4,3,male,35.0,0,0,8.05,S,Mr,1,1,0,1,"(30.0, 40.0]","(7.91, 9.841]",0,0,male_3


## 3. Data Cleaning & Feature Engineering

Based on our EDA, we'll clean the data by handling missing values and create new features to improve our model's performance.

In [28]:
# Train/valid split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessor (dense output for tree/gb; LR handles dense fine)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
    ]
)

# Pipelines (hyperparams choisis pour bon compromis perf/temps)
lr_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", LogisticRegression(max_iter=2000, solver="liblinear", random_state=42))
])

gb_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", GradientBoostingClassifier(
        n_estimators=250, learning_rate=0.06, max_depth=3, subsample=0.9, random_state=42))
])

rf_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=500, max_depth=8, min_samples_split=4, min_samples_leaf=2,
        random_state=42, n_jobs=-1))
])

dt_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", DecisionTreeClassifier(
        max_depth=5, min_samples_split=8, min_samples_leaf=3, random_state=42))
])

# Fit base learners on train
lr_pipe.fit(X_train, y_train)
gb_pipe.fit(X_train, y_train)
rf_pipe.fit(X_train, y_train)
dt_pipe.fit(X_train, y_train)

# Validation predictions & probabilities
pred_lr_val  = lr_pipe.predict(X_valid)
pred_gb_val  = gb_pipe.predict(X_valid)
pred_rf_val  = rf_pipe.predict(X_valid)
pred_dt_val  = dt_pipe.predict(X_valid)

proba_lr_val = lr_pipe.predict_proba(X_valid)[:,1]
proba_gb_val = gb_pipe.predict_proba(X_valid)[:,1]
proba_rf_val = rf_pipe.predict_proba(X_valid)[:,1]
proba_dt_val = dt_pipe.predict_proba(X_valid)[:,1]

print("Valid acc LR:", round(accuracy_score(y_valid, pred_lr_val),4))
print("Valid acc GB:", round(accuracy_score(y_valid, pred_gb_val),4))
print("Valid acc RF:", round(accuracy_score(y_valid, pred_rf_val),4))
print("Valid acc DT:", round(accuracy_score(y_valid, pred_dt_val),4))


Valid acc LR: 0.8212
Valid acc GB: 0.7933
Valid acc RF: 0.8101
Valid acc DT: 0.8045


## 4. Model Training and Evaluation

It's time to choose a model, train it on our processed data, and see how well it performs.

In [29]:
# Build meta-features from validation probabilities
Z_valid = np.vstack([proba_lr_val, proba_gb_val, proba_rf_val, proba_dt_val]).T

# Scale meta-features (often helps meta-logistic)
meta_scaler = StandardScaler()
Z_valid_std = meta_scaler.fit_transform(Z_valid)

meta_lr = LogisticRegression(max_iter=2000, solver="lbfgs", random_state=42)
meta_lr.fit(Z_valid_std, y_valid)

# Evaluate stacked model on validation
stack_val_pred = meta_lr.predict(Z_valid_std)
stack_val_acc = accuracy_score(y_valid, stack_val_pred)
print("Stacked validation accuracy:", round(stack_val_acc,4))


Stacked validation accuracy: 0.8212


## 5. Create Submission File

Finally, we'll use our trained model to make predictions on the test set and generate the submission file in the required format.

In [30]:
# Retrain base learners on FULL data
lr_pipe.fit(X, y)
gb_pipe.fit(X, y)
rf_pipe.fit(X, y)
dt_pipe.fit(X, y)

# Test probabilities
proba_lr_test = lr_pipe.predict_proba(X_test_final)[:,1]
proba_gb_test = gb_pipe.predict_proba(X_test_final)[:,1]
proba_rf_test = rf_pipe.predict_proba(X_test_final)[:,1]
proba_dt_test = dt_pipe.predict_proba(X_test_final)[:,1]

Z_test = np.vstack([proba_lr_test, proba_gb_test, proba_rf_test, proba_dt_test]).T
Z_test_std = meta_scaler.transform(Z_test)

final_preds = meta_lr.predict(Z_test_std)

submission = pd.DataFrame({
    "PassengerId": test_fe["PassengerId"],
    "Survived": final_preds
})
submission.to_csv("submission.csv", index=False)
print("submission.csv saved. shape:", submission.shape)
submission.head()


submission.csv saved. shape: (418, 2)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
