# Titanic Survival Prediction

## 1. Import Libraries and Load Data

First, let's import the necessary libraries and load our training and testing datasets.

In [31]:
# Imports + data loading
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Read the training and test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Preview the first rows of the training frame.
train_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


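## 2. Feature Engineering and Baseline Models

Next, we engineer features from the raw columns (titles, family and ticket groups, cabin deck, imputed age and fare), then fit Gradient Boosting and Logistic Regression pipelines on a stratified train/validation split.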

In [32]:
# Strong Titanic feature engineering (safe: no chained assignment, aligned indices)

def extract_title(name):
    """Return the honorific found in a "Last, Title. First" style name.

    The title is the text between the first comma and the following period.
    When that text has several words (e.g. "the Countess"), only the last
    word is returned so a leading article does not create a separate title.

    Returns "Unknown" when ``name`` is missing, has no comma, or has nothing
    between the comma and the period.
    """
    if pd.isnull(name):
        return "Unknown"
    parts = name.split(",")
    if len(parts) < 2:
        # No comma: there is no "Last, Title." structure to read a title from.
        return "Unknown"
    words = parts[1].split(".")[0].split()
    if not words:
        return "Unknown"
    return words[-1]

def extract_lastname(name):
    """Return the surname, i.e. the text before the first comma, stripped.

    A missing name yields "Unknown"; a name without a comma is returned
    whole (stripped).
    """
    if not pd.isnull(name):
        surname, _, _ = name.partition(",")
        return surname.strip()
    return "Unknown"

def build_features(df):
    """Return a copy of ``df`` with engineered Titanic features added.

    The input frame is not modified. The columns read are Name, Sex, Age,
    SibSp, Parch, Ticket, Fare, Cabin, Embarked and Pclass.

    Columns added: Title, LastName, FamilySize, IsAlone, FamilyID,
    CabinKnown, Deck, TicketGroupSize, FarePerPerson, AgeBin, FareBin,
    IsChild, IsMother and Sex_Pclass. Missing values in Embarked, Fare and
    Age are filled in the returned copy.

    Note: every statistic used here (mode, group medians, group counts and
    the FareBin quantile edges) is computed from ``df`` itself, so two frames
    processed separately can end up with different FareBin intervals.
    """
    d = df.copy()

    # Titles & LastName
    # "the Countess" is listed next to "Countess" so the honorific is folded
    # into "Rare" whether or not the extracted title keeps its leading article.
    rare_titles = [
        "Lady", "Countess", "the Countess", "Capt", "Col", "Don", "Dr",
        "Major", "Rev", "Sir", "Jonkheer", "Dona",
    ]
    d["Title"] = d["Name"].apply(extract_title)
    d["Title"] = d["Title"].replace(rare_titles, "Rare").replace(
        {"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"}
    )
    d["LastName"] = d["Name"].apply(extract_lastname)

    # Family structure (the passenger counts as one member)
    d["FamilySize"] = d["SibSp"] + d["Parch"] + 1
    d["IsAlone"]    = (d["FamilySize"] == 1).astype(int)

    # FamilyID (truncate very small groups to avoid noise)
    d["FamilyID"] = (d["LastName"].astype(str) + "_" + d["FamilySize"].astype(str))
    # Keep a FamilyID only when at least 3 rows of this frame share it, else 'Small'
    fam_counts = d["FamilyID"].value_counts()
    big_fams = set(fam_counts[fam_counts >= 3].index)
    d["FamilyID"] = d["FamilyID"].where(d["FamilyID"].isin(big_fams), "Small")

    # Cabin/Deck flag (many NaN in Cabin)
    d["CabinKnown"] = d["Cabin"].notna().astype(int)
    d["Deck"] = d["Cabin"].fillna("U").astype(str).str[0]  # first cabin letter, "U" when unknown

    # Ticket group size (how many rows of this frame share the same ticket)
    d["TicketGroupSize"] = d.groupby("Ticket")["Ticket"].transform("count")

    # Embarked (mode)
    embarked_mode = d["Embarked"].mode().iloc[0]
    d["Embarked"] = d["Embarked"].fillna(embarked_mode)

    # Fare imputation by (Pclass, Embarked) median; Embarked is filled first
    # so that no row is left out of a group.
    fare_med = d.groupby(["Pclass","Embarked"])["Fare"].transform("median")
    d["Fare"] = d["Fare"].fillna(fare_med)
    # Fare per person; the inf/NaN fallback to the plain fare is defensive
    # (FamilySize is >= 1 whenever SibSp and Parch are non-negative).
    d["FarePerPerson"] = (d["Fare"] / d["FamilySize"]).replace([np.inf, -np.inf], np.nan).fillna(d["Fare"])

    # Age imputation by (Title, Sex, Pclass) median, falling back to the
    # overall median for groups that have no known age at all.
    age_group_median = d.groupby(["Title","Sex","Pclass"])["Age"].transform("median")
    global_age_median = d["Age"].median()
    d["Age"] = d["Age"].fillna(age_group_median).fillna(global_age_median)

    # Bins. AgeBin uses fixed edges (ages outside 0-80 fall in no bin and
    # become NaN); FareBin uses quantiles of this frame's own fares.
    d["AgeBin"]  = pd.cut(d["Age"],  bins=[0,12,16,21,30,40,50,60,80], include_lowest=True)
    d["FareBin"] = pd.qcut(d["Fare"], q=8, duplicates="drop")

    # Domain signals
    d["IsChild"]  = (d["Age"] < 16).astype(int)
    # Women travelling with a parent/child and titled Mrs or a rare title
    # (presumably to include titled married women - confirm intent).
    d["IsMother"] = ((d["Sex"]=="female") & (d["Parch"]>0) & (d["Title"].isin(["Mrs","Rare"]))).astype(int)
    d["Sex_Pclass"] = d["Sex"].astype(str) + "_" + d["Pclass"].astype(str)

    return d

train_fe = build_features(train_data)
test_fe  = build_features(test_data)

# FareBin is produced with pd.qcut, whose interval edges depend on the frame
# it is computed on. Left as is, the test intervals would differ from the ones
# the one-hot encoder is fitted on and would be encoded as "unknown" (all
# zeros). Re-bin the test fares with the training intervals instead; fares are
# clipped to the training range only for binning so none falls outside them.
train_fare_intervals = train_fe["FareBin"].cat.categories
test_fare_clipped = test_fe["Fare"].clip(
    lower=train_fe["Fare"].min(), upper=train_fe["Fare"].max()
)
test_fe["FareBin"] = pd.cut(test_fare_clipped, bins=train_fare_intervals)

drop_cols = ["PassengerId","Name","Ticket","Cabin"]  # we keep LastName only via FamilyID
X = train_fe.drop(columns=drop_cols + ["Survived"])
y = train_fe["Survived"]
X_test_final = test_fe.drop(columns=drop_cols)

# "number" selects every numeric dtype, so flag columns created with
# astype(int) are kept even where the platform default int is not int64.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(include=["object","category"]).columns.tolist()

# The fitted pipelines select columns by name: train and test must agree.
assert set(X.columns) == set(X_test_final.columns), "train/test feature columns differ"
# Train/valid split (stratified on the target, fixed seed for reproducibility)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


def make_preprocessor():
    """Return a new, unfitted preprocessor.

    Numeric columns are standardised and categorical columns are one-hot
    encoded (dense output; categories unseen during fit are ignored).
    Each pipeline gets its own instance: a Pipeline fits its steps in place,
    so a shared transformer would be refitted by whichever pipeline is
    fitted last.
    """
    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ]
    )


# Gradient Boosting (robust parameters for Titanic)
gb_pipe = Pipeline(steps=[
    ("prep", make_preprocessor()),
    ("model", GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=3, subsample=0.9, random_state=42))
])

# Logistic Regression baseline (useful for a small blend)
lr_pipe = Pipeline(steps=[
    ("prep", make_preprocessor()),
    ("model", LogisticRegression(max_iter=2000, solver="liblinear", random_state=42))
])

# Fit
gb_pipe.fit(X_train, y_train)
lr_pipe.fit(X_train, y_train)

# Quick validation scores
pred_gb = gb_pipe.predict(X_valid)
pred_lr = lr_pipe.predict(X_valid)
print("Valid acc GB:", round(accuracy_score(y_valid, pred_gb), 4))
print("Valid acc LR:", round(accuracy_score(y_valid, pred_lr), 4))


Valid acc GB: 0.8212
Valid acc LR: 0.8324


## 3. Validation Check

Before blending, we check how the Gradient Boosting and Logistic Regression pipelines each score on the held-out validation split.

In [33]:
# The train/validation split and both fitted pipelines (gb_pipe, lr_pipe)
# come from the previous cell. Re-creating and re-fitting them here would
# only repeat the same work with the same seeds, so this cell reuses them
# and just reports the validation accuracy of each model.
pred_gb = gb_pipe.predict(X_valid)
pred_lr = lr_pipe.predict(X_valid)
print("Valid acc GB:", round(accuracy_score(y_valid, pred_gb), 4))
print("Valid acc LR:", round(accuracy_score(y_valid, pred_lr), 4))


Valid acc GB: 0.8212
Valid acc LR: 0.8324


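## 4. Blending the Models

We blend the predicted survival probabilities of the two models and scan for the Gradient Boosting weight that gives the best validation accuracy. Because the weight is tuned on the same validation split that is used to report the score, the blended accuracy is an optimistic estimate.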

In [34]:
# Blend the two models' survival probabilities and keep the GB weight that
# scores best on the validation split.
proba_gb = gb_pipe.predict_proba(X_valid)[:, 1]
proba_lr = lr_pipe.predict_proba(X_valid)[:, 1]

candidate_weights = np.linspace(0.2, 0.9, 15)  # GB weights to try (small, fast scan)
blend_scores = [
    accuracy_score(
        y_valid,
        ((gb_weight*proba_gb + (1-gb_weight)*proba_lr) >= 0.5).astype(int),
    )
    for gb_weight in candidate_weights
]

# argmax returns the first maximum, so ties go to the smallest GB weight.
top = int(np.argmax(blend_scores))
if blend_scores[top] > 0.0:
    best_w, best_acc = candidate_weights[top], blend_scores[top]
else:
    # No candidate scored above zero: fall back to an even blend.
    best_w, best_acc = 0.5, 0.0

print("Best blend weight (GB weight):", round(best_w,3), "→ blended valid acc:", round(best_acc,4))


Best blend weight (GB weight): 0.25 → blended valid acc: 0.838


## 5. Create Submission File

Finally, we'll use our trained model to make predictions on the test set and generate the submission file in the required format.

In [35]:
# Score the test set with both fitted pipelines, mix the survival
# probabilities with the tuned GB weight and threshold at 0.5.
proba_gb_test, proba_lr_test = (
    pipe.predict_proba(X_test_final)[:, 1] for pipe in (gb_pipe, lr_pipe)
)
proba_blend = best_w*proba_gb_test + (1-best_w)*proba_lr_test
final_preds = np.greater_equal(proba_blend, 0.5).astype(int)

# One row per test passenger: id plus predicted label.
submission = test_fe[["PassengerId"]].assign(Survived=final_preds)
submission.to_csv("submission.csv", index=False)
print("submission.csv saved. shape:", submission.shape)
submission.head()


submission.csv saved. shape: (418, 2)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
