<a href="https://colab.research.google.com/github/Sornambal/Titanic-Survival-Prediction/blob/main/Titanic_Survival_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

!pip install catboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# 2) Read the Kaggle train/test CSVs.
#    The paths below are absolute paths under /content; change them if the files live elsewhere.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

In [None]:
# Keep an independent copy of the test-set PassengerId column before `test`
# is merged into the combined frame.
test_ids = test.loc[:, "PassengerId"].copy()

In [None]:

# 3) Stack train on top of test so every feature is engineered once for both.
#    ignore_index=True gives the result a fresh 0..n-1 index with the train rows first.
df = pd.concat(objs=[train, test], axis=0, ignore_index=True, sort=False)

In [None]:
# 4) Impute the gaps: median for the numeric columns, most frequent value for Embarked.
#    The statistics are taken over the combined train+test frame.
df = df.fillna({
    "Age": df["Age"].median(),
    "Fare": df["Fare"].median(),
    "Embarked": df["Embarked"].mode().iloc[0],
})

In [None]:
# 5) Simple derived columns
#    Sex           -> 0 for male, 1 for female (any other value becomes NaN)
#    FamilySize    -> siblings/spouses + parents/children + the passenger
#    IsAlone       -> 1 when FamilySize is exactly 1
#    FarePerPerson -> fare divided by FamilySize
# NOTE: re-running this cell maps the already-numeric Sex column to NaN.
df = df.assign(
    Sex=df["Sex"].map({"male":0, "female":1}),
    FamilySize=lambda d: d["SibSp"] + d["Parch"] + 1,
    IsAlone=lambda d: d["FamilySize"].eq(1).astype(int),
    FarePerPerson=lambda d: d["Fare"] / d["FamilySize"],
)

In [None]:
# 6) Title and surname parsed from the Name column.
#    Title: the letters between a space and the first following period; spelling
#    variants are folded into Miss/Mrs and the listed uncommon titles into "Rare".
df["Title"] = (
    df["Name"]
    .str.extract(r' ([A-Za-z]+)\.', expand=False)
    .replace({
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        **dict.fromkeys(
            ['Countess', 'Lady', 'Dona', 'Don', 'Sir', 'Jonkheer',
             'Capt', 'Col', 'Major', 'Rev', 'Dr'],
            'Rare',
        ),
    })
)

#    Surname: everything before the first comma, with surrounding whitespace removed.
df["Surname"] = df["Name"].map(lambda full_name: full_name.partition(",")[0].strip())

In [None]:
# 7) Ticket prefix and group
def ticket_prefix(t):
    """Return the non-numeric part of a ticket, or "NONE" when there is none.

    The value is converted to ``str``, every '.' and '/' is removed, and the
    result is split on whitespace. Tokens made only of digits are discarded;
    the remaining tokens are concatenated without a separator.
    """
    cleaned = str(t).translate(str.maketrans("", "", "./")).strip()
    kept_tokens = (token for token in cleaned.split() if not token.isdigit())
    prefix = "".join(kept_tokens)
    return prefix or "NONE"

# TicketPrefix: alphabetic part of each ticket (see ticket_prefix).
df["TicketPrefix"] = df["Ticket"].map(ticket_prefix)
# TicketGroup: how many rows of the combined frame share the same ticket value.
df["TicketGroup"] = df["Ticket"].map(df["Ticket"].value_counts())


In [None]:
# 8) Deck = first character of the cabin string.
#    Missing cabins are filled with the literal "Unknown" *before* the first
#    character is taken, so passengers without a cabin end up with deck "U".
#    The replace step only rewrites a deck of "n" or "" to "Unknown".
df["Cabin"] = df["Cabin"].fillna("Unknown")
df["Deck"] = (
    df["Cabin"]
    .astype(str)
    .str.get(0)
    .replace({"n": "Unknown", "": "Unknown"})
)


In [None]:
# 9) Group survival signals learned from the labelled (train) rows
# For each grouping key (surname, ticket, family size, deck) every passenger gets the
# share of labelled passengers in the same group who survived.
#   * Test rows receive the plain train-set mean of their group.
#   * Train rows receive a leave-one-out mean: the passenger's own `Survived` value is
#     excluded. Including it would put the label inside the feature (a one-person
#     group would have rate == own label), which the model cannot rely on at test time.
#   * Rows whose group has no (other) labelled member fall back to a neutral 0.5.
# The *_stats tables and dicts are still built from all train rows, unchanged, because
# later cells (the rule engine) read them.
NEUTRAL_SURVIVAL_RATE = 0.5

# Train rows occupy the first len(train) positions of the combined frame.
is_train_row = pd.Series(np.arange(len(df)) < len(train), index=df.index)
train_rows = df.loc[is_train_row].copy()


def _survival_table(labelled, key, rate_col, count_col):
    """Return one row per `key` value with the mean and count of `Survived`."""
    return (
        labelled.groupby(key)["Survived"]
        .agg(["mean", "count"])
        .reset_index()
        .rename(columns={"mean": rate_col, "count": count_col})
    )


def _leak_free_rate(frame, key, is_train, fallback=NEUTRAL_SURVIVAL_RATE):
    """Survival rate of each row's `key` group, never using the row's own label.

    frame    : combined train+test frame containing `key` and `Survived`
    key      : column to group by
    is_train : boolean Series (aligned with `frame`) marking labelled rows
    fallback : value used when no other labelled group member exists
    """
    totals = frame.loc[is_train].groupby(key)["Survived"].agg(["sum", "count"])
    group_sum = frame[key].map(totals["sum"])
    group_count = frame[key].map(totals["count"])

    # Plain group mean over the train rows; NaN for groups unseen in train.
    train_mean = group_sum / group_count

    # Leave-one-out mean for train rows; undefined when the row is alone in its group.
    others = group_count - 1
    leave_one_out = ((group_sum - frame["Survived"]) / others).where(others > 0)

    return leave_one_out.where(is_train, train_mean).fillna(fallback)


def _train_group_count(frame, key, table, count_col):
    """Number of labelled rows sharing each row's `key` value (0 when unseen)."""
    return frame[key].map(table.set_index(key)[count_col]).fillna(0).astype(int)


# Per-group tables over the train rows (mean and count of Survived)
surname_stats = _survival_table(train_rows, "Surname", "SurnameSurvivalRate", "SurnameCount")
surname_stats_dict = surname_stats.set_index("Surname").to_dict(orient="index")

ticket_stats = _survival_table(train_rows, "Ticket", "TicketSurvivalRate", "TicketCount")
ticket_stats_dict = ticket_stats.set_index("Ticket").to_dict(orient="index")

familysize_stats = train_rows.groupby("FamilySize")["Survived"].mean().to_dict()

deck_stats = _survival_table(train_rows, "Deck", "DeckSurvivalRate", "DeckCount")
deck_stats_dict = deck_stats.set_index("Deck").to_dict(orient="index")

# Attach the signals to the combined frame
df["SurnameSurvivalRate"] = _leak_free_rate(df, "Surname", is_train_row)
df["SurnameCount"] = _train_group_count(df, "Surname", surname_stats, "SurnameCount")

df["TicketSurvivalRate"] = _leak_free_rate(df, "Ticket", is_train_row)
df["TicketCount"] = _train_group_count(df, "Ticket", ticket_stats, "TicketCount")

df["FamilySizeSurvivalRate"] = _leak_free_rate(df, "FamilySize", is_train_row)

df["DeckSurvivalRate"] = _leak_free_rate(df, "Deck", is_train_row)
df["DeckCount"] = _train_group_count(df, "Deck", deck_stats, "DeckCount")


In [None]:
# Safety net: rebuild derived columns only when they are absent from df.
# Each guard is skipped when the column already exists.

# Recreate Title if missing
if "Title" not in df.columns:
    df["Title"] = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df["Title"] = df["Title"].replace({
        'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs',
        'Countess':'Rare','Lady':'Rare','Dona':'Rare',
        'Don':'Rare','Sir':'Rare','Jonkheer':'Rare',
        'Capt':'Rare','Col':'Rare','Major':'Rare','Rev':'Rare','Dr':'Rare'
    })

# Recreate TicketPrefix2 if missing
if "TicketPrefix2" not in df.columns:
    # Same logic as the step-7 helper; defined again here, presumably so this
    # cell still works when step 7 has not been run in the current kernel.
    def ticket_prefix(t):
        """Return the non-numeric part of a ticket, or "NONE" when there is none."""
        t = str(t).replace('.', '').replace('/', '').strip()
        parts = t.split()
        pref = ''.join([p for p in parts if not p.isdigit()])
        return pref if pref != "" else "NONE"

    df["TicketPrefix"] = df["Ticket"].apply(ticket_prefix)
    # Keep the 20 most frequent prefixes; everything else is bucketed as "OTHER".
    top_ticket_prefixes = df["TicketPrefix"].value_counts().nlargest(20).index.tolist()
    df["TicketPrefix2"] = df["TicketPrefix"].apply(lambda x: x if x in top_ticket_prefixes else "OTHER")

# Recreate Deck if missing
if "Deck" not in df.columns:
    df["Cabin"] = df["Cabin"].fillna("Unknown")
    df["Deck"] = df["Cabin"].astype(str).str[0]
    # Assign the result back instead of calling replace(..., inplace=True) on the
    # column selection: the in-place form is chained assignment, which pandas
    # deprecates and which does not update df under copy-on-write.
    df["Deck"] = df["Deck"].replace("n", "Unknown")


In [None]:
# 10) Categorical encoding with bounded cardinality
# TicketPrefix2 keeps the 20 most frequent ticket prefixes and collapses the rest to "OTHER".
top_ticket_prefixes = list(df["TicketPrefix"].value_counts().nlargest(20).index)
df["TicketPrefix2"] = df["TicketPrefix"].where(
    df["TicketPrefix"].isin(top_ticket_prefixes), "OTHER"
)

# One-hot encode Title, TicketPrefix2 and Deck. The three source columns are replaced by
# their indicator columns, and drop_first=True omits the first level of each.
# NOTE: this cell is not re-runnable as-is, because the source columns no longer exist afterwards.
df = pd.get_dummies(
    data=df,
    columns=["Title", "TicketPrefix2", "Deck"],
    drop_first=True,
)


In [None]:
# 11) Model inputs: hand-picked numeric columns followed by every one-hot column
base_features = [
    # raw / lightly engineered passenger attributes
    "Pclass", "Sex", "Age", "Fare", "FamilySize", "IsAlone", "FarePerPerson",
    # group survival rates and train-set group sizes
    "SurnameSurvivalRate", "SurnameCount",
    "TicketSurvivalRate", "TicketCount",
    "FamilySizeSurvivalRate",
    "DeckSurvivalRate", "DeckCount",
    # number of passengers sharing the ticket (train + test)
    "TicketGroup",
]

# Indicator columns are picked up by their get_dummies prefix, in df column order.
dummies = [
    col for col in df.columns
    if col.startswith(("Title_", "TicketPrefix2_", "Deck_"))
]
features = [*base_features, *dummies]


In [None]:
# 12) Undo the concat: the first len(train) rows are the labelled set, the rest is test.
train_df = df.head(len(train)).copy()
test_df = df.iloc[len(train):].copy()

# 13) Model matrices: selected features, remaining NaNs set to 0, everything cast to float32.
X = train_df[features].fillna(0).astype("float32")
X_test = test_df[features].fillna(0).astype("float32")

# Target as integers
y = train_df["Survived"].astype(int)

In [None]:
# ---------------------------------------------------
# HARD RESET: drop repeated column names and rebuild
# everything that was derived from df
# ---------------------------------------------------

# Keep only the first occurrence of each column name.
df = df.loc[:, np.logical_not(df.columns.duplicated(keep="first"))]

# Labelled rows first, test rows after (same layout as the concat in step 3).
train_df = df.head(len(train)).copy()
test_df = df.iloc[len(train):].copy()

# Feature list: fixed numeric columns, then every one-hot indicator column.
base_features = [
    "Pclass", "Sex", "Age", "Fare", "FamilySize", "IsAlone", "FarePerPerson",
    "SurnameSurvivalRate", "SurnameCount",
    "TicketSurvivalRate", "TicketCount",
    "FamilySizeSurvivalRate", "DeckSurvivalRate", "DeckCount",
    "TicketGroup",
]
dummies = [
    col for col in df.columns
    if col.startswith(("Title_", "TicketPrefix2_", "Deck_"))
]

# Order-preserving de-duplication of the combined list.
features = pd.Index(base_features + dummies).drop_duplicates().tolist()

# Model matrices (NaN -> 0, float32) and integer target.
X = train_df.loc[:, features].fillna(0).astype("float32")
X_test = test_df.loc[:, features].fillna(0).astype("float32")
y = train_df["Survived"].astype(int)

print("Features Count:", len(features))
print("Unique Columns:", df.shape[1])
print("Shape X:", X.shape)


Features Count: 47
Unique Columns: 57
Shape X: (891, 47)


In [None]:
# Sanity check: names of columns that repeat an earlier column (empty list = none).
# The previous per-column lookup via get_loc() fails when a duplicate actually exists:
# get_loc() then returns a mask/slice rather than a single position, and the resulting
# array cannot be used as an `if` condition.
df.columns[df.columns.duplicated()].tolist()


[]

In [None]:
# 14) Train CatBoost
# Early stopping needs validation data to monitor, and the original fit() call supplied
# none. A stratified 20% hold-out is therefore used to (a) pick the number of trees via
# early stopping and (b) report an accuracy estimate. The final model is then refit on
# all labelled rows with that tree count.
# NOTE: the *SurvivalRate features are derived from training labels, so treat the
# hold-out score as a rough guide rather than an unbiased estimate.
catboost_params = dict(
    iterations=2000,
    learning_rate=0.03,
    depth=6,
    random_seed=42,
    verbose=0,
)

X_fit, X_val, y_fit, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

val_model = CatBoostClassifier(**catboost_params, early_stopping_rounds=100)
val_model.fit(X_fit, y_fit, eval_set=(X_val, y_val))

val_accuracy = accuracy_score(y_val, val_model.predict(X_val))

# get_best_iteration() is a 0-based index; fall back to the configured maximum if
# CatBoost reports no best iteration.
best_iteration = val_model.get_best_iteration()
n_trees = best_iteration + 1 if best_iteration is not None else catboost_params["iterations"]
print(f"Hold-out accuracy: {val_accuracy:.4f} | trees selected: {n_trees}")

# Final model on every labelled row
cat = CatBoostClassifier(**{**catboost_params, "iterations": n_trees})
cat.fit(X, y)

# Probability of the positive class (Survived == 1) for each test passenger
proba_test = cat.predict_proba(X_test)[:, 1]

# Store the ML output next to the test rows; the index is reset so test_df is 0..n-1
test_df = test_df.reset_index(drop=True)
test_df["ml_proba"] = proba_test
test_df["ml_pred"] = (test_df["ml_proba"] >= 0.5).astype(int)

In [None]:
# -----------------------------------------------------------
# STEP 15 — RULE-BASED OVERRIDE ENGINE
# -----------------------------------------------------------
# Rules are applied top to bottom and each one overwrites whatever an earlier
# rule wrote for the same passenger, so ORDER MATTERS: G beats E beats D, etc.
# Rows that no rule touches keep NaN and receive the ML prediction at the end.

test_df["Survived_final"] = np.nan

# A) Surname rule: surnames whose train members all survived / all died.
#    (The count condition is >= 1, so a single train passenger is enough.)
if "Surname" in test_df:
    survived_surnames = surname_stats.query(
        "SurnameSurvivalRate == 1.0 and SurnameCount >= 1"
    )["Surname"].tolist()
    dead_surnames = surname_stats.query(
        "SurnameSurvivalRate == 0.0 and SurnameCount >= 1"
    )["Surname"].tolist()

    test_df.loc[test_df["Surname"].isin(survived_surnames), "Survived_final"] = 1
    test_df.loc[test_df["Surname"].isin(dead_surnames), "Survived_final"] = 0

# B) Ticket rule: ticket survival rate of exactly 1 or exactly 0.
if "TicketSurvivalRate" in test_df:
    test_df.loc[test_df["TicketSurvivalRate"].eq(1.0), "Survived_final"] = 1
    test_df.loc[test_df["TicketSurvivalRate"].eq(0.0), "Survived_final"] = 0

# C) Children rule: anyone younger than 12 is marked as survived.
test_df.loc[test_df["Age"].lt(12), "Survived_final"] = 1

# D) Deck rule: very high / very low deck survival rate.
if "DeckSurvivalRate" in test_df:
    test_df.loc[test_df["DeckSurvivalRate"].gt(0.80), "Survived_final"] = 1
    test_df.loc[test_df["DeckSurvivalRate"].lt(0.20), "Survived_final"] = 0

# E) Third-class males older than 12 are marked as not survived.
#    (Age exactly 12 is covered by neither C nor E.)
third_class_adult_male = (
    test_df["Sex"].eq(0)
    & test_df["Pclass"].eq(3)
    & test_df["Age"].gt(12)
)
test_df.loc[third_class_adult_male, "Survived_final"] = 0

# F) Ticket-group rule. No cell shown in this notebook creates "TicketGroupSurvRate",
#    so this branch only runs if that column is added elsewhere.
if "TicketGroupSurvRate" in test_df:
    test_df.loc[test_df["TicketGroupSurvRate"].eq(1), "Survived_final"] = 1
    test_df.loc[test_df["TicketGroupSurvRate"].eq(0), "Survived_final"] = 0

# G) Very confident ML predictions win over every rule above.
test_df.loc[test_df["ml_proba"].gt(0.98), "Survived_final"] = 1
test_df.loc[test_df["ml_proba"].lt(0.02), "Survived_final"] = 0

# -----------------------------------------------------------
# Anything still undecided takes the thresholded ML prediction.
# -----------------------------------------------------------
test_df["Survived_final"] = (
    test_df["Survived_final"]
    .where(test_df["Survived_final"].notna(), test_df["ml_pred"])
    .astype(int)
)

print("Rule-based predictions applied successfully.")


Rule-based predictions applied successfully.


In [None]:
# -----------------------------------------------------------
# STEP 16 — Build the Kaggle submission table
# -----------------------------------------------------------

# Two integer columns: PassengerId and the final prediction renamed to "Survived",
# ordered by PassengerId with a fresh 0..n-1 index.
submission = (
    test_df[["PassengerId", "Survived_final"]]
    .astype(int)
    .rename(columns={"Survived_final": "Survived"})
    .sort_values(by="PassengerId", ignore_index=True)
)

print(submission.head())
print(submission.tail())
print("Submission shape:", submission.shape)


   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
     PassengerId  Survived
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1
Submission shape: (418, 2)


In [None]:
# -----------------------------------------------------------
# STEP 17 — Write the submission to disk
# -----------------------------------------------------------

# Relative path: the file is created in the current working directory.
# index=False keeps the row index out of the CSV.
submission_file = "submission.csv"
submission.to_csv(path_or_buf=submission_file, index=False)

print("Saved:", submission_file)


Saved: submission.csv
