In [13]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings

## Import the data as a pandas DataFrame

In [3]:
df = pd.read_csv("../data/processed/cleaned_data.csv")

In [4]:
df.head()

Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,identity,alone,survived
0,male,22,1,0,7.25,s,third,man,0,0
1,female,38,1,0,71.28,c,first,woman,0,1
2,female,26,0,0,7.92,s,third,woman,1,1
3,female,35,1,0,53.1,s,first,woman,0,1
4,male,35,0,0,8.05,s,third,man,1,0


In [5]:
X = df.drop('survived', axis= 1)
y = df['survived']

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [8]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns


numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()


In [9]:
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [10]:
X = preprocessor.fit_transform(X)

In [16]:
X

array([[ 0.        ,  1.        ,  0.        , ..., -0.47367361,
        -0.50244272, -1.2316449 ],
       [ 1.        ,  0.        ,  1.        , ..., -0.47367361,
         0.7867767 , -1.2316449 ],
       [ 1.        ,  0.        ,  0.        , ..., -0.47367361,
        -0.48895252,  0.81192233],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  2.00893337,
        -0.17626195, -1.2316449 ],
       [ 0.        ,  1.        ,  1.        , ..., -0.47367361,
        -0.04438022,  0.81192233],
       [ 0.        ,  1.        ,  0.        , ..., -0.47367361,
        -0.49237541,  0.81192233]], shape=(891, 16))

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((712, 16), (179, 16))

## Create an Evaluate Function to give all metrics after model Training

In [None]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    return accuracy, precision, recall

In [27]:
cat_boost = CatBoostClassifier(verbose= False)

In [30]:
import optuna
from sklearn.model_selection import cross_val_score

# Define models
models = {
    "Logistic Regression": LogisticRegression,
    "K-Neighbors Classifier": KNeighborsClassifier,
    "Decision Tree": DecisionTreeClassifier,
    "Random Forest Classifier": RandomForestClassifier,
    "XGBClassifier": XGBClassifier,
    "AdaBoost Classifier": AdaBoostClassifier
}

model_list = []
accuracy_list = []

# Dictionary of hyperparameter search spaces
def get_search_space(trial, model_name):
    if model_name == "Logistic Regression":
        return {
            "C": trial.suggest_float("C", 1e-3, 10.0, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"])
        }
    elif model_name == "K-Neighbors Classifier":
        return {
            "n_neighbors": trial.suggest_int("n_neighbors", 1, 30),
            "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
            "metric": trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"])
        }
    elif model_name == "Decision Tree":
        return {
            "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
            "max_depth": trial.suggest_int("max_depth", 1, 20),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20)
        }
    elif model_name == "Random Forest Classifier":
        return {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 2, 20),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20)
        }
    elif model_name == "XGBClassifier":
        return {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 2, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
        }
    # elif model_name == "CatBoosting Classifier":
    #     return {
    #         "iterations": trial.suggest_int("iterations", 50, 300),
    #         "depth": trial.suggest_int("depth", 2, 10),
    #         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3)
    #     }
    elif model_name == "AdaBoost Classifier":
        return {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 2.0)
        }





In [31]:
# Loop through models
for name, Model in models.items():
    print(f"Tuning {name} ...")

    def objective(trial):
        params = get_search_space(trial, name)
        model = Model(**params)
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
        return np.mean(scores)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=120)

    best_params = study.best_params
    best_score = study.best_value

    print(f"{name} best params: {best_params}")
    print(f"{name} best CV accuracy: {best_score:.4f}")
    print("="*40)

    model_list.append(name)
    accuracy_list.append(best_score)

[I 2025-10-03 09:33:09,976] A new study created in memory with name: no-name-efce5ded-97c1-4a80-a095-c1534868e8e9
[I 2025-10-03 09:33:10,049] Trial 0 finished with value: 0.8187826258248794 and parameters: {'C': 0.07918143743763476, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8187826258248794.


Tuning Logistic Regression ...


[I 2025-10-03 09:33:10,167] Trial 1 finished with value: 0.7134836993991923 and parameters: {'C': 0.002717919451038204, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8187826258248794.
[I 2025-10-03 09:33:10,257] Trial 2 finished with value: 0.8117797695262485 and parameters: {'C': 0.029846401115682365, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8187826258248794.
[I 2025-10-03 09:33:10,636] Trial 3 finished with value: 0.8201910765291046 and parameters: {'C': 5.466133732085814, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.8201910765291046.
[I 2025-10-03 09:33:10,789] Trial 4 finished with value: 0.8089628681177977 and parameters: {'C': 0.023666785117871507, 'solver': 'liblinear'}. Best is trial 3 with value: 0.8201910765291046.
[I 2025-10-03 09:33:10,924] Trial 5 finished with value: 0.8201910765291046 and parameters: {'C': 2.4522268551269155, 'solver': 'liblinear'}. Best is trial 3 with value: 0.8201910765291046.
[I 2025-10-03 09:33:11,057] Trial 6 finished with val

Logistic Regression best params: {'C': 9.990616818555862, 'solver': 'liblinear'}
Logistic Regression best CV accuracy: 0.8216
Tuning K-Neighbors Classifier ...


[I 2025-10-03 09:33:19,315] Trial 2 finished with value: 0.8047572146163695 and parameters: {'n_neighbors': 16, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 0 with value: 0.8145868216290753.
[I 2025-10-03 09:33:19,389] Trial 3 finished with value: 0.8202107751403526 and parameters: {'n_neighbors': 18, 'weights': 'uniform', 'metric': 'minkowski'}. Best is trial 3 with value: 0.8202107751403526.
[I 2025-10-03 09:33:19,555] Trial 4 finished with value: 0.8188220230473753 and parameters: {'n_neighbors': 20, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 3 with value: 0.8202107751403526.
[I 2025-10-03 09:33:19,640] Trial 5 finished with value: 0.7935487048163105 and parameters: {'n_neighbors': 30, 'weights': 'distance', 'metric': 'minkowski'}. Best is trial 3 with value: 0.8202107751403526.
[I 2025-10-03 09:33:19,744] Trial 6 finished with value: 0.7935388555106865 and parameters: {'n_neighbors': 28, 'weights': 'distance', 'metric': 'minkowski'}. Best is trial 3

K-Neighbors Classifier best params: {'n_neighbors': 16, 'weights': 'uniform', 'metric': 'manhattan'}
K-Neighbors Classifier best CV accuracy: 0.8300
Tuning Decision Tree ...


[I 2025-10-03 09:33:31,842] Trial 3 finished with value: 0.8159558751108046 and parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 11, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.821619225844578.
[I 2025-10-03 09:33:31,925] Trial 4 finished with value: 0.7991726583275879 and parameters: {'criterion': 'entropy', 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.821619225844578.
[I 2025-10-03 09:33:32,010] Trial 5 finished with value: 0.8131980695360976 and parameters: {'criterion': 'gini', 'max_depth': 18, 'min_samples_split': 19, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.821619225844578.
[I 2025-10-03 09:33:32,095] Trial 6 finished with value: 0.8019403132079189 and parameters: {'criterion': 'entropy', 'max_depth': 11, 'min_samples_split': 15, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.821619225844578.
[I 2025-10-03 09:33:32,157] Trial 7 finished with value: 0.8159854230276766 and paramete

Decision Tree best params: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_split': 6, 'min_samples_leaf': 16}
Decision Tree best CV accuracy: 0.8286
Tuning Random Forest Classifier ...


[I 2025-10-03 09:33:42,443] Trial 0 finished with value: 0.828612232837585 and parameters: {'n_estimators': 109, 'max_depth': 8, 'max_features': None, 'min_samples_split': 20}. Best is trial 0 with value: 0.828612232837585.
[I 2025-10-03 09:33:50,039] Trial 1 finished with value: 0.8314488328572835 and parameters: {'n_estimators': 248, 'max_depth': 17, 'max_features': None, 'min_samples_split': 10}. Best is trial 1 with value: 0.8314488328572835.
[I 2025-10-03 09:33:52,207] Trial 2 finished with value: 0.8258150300403821 and parameters: {'n_estimators': 81, 'max_depth': 20, 'max_features': None, 'min_samples_split': 17}. Best is trial 1 with value: 0.8314488328572835.
[I 2025-10-03 09:33:54,876] Trial 3 finished with value: 0.8272037821333595 and parameters: {'n_estimators': 103, 'max_depth': 3, 'max_features': None, 'min_samples_split': 12}. Best is trial 1 with value: 0.8314488328572835.
[I 2025-10-03 09:33:59,544] Trial 4 finished with value: 0.8258051807347583 and parameters: {'n_e

Random Forest Classifier best params: {'n_estimators': 141, 'max_depth': 6, 'max_features': None, 'min_samples_split': 10}
Random Forest Classifier best CV accuracy: 0.8385
Tuning XGBClassifier ...


[I 2025-10-03 09:35:42,817] Trial 0 finished with value: 0.8202403230572244 and parameters: {'n_estimators': 299, 'max_depth': 4, 'learning_rate': 0.06515978063767637, 'subsample': 0.8573420154015183, 'colsample_bytree': 0.5821516309991803}. Best is trial 0 with value: 0.8202403230572244.
[I 2025-10-03 09:35:43,973] Trial 1 finished with value: 0.8216487737614498 and parameters: {'n_estimators': 149, 'max_depth': 5, 'learning_rate': 0.09225790958995385, 'subsample': 0.983719618746181, 'colsample_bytree': 0.5694868073581532}. Best is trial 1 with value: 0.8216487737614498.
[I 2025-10-03 09:35:44,571] Trial 2 finished with value: 0.8146065202403232 and parameters: {'n_estimators': 122, 'max_depth': 4, 'learning_rate': 0.1593903807555641, 'subsample': 0.7506679003879086, 'colsample_bytree': 0.8260312050177054}. Best is trial 1 with value: 0.8216487737614498.
[I 2025-10-03 09:35:46,332] Trial 3 finished with value: 0.8202501723628485 and parameters: {'n_estimators': 231, 'max_depth': 7, 'l

XGBClassifier best params: {'n_estimators': 76, 'max_depth': 3, 'learning_rate': 0.06948785288468902, 'subsample': 0.9161273911886779, 'colsample_bytree': 0.5976360222247314}
XGBClassifier best CV accuracy: 0.8371
Tuning AdaBoost Classifier ...


[I 2025-10-03 09:37:11,193] Trial 0 finished with value: 0.8117797695262485 and parameters: {'n_estimators': 206, 'learning_rate': 1.2500634100771955}. Best is trial 0 with value: 0.8117797695262485.
[I 2025-10-03 09:37:19,322] Trial 1 finished with value: 0.8187924751305033 and parameters: {'n_estimators': 283, 'learning_rate': 0.6205791687645112}. Best is trial 1 with value: 0.8187924751305033.
[I 2025-10-03 09:37:25,555] Trial 2 finished with value: 0.8145966709346991 and parameters: {'n_estimators': 237, 'learning_rate': 0.6015493658915486}. Best is trial 1 with value: 0.8187924751305033.
[I 2025-10-03 09:37:31,549] Trial 3 finished with value: 0.8103811681276468 and parameters: {'n_estimators': 230, 'learning_rate': 1.0814525944179811}. Best is trial 1 with value: 0.8187924751305033.
[I 2025-10-03 09:37:36,798] Trial 4 finished with value: 0.8187924751305033 and parameters: {'n_estimators': 189, 'learning_rate': 0.27419556442134857}. Best is trial 1 with value: 0.8187924751305033.

AdaBoost Classifier best params: {'n_estimators': 51, 'learning_rate': 1.6484164264068684}
AdaBoost Classifier best CV accuracy: 0.8287


In [32]:
# Summary
print("Summary of Best Scores:")
for model, acc in zip(model_list, accuracy_list):
    print(f"{model}: {acc:.4f}")


Summary of Best Scores:
Logistic Regression: 0.8216
K-Neighbors Classifier: 0.8300
Decision Tree: 0.8286
Random Forest Classifier: 0.8385
XGBClassifier: 0.8371
AdaBoost Classifier: 0.8287


In [35]:
params = {'n_estimators': 141, 'max_depth': 6, 'max_features': None, 'min_samples_split': 10}
model = RandomForestClassifier(**params)
model.fit(X_train, y_train)

0,1,2
,n_estimators,141
,criterion,'gini'
,max_depth,6
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [36]:
y_pred = model.predict(X_test)

In [39]:
from sklearn.metrics import classification_report

In [40]:
print("\nClassification Report\n")

print(classification_report(y_test, y_pred))


Classification Report

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       105
           1       0.82      0.69      0.75        74

    accuracy                           0.81       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179

