In [16]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [63]:
df=pd.read_csv('titanic.csv')

In [64]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,WikiId,Name_wiki,Age_wiki,Hometown,Boarded,Destination,Lifeboat,Body,Class
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,S,691.0,"Braund, Mr. Owen Harris",22.0,"Bridgerule, Devon, England",Southampton,"Qu'Appelle Valley, Saskatchewan, Canada",,,3.0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,90.0,"Cumings, Mrs. Florence Briggs (née Thayer)",35.0,"New York, New York, US",Cherbourg,"New York, New York, US",4,,1.0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,S,865.0,"Heikkinen, Miss Laina",26.0,"Jyväskylä, Finland",Southampton,New York City,14?,,3.0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,S,127.0,"Futrelle, Mrs. Lily May (née Peel)",35.0,"Scituate, Massachusetts, US",Southampton,"Scituate, Massachusetts, US",D,,1.0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,S,627.0,"Allen, Mr. William Henry",35.0,"Birmingham, West Midlands, England",Southampton,New York City,,,3.0


In [65]:
df.shape

(1309, 21)

In [66]:
df=df.iloc[:, 0:12]

In [67]:
df.shape

(1309, 12)

In [68]:
df.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [69]:
df.dropna(subset=['Survived'], inplace=True)

In [70]:
df.shape

(891, 12)

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    float64
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(3), int64(4), object(5)
memory usage: 90.5+ KB


In [72]:
X=df.drop(columns=['Name', 'Ticket', 'Cabin', 'Survived'])

In [73]:
X

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.2500,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.9250,S
3,4,1,female,35.0,1,0,53.1000,S
4,5,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,887,2,male,27.0,0,0,13.0000,S
887,888,1,female,19.0,0,0,30.0000,S
888,889,3,female,,1,2,23.4500,S
889,890,1,male,26.0,0,0,30.0000,C


In [74]:
y=df['Survived']
y

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: Survived, Length: 891, dtype: float64

In [75]:
numeric_c=X.select_dtypes(include=['int64', 'float64']).columns
categorical_c=X.select_dtypes(include=['object']).columns

In [76]:
numeric_c

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [77]:
categorical_c

Index(['Sex', 'Embarked'], dtype='object')

In [78]:
numeric_transformer=Pipeline(steps=[
    ('si', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer=Pipeline(steps=[
    ('si', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

In [79]:
col_transformer=ColumnTransformer(
    transformers=[
        ('trf1', numeric_transformer, numeric_c),
        ('trf2', categorical_transformer, categorical_c )
        
    ], remainder='passthrough'
)

In [80]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)

In [81]:
col_transformer.fit(X_train, y_train)

In [95]:
X_train=col_transformer.fit_transform(X_train)

In [96]:
X_train

array([[-1.05999611,  0.82989695, -0.64080618, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.19312106, -1.56720904,  3.22307462, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.33267449,  0.82989695, -0.33169572, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-0.86419655,  0.82989695, -0.09986287, ...,  1.        ,
         0.        ,  1.        ],
       [-0.13973818,  0.82989695, -0.64080618, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.66695599,  0.82989695,  0.44108044, ...,  1.        ,
         0.        ,  1.        ]])

In [97]:
X_test=col_transformer.fit_transform(X_test)

In [98]:
X_test

array([[-0.56829463,  0.81746073, -0.06401715, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.11123029,  0.81746073,  0.08668077, ...,  1.        ,
         0.        ,  1.        ],
       [-1.09030913,  0.81746073, -0.29006402, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 1.49328438,  0.81746073,  0.91551931, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.69891449, -0.37217724, -0.13936611, ...,  0.        ,
         0.        ,  1.        ],
       [-1.15083255,  0.81746073, -0.21471506, ...,  1.        ,
         0.        ,  1.        ]])

In [99]:
from scipy.stats import norm

In [100]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import cross_val_score


In [101]:
models = {
    'logistic_regression': LogisticRegression(),
    'random_forest': RandomForestClassifier()
}


In [102]:
def bayesian_optimization(model, param_space, n_iter=50):
    def surrogate_func(x, model):
        mean, std=model.predict(x, return_std=True)
        return mean+std
    
    #considering exploration factor to be 1.96 that is the 99.75 percentile value
    def acquisition_func(x, model):
        mean, std=model.predict(x, return_std=True)
        return mean+1.96*std
    
    def optimize_acquisition(model, param_space):
        best_value=float('-inf')
        best_params=None
        
        for _ in range(100):
            params= {key: int(np.random.uniform(low, high)) if isinstance(low, int) 
                    else np.random.uniform(low, high)
                    for key, (low, high) in param_space.items()}
        
            value=acquisition_func(np.array([list(params.values())]), model)
            
            if value>best_value:
                best_value=value
                best_params=params
            return best_params
    X = []
    y = []

    kernel = Matern(length_scale_bounds=(1e-6, 1e2), nu=2.5)
    surrogate_model = GaussianProcessRegressor(kernel=kernel, alpha=1e-5, normalize_y=True, n_restarts_optimizer=10)

    for i in range(n_iter):
        if i > 0:
            params = optimize_acquisition(surrogate_model, param_space)
        else:
            params = {key: int(np.random.uniform(low, high)) if isinstance(low, int) else np.random.uniform(low, high) 
                      for key, (low, high) in param_space.items()}

        model.set_params(**params)
        model.fit(X_train, y_train)
        score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
        X.append(list(params.values()))
        y.append(score)

        if i > 0:
            surrogate_model.fit(np.array(X), np.array(y))

    best_params = X[np.argmax(y)]
    best_params_dict = {key: best_params[i] for i, key in enumerate(param_space.keys())}
    return best_params_dict 
        

In [104]:
def evaluate_model(model, X, y):
    cv_scores=cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    return cv_scores.mean()


In [109]:
param_space_lr = {
    'C': (0.01, 10),
    'max_iter': (100, 500)
}

best_params_lr = bayesian_optimization(models['logistic_regression'], param_space_lr)
print("Best parameters for Logistic Regression:", best_params_lr)

model_lr = models['logistic_regression']
model_lr.set_params(**best_params_lr)
model_lr.fit(X_train, y_train)
roc_auc_lr = evaluate_model(model_lr, X_test, y_test)
print("Optimized ROC AUC for Logistic Regression:", roc_auc_lr)




Best parameters for Logistic Regression: {'C': 0.41401305234139124, 'max_iter': 469}
Optimized ROC AUC for Logistic Regression: 0.8715257936507937


In [110]:
param_space_rf = {
    'n_estimators': (10, 300),
    'max_depth': (1, 10)
}

best_params_rf = bayesian_optimization(models['random_forest'], param_space_rf)
print("Best parameters for Random Forest:", best_params_rf)

model_rf = models['random_forest']
model_rf.set_params(**best_params_rf)
model_rf.fit(X_train, y_train)
roc_auc_rf = evaluate_model(model_rf, X_test, y_test)
print("Optimized ROC AUC for Random Forest:", roc_auc_rf)

Best parameters for Random Forest: {'n_estimators': 142, 'max_depth': 9}
Optimized ROC AUC for Random Forest: 0.8613650793650794
