TITANIC SURVIVAL PREDICTION

In [2]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

In [4]:
df = pd.read_csv('https://storage.googleapis.com/kagglesdsdata/datasets/1818188/2965537/Titanic-Dataset.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250413%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250413T064623Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=26302e0362569b5af04656f7e590af97e46670c8cc685e0bb484df89880721f63da3630a7575bae6a269b9a4e56e3a4e65be3b7d03b2ec51bac69eb9e49428b852f44f171d74e4dc9fc2c1872be89ca3d6f634ec666dcb2564bb71a775b13b099065ddb5a60cea000cc1cbfd7c327c57c03988512d156d273b67d1a29f9c2c9e4896cb621e864aca85600554ece57d01ca4d34ad4bc7aef5fd52c03aab3640e0e371bc72339d7292b740fe86be27b4ecdadd8e04c7235fe74b63eb49a358711b406ad1d9733fa526416ea8c0a2866f5b0085c5965198979fbe3890bdb79779f591911a30ef3a0ac7c9be0631360113ca9cbbd41a33c778db9c067483354d0aca')

DATA PROCESSING

In [5]:
# Drop unnecessary columns
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)


In [None]:
# Handle missing values
imputer = SimpleImputer(strategy='median')
df['Age'] = imputer.fit_transform(df[['Age']])

In [7]:
# One-hot encoding for categorical variables
df = pd.get_dummies(df, columns=['Sex', 'Embarked'])

In [8]:
# Scale numerical variables
scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

FEATURE ENGINEERING

In [9]:
# Calculate family size
df['FamilySize'] = df['SibSp'] + df['Parch']

In [10]:
# Create a binary feature indicating whether a passenger is traveling alone
df['IsAlone'] = np.where(df['FamilySize'] == 0, 1, 0)

MODEL SELECTION AND EVALUATION

In [11]:
# Split data into training and testing sets
X = df.drop('Survived', axis=1)
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(probability=True)
}

In [13]:
# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-score': f1_score(y_test, y_pred),
        'ROC-AUC score': roc_auc_score(y_test, y_pred_proba)
    }

HYPERPARAMETER TUNING

In [14]:
# Perform hyperparameter tuning using GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2']
}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

Best parameters: {'C': 0.1, 'penalty': 'l2'}
Best score: 0.801920614596671


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\1464y\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\1464y\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\1464y\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(sel