# Titanic Classification

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Loading Datasets
pd.set_option('display.max_columns',10,'display.width',1000)
dataframe = pd.read_csv('titanic.csv')
dataframe.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,...,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,...,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,...,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,...,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,...,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,...,0,373450,8.05,,S


In [3]:
dataframe.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')

In [4]:
dataframe.drop(['PassengerId','Name','Ticket','Cabin',],axis =1,inplace=True)

In [5]:
#Checking for Null values
dataframe.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [6]:
dataframe.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [7]:
dataframe = pd.get_dummies(dataframe, columns=['Sex','Embarked'])

In [8]:
dataframe = dataframe.dropna()

In [9]:
dataframe.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,...,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,...,False,True,False,False,True
1,1,1,38.0,1,0,...,True,False,True,False,False
2,1,3,26.0,0,0,...,True,False,False,False,True
3,1,1,35.0,1,0,...,True,False,False,False,True
4,0,3,35.0,0,0,...,False,True,False,False,True


In [10]:
#Description of dataset
dataframe.describe(include="all")

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,...,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
count,714.0,714.0,714.0,714.0,714.0,...,714,714,714,714,714
unique,,,,,,...,2,2,2,2,2
top,,,,,,...,False,True,False,False,True
freq,,,,,,...,453,453,584,686,554
mean,0.406162,2.236695,29.699118,0.512605,0.431373,...,,,,,
std,0.49146,0.83825,14.526497,0.929783,0.853289,...,,,,,
min,0.0,1.0,0.42,0.0,0.0,...,,,,,
25%,0.0,1.0,20.125,0.0,0.0,...,,,,,
50%,0.0,2.0,28.0,0.0,0.0,...,,,,,
75%,1.0,3.0,38.0,1.0,1.0,...,,,,,


In [11]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    714 non-null    int64  
 1   Pclass      714 non-null    int64  
 2   Age         714 non-null    float64
 3   SibSp       714 non-null    int64  
 4   Parch       714 non-null    int64  
 5   Fare        714 non-null    float64
 6   Sex_female  714 non-null    bool   
 7   Sex_male    714 non-null    bool   
 8   Embarked_C  714 non-null    bool   
 9   Embarked_Q  714 non-null    bool   
 10  Embarked_S  714 non-null    bool   
dtypes: bool(5), float64(2), int64(4)
memory usage: 42.5 KB


In [12]:
dataframe.corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,...,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
Survived,1.0,-0.359653,-0.077221,-0.017358,0.093317,...,0.538826,-0.538826,0.193607,-0.049549,-0.164235
Pclass,-0.359653,1.0,-0.369226,0.067247,0.025683,...,-0.15546,0.15546,-0.276294,0.132415,0.20398
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,...,-0.093254,0.093254,0.036261,-0.022405,-0.032523
SibSp,-0.017358,0.067247,-0.308247,1.0,0.38382,...,0.10395,-0.10395,-0.045462,0.051619,0.021751
Parch,0.093317,0.025683,-0.189119,0.38382,1.0,...,0.246972,-0.246972,-0.008846,-0.009126,0.015833
Fare,0.268189,-0.554182,0.096067,0.138329,0.205119,...,0.184994,-0.184994,0.299797,-0.062765,-0.253991
Sex_female,0.538826,-0.15546,-0.093254,0.10395,0.246972,...,1.0,-1.0,0.10158,0.02644,-0.115167
Sex_male,-0.538826,0.15546,0.093254,-0.10395,-0.246972,...,-1.0,1.0,-0.10158,-0.02644,0.115167
Embarked_C,0.193607,-0.276294,0.036261,-0.045462,-0.008846,...,0.10158,-0.10158,1.0,-0.09532,-0.877931
Embarked_Q,-0.049549,0.132415,-0.022405,0.051619,-0.009126,...,0.02644,-0.02644,-0.09532,1.0,-0.375934


In [13]:
x= dataframe.drop('Survived',axis = 1)
y = dataframe['Survived']

In [14]:
# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [15]:
# Standardize the features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [16]:

# Hyperparameter tuning for each model using GridSearchCV
def hyperparameter_tuning(model, params, x_train, y_train):
    grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(x_train, y_train)
    return grid_search.best_estimator_

In [17]:
mlp_params = {
    'hidden_layer_sizes': [(100,), (50, 50)],
    'alpha': [0.0001, 0.001],
    'max_iter': [600],  # Increased max_iter to 600
    'tol': [1e-3]       # Adjusted tolerance
}

In [18]:
models = [
    ('Logistic Regression', LogisticRegression(), {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear']}, True),
    ('K-Nearest Neighbors', KNeighborsClassifier(), {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}, True),
    ('Decision Tree', DecisionTreeClassifier(), {'max_depth': [None, 10, 20], 'min_samples_split': [2, 10, 20]}, False),
    ('Random Forest', RandomForestClassifier(), {'n_estimators': [100, 200], 'max_features': ['auto', 'sqrt', 'log2']}, False),
    ('Gradient Boosting', GradientBoostingClassifier(), {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]}, False),
    ('AdaBoost', AdaBoostClassifier(algorithm='SAMME'), {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}, False),
    ('Gaussian Naive Bayes', GaussianNB(), {}, False),
    ('Support Vector Classifier', SVC(probability=True), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}, True),
    ('MLP Classifier', MLPClassifier(max_iter=400, tol=1e-4), {'hidden_layer_sizes': [(100,), (50, 50)], 'alpha': [0.0001, 0.001],'max_iter':[600],'tol':[1e-3],}, True),
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis(), {}, True),
    ('Extra Trees Classifier', ExtraTreesClassifier(), {'n_estimators': [100, 200], 'max_features': ['auto', 'sqrt', 'log2']}, False),
    ('XGBoost', XGBClassifier(eval_metric='logloss'), {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]}, False),
]

In [19]:
# DataFrame to store accuracy results
results = pd.DataFrame(columns=['Model', 'Accuracy'])

# Fit models, predict, and evaluate
best_estimators = []
for name, model, params, use_scaled in models:
    if use_scaled:
        best_model = hyperparameter_tuning(model, params, x_train_scaled, y_train)
        best_estimators.append((name, best_model))
        best_model.fit(x_train_scaled, y_train)
        y_pred = best_model.predict(x_test_scaled)
    else:
        best_model = hyperparameter_tuning(model, params, x_train, y_train)
        best_estimators.append((name, best_model))
        best_model.fit(x_train, y_train)
        y_pred = best_model.predict(x_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    print("-" * 60)
    
    # Save the accuracy result
    result_df = pd.DataFrame({'Model': [name], 'Accuracy': [accuracy]})
    results = pd.concat([results, result_df], ignore_index=True)

Model: Logistic Regression
Accuracy: 0.7413
Confusion Matrix:
[[71 16]
 [21 35]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.82      0.79        87
           1       0.69      0.62      0.65        56

    accuracy                           0.74       143
   macro avg       0.73      0.72      0.72       143
weighted avg       0.74      0.74      0.74       143

------------------------------------------------------------
Model: K-Nearest Neighbors
Accuracy: 0.7902
Confusion Matrix:
[[73 14]
 [16 40]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83        87
           1       0.74      0.71      0.73        56

    accuracy                           0.79       143
   macro avg       0.78      0.78      0.78       143
weighted avg       0.79      0.79      0.79       143

------------------------------------------------------------
Model: Decision 

In [20]:
# Ensemble learning with the best estimators
ensemble_model = VotingClassifier(estimators=best_estimators, voting='soft')
ensemble_model.fit(x_train, y_train)
y_pred_ensemble = ensemble_model.predict(x_test)

accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
conf_matrix_ensemble = confusion_matrix(y_test, y_pred_ensemble)
class_report_ensemble = classification_report(y_test, y_pred_ensemble)

print(f"Model: Ensemble Model")
print(f"Accuracy: {accuracy_ensemble:.4f}")
print("Confusion Matrix:")
print(conf_matrix_ensemble)
print("Classification Report:")
print(class_report_ensemble)
print("-" * 60)

Model: Ensemble Model
Accuracy: 0.8042
Confusion Matrix:
[[76 11]
 [17 39]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.87      0.84        87
           1       0.78      0.70      0.74        56

    accuracy                           0.80       143
   macro avg       0.80      0.78      0.79       143
weighted avg       0.80      0.80      0.80       143

------------------------------------------------------------


In [21]:
# Save the accuracy result of the ensemble model
result_df_ensemble = pd.DataFrame({'Model': ['Ensemble Model'], 'Accuracy': [accuracy_ensemble]})
results = pd.concat([results, result_df_ensemble], ignore_index=True)
# Save results to a new DataFrame
results.to_csv('model_accuracies.csv', index=False)
print("Model accuracies saved to 'model_accuracies.csv'")

Model accuracies saved to 'model_accuracies.csv'


In [22]:
results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.741259
1,K-Nearest Neighbors,0.79021
2,Decision Tree,0.769231
3,Random Forest,0.762238
4,Gradient Boosting,0.79021
5,AdaBoost,0.734266
6,Gaussian Naive Bayes,0.776224
7,Support Vector Classifier,0.804196
8,MLP Classifier,0.804196
9,Linear Discriminant Analysis,0.762238
