### Importing Basic Python Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [2]:
# Ignore warnings in below cell runs
import warnings
warnings.filterwarnings('ignore')

### Read Input File for Training and understanding the Dataset

In [3]:
train_df = pd.read_csv("./titanic/train.csv", sep=",")
test_df = pd.read_csv("./titanic/test.csv", sep=",")
gender_sub_df = pd.read_csv("./titanic/gender_submission.csv", sep=",")

train_df.shape, test_df.shape, gender_sub_df.shape

((891, 12), (418, 11), (418, 2))

In [4]:
test_df = pd.merge(test_df, gender_sub_df[["PassengerId", "Survived"]], on = "PassengerId", how = 'left')

In [5]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
# Here we can use strip if needed
for col in train_df.columns:
    print("Number of null values present in the", col, " : ",train_df[col].isnull().sum())

print("Total number of Null/Blank Values is:", train_df.isnull().sum().sum())
print("Number of rows with any blank value:", train_df[train_df.isnull().any(axis=1)].count()) # Check each row and identify blank in atleast one column

Number of null values present in the PassengerId  :  0
Number of null values present in the Survived  :  0
Number of null values present in the Pclass  :  0
Number of null values present in the Name  :  0
Number of null values present in the Sex  :  0
Number of null values present in the Age  :  177
Number of null values present in the SibSp  :  0
Number of null values present in the Parch  :  0
Number of null values present in the Ticket  :  0
Number of null values present in the Fare  :  0
Number of null values present in the Cabin  :  687
Number of null values present in the Embarked  :  2
Total number of Null/Blank Values is: 866
Number of rows with any blank value: PassengerId    708
Survived       708
Pclass         708
Name           708
Sex            708
Age            531
SibSp          708
Parch          708
Ticket         708
Fare           708
Cabin           21
Embarked       706
dtype: int64


In [8]:
# Here we can use strip if needed
for col in test_df.columns:
    print("Number of null values present in the", col, " : ",test_df[col].isnull().sum())

print("Total number of Null/Blank Values is:", test_df.isnull().sum().sum())
print("Number of rows with any blank value:", train_df[train_df.isnull().any(axis=1)].count()) # Check each row and identify blank in atleast one column

Number of null values present in the PassengerId  :  0
Number of null values present in the Pclass  :  0
Number of null values present in the Name  :  0
Number of null values present in the Sex  :  0
Number of null values present in the Age  :  86
Number of null values present in the SibSp  :  0
Number of null values present in the Parch  :  0
Number of null values present in the Ticket  :  0
Number of null values present in the Fare  :  1
Number of null values present in the Cabin  :  327
Number of null values present in the Embarked  :  0
Number of null values present in the Survived  :  0
Total number of Null/Blank Values is: 414
Number of rows with any blank value: PassengerId    708
Survived       708
Pclass         708
Name           708
Sex            708
Age            531
SibSp          708
Parch          708
Ticket         708
Fare           708
Cabin           21
Embarked       706
dtype: int64


In [9]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
pd.crosstab(train_df['Sex'], train_df['Survived'])

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [11]:
train_df_corr = train_df[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']].corr()

train_df_corr

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Pclass,-0.035144,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,-0.5495,0.096067,0.159651,0.216225,1.0


### Feature Engineering & Data Visualization

In [None]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.models import ColumnDataSource, HoverTool

def plot_null_values(df):
    """
    Generates a Bokeh bar graph visualizing null values per attribute in a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
    """
    null_counts = df.isnull().sum()
    data_types = df.dtypes
    columns = df.columns

    summary_df = pd.DataFrame({"Attributes" : columns, 'Null Count': null_counts, 'Data Type': data_types})
    # print(summary_df)

    summary_df1 = pd.DataFrame()
    for i_in, i in summary_df.iterrows():
        if i["Null Count"] != 0:
            summary_df1 = pd.concat([summary_df1, i.to_frame().T], ignore_index=True)

    attributes = list(summary_df1["Attributes"])
    counts = list(summary_df1["Null Count"])
    d_types = list(summary_df1["Data Type"].astype(str)) # Convert data types to strings

    source = ColumnDataSource(data=dict(attributes=attributes, counts=counts, data_type=d_types)) # Added 'data_type'

    p = figure(x_range=attributes, height=350, width=1000, title="Null Values per Attribute", toolbar_location="below", sizing_mode="stretch_both")

    p.vbar(x='attributes', top='counts', width=0.9, source=source)
    hover = HoverTool(tooltips=[("Null Count", "@counts"), ("Data Type", "@data_type")]) # Changed "@d_types" to "@data_type"
    p.add_tools(hover)
    p.xgrid.grid_line_color = None
    p.y_range.start = 0
    p.xaxis.axis_label = "Attributes"
    p.yaxis.axis_label = "Number of Null Values"

    show(p)


In [34]:
plot_null_values(train_df)

In [35]:
plot_null_values(test_df)

In [13]:
train_df_x = train_df[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']] #'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'
train_df_y = train_df[['Survived']]

In [14]:
train_df_x["Age"].fillna(0) # In general this fillna() works but this is not working, so we are using different process to replace the "NaN"
train_df_x["Age"] = np.where(train_df_x["Age"].isin(["NaN", np.nan]), 0, train_df_x["Age"])
train_df_x["Age"].isnull().sum() # the np.where is working in this case

np.int64(0)

In [15]:
test_df_x = test_df[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']] #'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'
test_df_y = test_df[['Survived']]

In [16]:
test_df_x["Age"] = np.where(test_df_x["Age"].isin(["NaN", np.nan]), 0, test_df_x["Age"])
test_df_x["Fare"] = np.where(test_df_x["Fare"].isin(["NaN", np.nan]), 0, test_df_x["Fare"])

### Models and their comparisons

In [17]:
# Metric calculator
def metrics_calculation(y_true, y_pred):
    output = {}
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() # 
    output["True Negative"] = int(str(tn))
    output["False Positive"] = int(str(fp))
    output["False Negative"] = int(str(fn))
    output["True Positive"] = int(str(tp))
    
    train_df_results = pd.DataFrame({'Y_true': y_true, 'Y_pred': y_pred})

    # titanic_crosstab = pd.crosstab(train_df_results.Y_pred, train_df_results.Y_train)

    # print("-"*50)

    acc = accuracy_score(train_df_results.Y_true, train_df_results.Y_pred)
    prec = precision_score(train_df_results.Y_true, train_df_results.Y_pred)
    recall = recall_score(train_df_results.Y_true, train_df_results.Y_pred)

    output["Accuracy"] = acc
    output["Precision"] = prec
    output["Recall"] = recall

    print(classification_report(y_true, y_pred))
    print("-"*80)

    return output

#### Logistic Regression : https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logis_reg = LogisticRegression()

In [19]:
logis_reg.fit(train_df_x, train_df_y)

In [20]:
print(logis_reg.intercept_) # Displays the intercept contain in the estimated ("fitted") object from the 'LogisticRegression' class.
print(logis_reg.coef_) # Displays the coefficients contained in the estimated ("fitted") object from the 'LogisticRegression' class.
feature_name = train_df_x.columns.values # Stores the names of the columns of a dataframe in a variable.

[1.77681388]
[[-1.63335032e-04 -8.47245979e-01 -1.76359617e-02 -1.89825024e-01
   2.73864294e-01  4.72616441e-03]]


In [21]:
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name)
# Creates a dataframe with a column titled 'Feature name' and row values contained in the 'feature_name' variable.
summary_table['Coefficients'] = np.transpose(logis_reg.coef_)
# Creates a new column in the dataframe, called 'Coefficients',
# with row values the transposed coefficients from the 'LogisticRegression' object.
summary_table.index = summary_table.index + 1
# Increases the index of every row of the dataframe with 1.
summary_table.loc[0] = ['Intercept', logis_reg.intercept_[0]]
# Assigns values of the row with index 0 of the dataframe.
summary_table = summary_table.sort_index()
# Sorts the dataframe by index.
summary_table

Unnamed: 0,Feature name,Coefficients
0,Intercept,1.776814
1,PassengerId,-0.000163
2,Pclass,-0.847246
3,Age,-0.017636
4,SibSp,-0.189825
5,Parch,0.273864
6,Fare,0.004726


In [22]:
y_train_hat = logis_reg.predict(train_df_x)
metrics_calculation(train_df_y.Survived, y_train_hat)

              precision    recall  f1-score   support

           0       0.72      0.86      0.78       549
           1       0.67      0.46      0.55       342

    accuracy                           0.71       891
   macro avg       0.69      0.66      0.66       891
weighted avg       0.70      0.71      0.69       891

--------------------------------------------------------------------------------


{'True Negative': 471,
 'False Positive': 78,
 'False Negative': 184,
 'True Positive': 158,
 'Accuracy': 0.7059483726150393,
 'Precision': 0.6694915254237288,
 'Recall': 0.4619883040935672}

In [23]:
y_test_hat = logis_reg.predict(test_df_x)
metrics_calculation(test_df_y.Survived, y_test_hat)

              precision    recall  f1-score   support

           0       0.68      0.80      0.73       266
           1       0.50      0.35      0.41       152

    accuracy                           0.63       418
   macro avg       0.59      0.57      0.57       418
weighted avg       0.61      0.63      0.62       418

--------------------------------------------------------------------------------


{'True Negative': 212,
 'False Positive': 54,
 'False Negative': 99,
 'True Positive': 53,
 'Accuracy': 0.6339712918660287,
 'Precision': 0.4953271028037383,
 'Recall': 0.34868421052631576}

#### Decision Tree : https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [24]:
from sklearn.tree import DecisionTreeClassifier

decision_tr = DecisionTreeClassifier(max_depth = 4)
decision_tr.fit(train_df_x, train_df_y)

In [25]:
y_train_hat = decision_tr.predict(train_df_x)
metrics_calculation(train_df_y.Survived, y_train_hat)

              precision    recall  f1-score   support

           0       0.78      0.80      0.79       549
           1       0.67      0.64      0.65       342

    accuracy                           0.74       891
   macro avg       0.72      0.72      0.72       891
weighted avg       0.74      0.74      0.74       891

--------------------------------------------------------------------------------


{'True Negative': 439,
 'False Positive': 110,
 'False Negative': 123,
 'True Positive': 219,
 'Accuracy': 0.7384960718294051,
 'Precision': 0.6656534954407295,
 'Recall': 0.6403508771929824}

In [26]:
y_test_hat = logis_reg.predict(test_df_x)
metrics_calculation(test_df_y.Survived, y_test_hat)

              precision    recall  f1-score   support

           0       0.68      0.80      0.73       266
           1       0.50      0.35      0.41       152

    accuracy                           0.63       418
   macro avg       0.59      0.57      0.57       418
weighted avg       0.61      0.63      0.62       418

--------------------------------------------------------------------------------


{'True Negative': 212,
 'False Positive': 54,
 'False Negative': 99,
 'True Positive': 53,
 'Accuracy': 0.6339712918660287,
 'Precision': 0.4953271028037383,
 'Recall': 0.34868421052631576}

#### Naive Bayes : https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

In [27]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(train_df_x, train_df_y)

In [28]:
y_train_hat = decision_tr.predict(train_df_x)
metrics_calculation(train_df_y.Survived, y_train_hat)

              precision    recall  f1-score   support

           0       0.78      0.80      0.79       549
           1       0.67      0.64      0.65       342

    accuracy                           0.74       891
   macro avg       0.72      0.72      0.72       891
weighted avg       0.74      0.74      0.74       891

--------------------------------------------------------------------------------


{'True Negative': 439,
 'False Positive': 110,
 'False Negative': 123,
 'True Positive': 219,
 'Accuracy': 0.7384960718294051,
 'Precision': 0.6656534954407295,
 'Recall': 0.6403508771929824}

In [29]:
y_test_hat = logis_reg.predict(test_df_x)
metrics_calculation(test_df_y.Survived, y_test_hat)

              precision    recall  f1-score   support

           0       0.68      0.80      0.73       266
           1       0.50      0.35      0.41       152

    accuracy                           0.63       418
   macro avg       0.59      0.57      0.57       418
weighted avg       0.61      0.63      0.62       418

--------------------------------------------------------------------------------


{'True Negative': 212,
 'False Positive': 54,
 'False Negative': 99,
 'True Positive': 53,
 'Accuracy': 0.6339712918660287,
 'Precision': 0.4953271028037383,
 'Recall': 0.34868421052631576}

### Hyper-Parameter Tuning : 
#### a. Grid-Search : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
#### b. Random-Search :https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

In [30]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth': [2, 4, 5, 7, 9, 10]} # set the possible max depth parameter

grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
grid_search.fit(train_df_x, train_df_y)

grid_search.best_params_

{'max_depth': 4}

In [31]:
for i in range(6):
    print('Parameters: ', grid_search.cv_results_['params'][i])

    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'max_depth': 2}
Mean Test Score:  0.6767676767676768
Rank:  2
Parameters:  {'max_depth': 4}
Mean Test Score:  0.6857463524130191
Rank:  1
Parameters:  {'max_depth': 5}
Mean Test Score:  0.5858585858585859
Rank:  4
Parameters:  {'max_depth': 7}
Mean Test Score:  0.5892255892255892
Rank:  3
Parameters:  {'max_depth': 9}
Mean Test Score:  0.5757575757575758
Rank:  5
Parameters:  {'max_depth': 10}
Mean Test Score:  0.5634118967452301
Rank:  6


In [32]:
decision_tree_model = DecisionTreeClassifier(max_depth = grid_search.best_params_['max_depth']).fit(train_df_x, train_df_y)

In [33]:
y_test_hat = decision_tree_model.predict(test_df_x)
metrics_calculation(test_df_y.Survived, y_test_hat)

              precision    recall  f1-score   support

           0       0.74      0.64      0.68       266
           1       0.49      0.61      0.54       152

    accuracy                           0.62       418
   macro avg       0.61      0.62      0.61       418
weighted avg       0.65      0.62      0.63       418

--------------------------------------------------------------------------------


{'True Negative': 169,
 'False Positive': 97,
 'False Negative': 60,
 'True Positive': 92,
 'Accuracy': 0.6244019138755981,
 'Precision': 0.48677248677248675,
 'Recall': 0.6052631578947368}