In [53]:
# Import required libraries and dependencies
import os
import pandas as pd
import numpy as np
import hvplot.pandas
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [54]:
# Read the raw CSV from the Resources folder into a DataFrame
raw_csv_path = Path("Resources") / "raw_dataset.csv"
df_raw_data = pd.read_csv(raw_csv_path)

# Preview the first ten rows
df_raw_data.head(10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body
0,1,1,Allen. Miss. Elisabeth Walton,female,29.0,0,0,24160,211.3375,B5,S,2,
1,1,1,Allison. Master. Hudson Trevor,male,0.9167,1,2,113781,151.55,C22 C26,S,11,
2,1,0,Allison. Miss. Helen Loraine,female,2.0,1,2,113781,151.55,C22 C26,S,,
3,1,0,Allison. Mr. Hudson Joshua Creighton,male,30.0,1,2,113781,151.55,C22 C26,S,,135.0
4,1,0,Allison. Mrs. Hudson J C (Bessie Waldo Daniels),female,25.0,1,2,113781,151.55,C22 C26,S,,
5,1,1,Anderson. Mr. Harry,male,48.0,0,0,19952,26.55,E12,S,3,
6,1,1,Andrews. Miss. Kornelia Theodosia,female,63.0,1,0,13502,77.9583,D7,S,10,
7,1,0,Andrews. Mr. Thomas Jr,male,39.0,0,0,112050,0.0,A36,S,,
8,1,1,Appleton. Mrs. Edward Dale (Charlotte Lamson),female,53.0,2,0,11769,51.4792,C101,S,D,
9,1,0,Artagaveytia. Mr. Ramon,male,71.0,0,0,PC 17609,49.5042,,C,,22.0


In [55]:
# Remove the columns that are not used in the rest of the analysis.
# - 'cabin': dropped as not important for this analysis.
# - 'boat', 'body': NOTE(review) presumably dropped because they are only known
#   after the outcome (in the sample rows 'boat' is filled only for survivors and
#   'body' only for non-survivors), so they would leak the label -- confirm.
# - 'sibsp', 'parch': excluded from the feature set.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
COLUMNS_TO_DROP = ['cabin', 'boat', 'body', 'sibsp', 'parch']
titanic_data = df_raw_data.drop(columns=COLUMNS_TO_DROP)
titanic_data.head(10)


Unnamed: 0,pclass,survived,name,sex,age,ticket,fare,embarked
0,1,1,Allen. Miss. Elisabeth Walton,female,29.0,24160,211.3375,S
1,1,1,Allison. Master. Hudson Trevor,male,0.9167,113781,151.55,S
2,1,0,Allison. Miss. Helen Loraine,female,2.0,113781,151.55,S
3,1,0,Allison. Mr. Hudson Joshua Creighton,male,30.0,113781,151.55,S
4,1,0,Allison. Mrs. Hudson J C (Bessie Waldo Daniels),female,25.0,113781,151.55,S
5,1,1,Anderson. Mr. Harry,male,48.0,19952,26.55,S
6,1,1,Andrews. Miss. Kornelia Theodosia,female,63.0,13502,77.9583,S
7,1,0,Andrews. Mr. Thomas Jr,male,39.0,112050,0.0,S
8,1,1,Appleton. Mrs. Edward Dale (Charlotte Lamson),female,53.0,11769,51.4792,S
9,1,0,Artagaveytia. Mr. Ramon,male,71.0,PC 17609,49.5042,C


In [56]:
# Fill the missing values in the 'age' column with the column mean.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: that form is chained assignment, which pandas warns about and
# which leaves the DataFrame unchanged when Copy-on-Write is enabled.
titanic_data['age'] = titanic_data['age'].fillna(titanic_data['age'].mean())

# Show the mode (most frequent value) of the 'embarked' column
print(titanic_data['embarked'].mode())

0    S
Name: embarked, dtype: object


In [57]:
# Fill the missing values in the 'embarked' column with its mode.
# mode() returns a Series, so [0] picks the first most-frequent value.
# Assigned back instead of fillna(inplace=True) on the selected column, which is
# chained assignment and has no effect on the DataFrame under Copy-on-Write.
titanic_data['embarked'] = titanic_data['embarked'].fillna(titanic_data['embarked'].mode()[0])

# Count the remaining missing values per column
titanic_data.isnull().sum()

pclass      0
survived    0
name        0
sex         0
age         0
ticket      0
fare        1
embarked    0
dtype: int64

In [58]:
# Fill the missing values in the 'fare' column with the column mean.
# Assigned back instead of fillna(inplace=True) on the selected column, which is
# chained assignment and has no effect on the DataFrame under Copy-on-Write.
titanic_data['fare'] = titanic_data['fare'].fillna(titanic_data['fare'].mean())

In [65]:
# Encode the string columns 'sex' and 'embarked' (embarkation point) as integer codes.
# NOTE(review): the 0/1/2 codes give 'embarked' an artificial ordering; one-hot
# encoding may be a better fit for a linear model -- left unchanged here.
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}

# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on already-encoded data is a no-op. The explicit astype replaces the
# implicit object -> int downcast that DataFrame.replace used to perform, and
# raises immediately if an unexpected non-numeric value is still present.
titanic_data = titanic_data.assign(
    sex=titanic_data['sex'].map(lambda value: SEX_CODES.get(value, value)).astype('int64'),
    embarked=titanic_data['embarked'].map(lambda value: EMBARKED_CODES.get(value, value)).astype('int64'),
)
titanic_data.head(10)

Unnamed: 0,pclass,survived,name,sex,age,ticket,fare,embarked
0,1,1,Allen. Miss. Elisabeth Walton,1,29.0,24160,211.3375,0
1,1,1,Allison. Master. Hudson Trevor,0,0.9167,113781,151.55,0
2,1,0,Allison. Miss. Helen Loraine,1,2.0,113781,151.55,0
3,1,0,Allison. Mr. Hudson Joshua Creighton,0,30.0,113781,151.55,0
4,1,0,Allison. Mrs. Hudson J C (Bessie Waldo Daniels),1,25.0,113781,151.55,0
5,1,1,Anderson. Mr. Harry,0,48.0,19952,26.55,0
6,1,1,Andrews. Miss. Kornelia Theodosia,1,63.0,13502,77.9583,0
7,1,0,Andrews. Mr. Thomas Jr,0,39.0,112050,0.0,0
8,1,1,Appleton. Mrs. Edward Dale (Charlotte Lamson),1,53.0,11769,51.4792,0
9,1,0,Artagaveytia. Mr. Ramon,0,71.0,PC 17609,49.5042,1


Machine Learning

In [66]:
# Target vector: the 'survived' column
y = titanic_data['survived']

# Feature matrix: everything except 'name', 'ticket' and the target itself
X = titanic_data.drop(columns=['name', 'ticket', 'survived'])

# Split into training and test parts.
# random_state=1 makes the split reproducible; stratify=y keeps the class
# proportions of the target the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [67]:
# Create a logistic regression classifier with default settings and fit it to
# the training split (fit returns the fitted estimator itself)
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the same rows the model was trained on
X_train_prediction = model.predict(X_train)

# Fraction of training rows whose predicted label matches the true label
training_data_accuracy = accuracy_score(y_true=y_train, y_pred=X_train_prediction)
print('Accuracy score of training data : ', training_data_accuracy)

Accuracy score of training data :  0.7961264016309888


In [69]:
# Evaluate on the held-out split: predict labels for X_test and compare them with y_test
X_test_prediction = model.predict(X_test)

test_data_accuracy = accuracy_score(y_true=y_test, y_pred=X_test_prediction)
print('Accuracy score of test data : ', test_data_accuracy)



Accuracy score of test data :  0.7865853658536586


In [70]:
# Confusion matrix for the test-split predictions
# (rows are the true classes, columns are the predicted classes)
testing_matrix = confusion_matrix(y_true=y_test, y_pred=X_test_prediction)
print(testing_matrix)

[[177  26]
 [ 44  81]]


In [71]:
# Per-class precision, recall and F1 for the test-split predictions
testing_report = classification_report(y_true=y_test, y_pred=X_test_prediction)
print(testing_report)

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       203
           1       0.76      0.65      0.70       125

    accuracy                           0.79       328
   macro avg       0.78      0.76      0.77       328
weighted avg       0.78      0.79      0.78       328

