In [1]:
import pandas as pd

In [2]:
# Read the preprocessed Titanic training set and preview its first rows.
train_csv_path = 'datasets/titanic_train_processes.csv'
titanic_data = pd.read_csv(train_csv_path)
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,2,1,52.0,0,0,13.5,0,0,1
1,0,1,1,24.0,0,0,79.2,1,0,0
2,1,1,0,36.0,1,2,120.0,0,0,1
3,0,1,1,28.0,0,0,47.1,0,0,1
4,0,3,1,36.0,0,0,7.8958,0,0,1


In [3]:
# Sanity-check the size of the loaded frame as (rows, columns).
titanic_data.shape

(714, 10)

In [4]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split -- and every metric computed from it -- is
# identical after Restart Kernel -> Run All.
RANDOM_STATE = 42

# Features are every column except the target. `columns=` already names the
# axis, so the redundant `axis=1` is omitted.
X = titanic_data.drop(columns='Survived')
Y = titanic_data['Survived']

# Hold out 20% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=RANDOM_STATE
)

In [5]:
# Report the dimensions of each piece of the train/test split.
named_splits = (
    ('x_train', x_train),
    ('x_test', x_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for split_name, split_data in named_splits:
    print(f'{split_name} shape: {split_data.shape}')

x_train shape: (571, 9)
x_test shape: (143, 9)
y_train shape: (571,)
y_test shape: (143,)


In [6]:
from sklearn.linear_model import LogisticRegression

# Configure an L2-regularized logistic regression (liblinear solver), then fit
# it on the training split. `fit` returns the estimator itself, so the result
# is bound to `logisticmodel` and the cell produces no output.
classifier = LogisticRegression(solver='liblinear', penalty='l2', C=1.0)
logisticmodel = classifier.fit(x_train, y_train)

* penalty: Regularizes the model by penalizing overly complex fits; we are using L2 regularization here.
* Regularization is a technique used in ML to penalize model complexity so that the resulting model is more robust when predicting on new data.
* C: inverse of regularization strength - smaller values indicate stronger regularization.

In [7]:
# Predict a class label for every row of the held-out split.
y_pred = logisticmodel.predict(x_test)

In [8]:
# Put the held-out labels and the model's predictions side by side so that
# individual hits and misses can be inspected row by row.
prediction_columns = {'y_test': y_test, 'y_pred': y_pred}
pred_results = pd.DataFrame(prediction_columns)
pred_results.head()

Unnamed: 0,y_test,y_pred
358,1,1
105,1,0
627,1,1
635,1,1
301,0,0


In [9]:
# The preview above shows at least one misclassification, so quantify the
# errors with accuracy, precision and recall (reported as percentages).

from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy_pct = accuracy_score(y_test, y_pred) * 100
precision_pct = precision_score(y_test, y_pred) * 100
recall_pct = recall_score(y_test, y_pred) * 100

print(f'Accuracy of the model is: {accuracy_pct:.2f}')
print(f'Precision of the model is: {precision_pct:.2f} ')
print(f'Recall of the model is: {recall_pct:.2f}')

Accuracy of the model is: 76.92
Precision of the model is: 75.51 
Recall of the model is: 63.79


* Accuracy: What fraction of all predictions the model got right, i.e., where y_pred == y_test.
* Precision: Of the passengers the model predicted as survivors, how many actually survived.
    <br>The fewer the FP, the higher the precision.
* Recall: Of the passengers who actually survived, how many the model correctly identified.
    <br>The fewer the FN, the higher the recall.

In [10]:
# Confusion matrix: rows are the predicted labels, columns are the actual labels.
pred_actual_vales = pd.crosstab(
    index=pred_results['y_pred'],
    columns=pred_results['y_test'],
)
pred_actual_vales

y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,73,21
1,12,37


In [11]:
# Read the four confusion-matrix cells with explicit label-based lookups.
# `pred_actual_vales` has predicted labels on the rows and actual labels on the
# columns, so every lookup below is .loc[predicted, actual].
# Reindexing to both classes first keeps the lookups from raising KeyError when
# a class is absent from the predictions or from the held-out labels; the
# missing counts become 0.
confusion = pred_actual_vales.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

TP = confusion.loc[1, 1]  # predicted 1, actual 1
FN = confusion.loc[0, 1]  # predicted 0, actual 1
FP = confusion.loc[1, 0]  # predicted 1, actual 0
TN = confusion.loc[0, 0]  # predicted 0, actual 0

print(f'True Positive: {TP}')
print(f'False Positive: {FP}')
print(f'True Negative: {TN}')
print(f'False Negative: {FN}')


True Positive: 37
False Positive: 12
True Negative: 73
False Negative: 21


In [12]:
# Recompute the metrics by hand from the confusion-matrix counts.
total_predictions = TP + TN + FP + FN

accuracy = (TP + TN) / total_predictions
precision = TP / (TP + FP)  # share of predicted positives that are correct
recall = TP / (TP + FN)     # share of actual positives that were found

# Labels keep their original padding so the printed lines are unchanged.
metric_rows = (
    (' Accuracy  ', accuracy),
    (' Precision ', precision),
    (' Recall  ', recall),
)
for metric_label, metric_value in metric_rows:
    print(f'{metric_label}: {metric_value * 100:.2f}')

 Accuracy  : 76.92
 Precision : 75.51
 Recall  : 63.79


### Apply the model to the test data

In [15]:
# Read the preprocessed Titanic test set and preview its first rows.
test_csv_path = 'datasets/titanic_test_processed.csv'
titanic_test = pd.read_csv(test_csv_path)
titanic_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1
