In [1]:
import pandas as pd

In [2]:
# Load preprocessed file
titanic_df = pd.read_csv('titanic_processed.csv')

# Check top 5 rows of the dataset
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,36.0,0,0,7.4958,0,0,1
1,1,1,0,60.0,1,0,75.25,1,0,0
2,0,3,0,45.0,0,0,7.75,0,0,1
3,0,3,1,23.0,0,0,7.8958,0,0,1
4,1,2,0,36.0,1,0,26.0,0,0,1


In [3]:
# Check the shape of the dataset
titanic_df.shape

(712, 10)

In [4]:
# import train test split for dataset
from sklearn.model_selection import train_test_split

# Extract X and y from the dataframe
X = titanic_df.drop('Survived', axis=1)
Y = titanic_df['Survived']

# Obtain train and test set by applying train test split
# Here, we use 20% of data for test and 80% of data for training
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [5]:
# Get the shape of training data
x_train.shape, y_train.shape

((569, 9), (569,))

In [6]:
# Get the shape of test data
x_test.shape, y_test.shape

((143, 9), (143,))

###  Model building and Training

Here we use Logistic regression technique for the classification.

In [7]:
# Import Logistic Regression from scikit learn package
from sklearn.linear_model import LogisticRegression

# Build logistic regression model
logistic_model = LogisticRegression().fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Predict for the unseen data
y_pred = logistic_model.predict(x_test)

### Model Test and Evaluation

In [9]:
# Build dataframe containing actual and predicted value
# We do this for our ease for further analysis
pred_results = pd.DataFrame({'y_test': y_test,
                             'y_pred': y_pred})

In [10]:
pred_results.head()

Unnamed: 0,y_test,y_pred
540,0,0
407,1,1
469,0,1
664,0,1
217,0,1


In [11]:
# Construct confusion matrix through cross tab
titanic_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

# Print the confusion matrix
titanic_crosstab

y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,72,15
1,15,41


### Precision-recall scores

When we use these for multiclass classification we need to specify an averaging method to determine how the precision and recall scores for different labels should be weighted

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html

In [12]:
# import accuracy, precision and recall as evaluation metrics from sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [13]:
# Compute and print the accuracy, precision and recall
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)

accuracy_score :  0.7902097902097902
precision_score :  0.7321428571428571
recall_score :  0.7321428571428571


In [14]:
titanic_crosstab

y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,72,15
1,15,41


In [15]:
TP = titanic_crosstab[1][1]
TN = titanic_crosstab[0][0]
FP = titanic_crosstab[0][1]
FN = titanic_crosstab[1][0]

In [16]:
accuracy_score_verified = (TP + TN) / (TP + FP + TN + FN)

accuracy_score_verified

0.7902097902097902

In [17]:
precision_score_survived = TP / (TP + FP)

precision_score_survived

0.7321428571428571

In [18]:
recall_score_survived = TP / (TP + FN)

recall_score_survived

0.7321428571428571