In [39]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score

In [21]:
# importing datasets
log_reg_df = pd.read_csv('./data/titanic_cleaned_data_reg_model.csv')
tree_df = pd.read_csv('./data/titanic_cleaned_data_tree_models.csv')

In [22]:
log_reg_df.head()

Unnamed: 0,Survived,Pclass,Fare,Embarked,Age_interpolate,Has_Cabin,IsAlone,Sex_female
0,1,3,-0.756619,1,-1.659841,0,0,0
1,1,2,-0.470377,0,-1.618018,0,0,0
2,1,3,-0.231842,1,-1.618018,0,0,1
3,1,3,-0.231842,1,-1.618018,0,0,1
4,1,2,0.245228,0,-1.618018,0,0,0


In [23]:
tree_df.head()

Unnamed: 0,Survived,Pclass,Fare,Embarked,Age_interpolate,Has_Cabin,Family_Size,IsAlone,Sex_female,Sex_male
0,1,3,8.0,1,0,0,2,0,0,1
1,1,2,14.0,0,1,0,3,0,0,1
2,1,3,19.0,1,1,0,4,0,1,0
3,1,3,19.0,1,1,0,4,0,1,0
4,1,2,29.0,0,1,0,3,0,0,1


In [24]:
X_reg = log_reg_df.drop(['Survived'], axis = 1)
X_tree = tree_df.drop(['Survived'], axis = 1)
y_reg = log_reg_df['Survived']
y_tree = tree_df['Survived']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42) 
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(X_tree, y_tree, test_size=0.3, random_state=42) 

In [25]:
log_reg_model = LogisticRegression()
dec_tree_clf = DecisionTreeClassifier(max_depth=100, random_state=42, criterion='entropy', splitter = 'best')

In [26]:
# fitting the models with data

log_reg_model.fit(X_train_reg, y_train_reg)
dec_tree_clf.fit(X_train_tree, y_train_tree)

In [27]:
y_predict_log_reg = log_reg_model.predict(X_test_reg)
y_predict_tree = dec_tree_clf.predict(X_test_tree)

## Viewing the confusion matrix for each model

In [28]:
cm_log_reg = confusion_matrix(y_test_reg, y_predict_log_reg)
cm_log_reg


array([[133,  24],
       [ 33,  78]])

In [29]:
cm_tree = confusion_matrix(y_test_tree, y_predict_tree)
cm_tree

array([[130,  27],
       [ 32,  79]])

## Calculating accuracy for each model

In [30]:
accuracy_log_reg = accuracy_score(y_test_reg, y_predict_log_reg)
accuracy_log_reg

0.7873134328358209

In [31]:
accuracy_tree = accuracy_score(y_test_tree, y_predict_tree)
accuracy_tree

0.7798507462686567

## Calculating precision for each model

In [32]:
precision_log_reg = precision_score(y_test_reg, y_predict_log_reg)
precision_log_reg

np.float64(0.7647058823529411)

In [33]:
precision_tree = precision_score(y_test_tree, y_predict_tree)
precision_tree

np.float64(0.7452830188679245)

## Calculating recall for each model

In [34]:
recall_log_reg = recall_score(y_test_reg, y_predict_log_reg)
recall_log_reg

np.float64(0.7027027027027027)

In [35]:
recall_tree = recall_score(y_test_tree, y_predict_tree)
recall_tree

np.float64(0.7117117117117117)

## Experimenting using Random Forest Classifier as well

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_tree, y_tree, random_state=42, test_size=0.3)

In [37]:
rf = RandomForestClassifier(max_depth=150, )
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [38]:
precision_rf = precision_score(y_pred_rf, y_test)
recall_rf = recall_score(y_pred_rf, y_test)

print(f"Precision: {precision_rf}\nRecall: {recall_rf}")

Precision: 0.6936936936936937
Recall: 0.7777777777777778


# Predicting with new Data

In [40]:
tree_df.head()

Unnamed: 0,Survived,Pclass,Fare,Embarked,Age_interpolate,Has_Cabin,Family_Size,IsAlone,Sex_female,Sex_male
0,1,3,8.0,1,0,0,2,0,0,1
1,1,2,14.0,0,1,0,3,0,0,1
2,1,3,19.0,1,1,0,4,0,1,0
3,1,3,19.0,1,1,0,4,0,1,0
4,1,2,29.0,0,1,0,3,0,0,1


In [42]:
tree_df.Age_interpolate

0       0
1       1
2       1
3       1
4       1
       ..
886    80
887    80
888    80
889    80
890    80
Name: Age_interpolate, Length: 891, dtype: int64

In [45]:
new_data = np.array([[3, 8, 0, 20, 0, 0, 1, 0, 1]])

In [51]:
import warnings
warnings.filterwarnings("ignore")

In [53]:
y_pred_rf = rf.predict(new_data)
print(f'Survived or not (1: Survived, 0: Not Survived)?\nAs per model: {y_pred_rf[0]}')

Survived or not (1: Survived, 0: Not Survived)?
As per model: 0
