In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.shape

(887, 8)

In [4]:
df.dropna().shape

(887, 8)

In [5]:
df['Age'].fillna(-1, inplace=True)

In [6]:
X = df.drop(['Survived', 'Name'], axis=1)
X.head()

Unnamed: 0,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05


In [7]:
mf = {'male': 0, 'female':1}
X['Sex'] = X['Sex'].map(mf)
X.head()

Unnamed: 0,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,3,0,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,3,1,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,3,0,35.0,0,0,8.05


In [8]:
y = df.Survived

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80, random_state=42)

# Logistic Regression

In [10]:
model1 = LogisticRegression(solver='liblinear', random_state=0, max_iter = 1000).fit(X_train,y_train)

In [11]:
model1.coef_

array([[-7.02124308e-01,  2.43252589e+00, -3.52591739e-02,
        -2.45111160e-01, -3.01844584e-01,  2.07964680e-03]])

In [12]:
pd.DataFrame(confusion_matrix(y_train, model1.predict(X_train)))
conf1 = confusion_matrix(y_train, model1.predict(X_train))
Sensitivity = round(conf1[1,1]/(conf1[1,1]+conf1[0,1]),2)
Specificity = round(conf1[0,0]/(conf1[0,0]+conf1[1,0]),2)
Precision = round(conf1[1,1]/(conf1[1,1]+conf1[1,0]),2)
accuracy = round((conf1[1,1]+conf1[0,0])/(conf1[1,1]+conf1[0,0] + conf1[1,0]+conf1[0,1]),2)
Sensitivity, Specificity, Precision, accuracy

(0.83, 0.83, 0.73, 0.83)

# Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier

model2= DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 10).fit(X_train, y_train)

In [14]:
model2.score(X_train, y_train)

0.8418079096045198

In [15]:
pd.DataFrame(confusion_matrix(y_train, model2.predict(X_train)))
conf2 = confusion_matrix(y_train, model2.predict(X_train))
Sensitivity = round(conf2[1,1]/(conf2[1,1]+conf2[0,1]),2)
Specificity = round(conf2[0,0]/(conf2[0,0]+conf2[1,0]),2)
Precision = round(conf2[1,1]/(conf2[1,1]+conf2[1,0]),2)
accuracy = round((conf2[1,1]+conf2[0,0])/(conf2[1,1]+conf2[0,0] + conf2[1,0]+conf2[0,1]),2)
Sensitivity, Specificity, Precision, accuracy

(0.89, 0.82, 0.69, 0.84)

# Random Forest

In [16]:
model3 = RandomForestClassifier(n_estimators = 100, random_state = 1, max_depth =5).fit(X_train,y_train)

In [17]:
pd.DataFrame(confusion_matrix(y_train, model3.predict(X_train)))
conf3 = confusion_matrix(y_train,model3.predict(X_train))
Sensitivity = round(conf3[1,1]/(conf3[1,1]+conf3[0,1]),2)
Specificity = round(conf3[0,0]/(conf3[0,0]+conf3[1,0]),2)
Precision = round(conf3[1,1]/(conf3[1,1]+conf3[1,0]),2)
accuracy = round((conf3[1,1]+conf3[0,0])/(conf3[1,1]+conf3[0,0] + conf3[1,0]+conf3[0,1]),2)
Sensitivity, Specificity, Precision, accuracy

(0.9, 0.86, 0.77, 0.88)

# Support Vector Machines

In [18]:
model4 = SVC()
model4.fit(X_train, y_train)
Y_pred = model4.predict(X_test)
acc_svc = round(model4.score(X_train, y_train) , 2)
acc_svc

0.67

# KNN

In [19]:
model5 = KNeighborsClassifier(n_neighbors = 3)
model5.fit(X_train, y_train)
Y_pred = model5.predict(X_test)
acc_knn = round(model5.score(X_train, y_train) , 2)
acc_knn

0.82

# Gaussian Naive Bayes

In [20]:
model6 = GaussianNB()
model6.fit(X_train, y_train)
Y_pred = model6.predict(X_test)
acc_gaussian = round(model6.score(X_train, y_train) , 2)
acc_gaussian

0.77

In [21]:
table = pd.DataFrame({"Model": ["Logistic Regression", "Decision Tree",
                                "Random Forest", "Support Vector Machines", "KNN","Gaussian Naive Bayes"],
                     "Accuracy Scores": ["0.83", "0.84", "0.88", "0.67", "0.82",
                                         "0.77"]})

table["Model"] = table["Model"].astype("category")
table["Accuracy Scores"] = table["Accuracy Scores"].astype("float32")

pd.pivot_table(table, index = ["Model"]).sort_values(by = 'Accuracy Scores', ascending=False)

Unnamed: 0_level_0,Accuracy Scores
Model,Unnamed: 1_level_1
Random Forest,0.88
Decision Tree,0.84
Logistic Regression,0.83
KNN,0.82
Gaussian Naive Bayes,0.77
Support Vector Machines,0.67
