In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics


Preprocessing data

In [59]:
data = pd.read_csv("/Users/tomaszkozubal/Downloads/titanic.csv")
data = data.drop(["PassengerId", "Name", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"], axis = 1)
data['Sex'].replace(['male', 'female'],[0, 1], inplace=True)
data = data.dropna(axis = 0)
y_data = data["Survived"]
x_data = data.iloc[:, 1:]

Decision tree model

In [60]:
X_train, X_test, Y_train, Y_test = train_test_split(x_data, y_data  , test_size = 0.3, random_state = 1)
model = tree.DecisionTreeClassifier()
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.82      0.85       134
           1       0.73      0.80      0.76        81

    accuracy                           0.81       215
   macro avg       0.80      0.81      0.81       215
weighted avg       0.82      0.81      0.82       215



Creating function to bootstrap from the data set

In [80]:
def bootstrap(y, x, Nboot):

    x = np.array(x)
    y = np.array(y)

    precision = []
    recall = []
    f1 = []
    x_indices = []
    y_indices = []
    test_score = []

    numb_rows = data.shape[0]

    for k in range(Nboot):
        chosen_rows = np.random.choice(numb_rows, replace=True, size=numb_rows)
        
        #resampled_stat.append(chosen_rows)

        bootstrap_sample = data.iloc[chosen_rows]
        y_chosen_data = np.array(bootstrap_sample.iloc[:,0])
        x_chosen_data = np.array(bootstrap_sample.iloc[:,1:])

        x_indices.append(x_chosen_data)
        y_indices.append(y_chosen_data)

        X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

        model = tree.DecisionTreeClassifier().fit(X_train, Y_train)

        y_pred = model.predict(X_test)

        test_score_test = accuracy_score(y_pred, Y_test)
        test_score.append(test_score_test)
        precision.append(metrics.precision_score(Y_test, y_pred, average="macro"))
        recall.append(metrics.recall_score(Y_test, y_pred, average="macro"))
        f1.append(metrics.f1_score(Y_test, y_pred, average="macro"))

    pred_df = pd.DataFrame(
        {
            "Test score": test_score,
            "Precision": precision,
            "Recall": recall,
            "F1": f1,
            "Y_Indices": y_indices,
            "X_Indices": x_indices
        })  
     
    pred_df.sort_values(by="Precision", ascending=False, inplace=True)  

    return pred_df

In [81]:
bootstrap(y_data, x_data, 1000)




Unnamed: 0,Test score,Precision,Recall,F1,Y_Indices,X_Indices
425,0.846154,0.839555,0.839555,0.839555,"[1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...","[[2.0, 1.0, 40.0, 15.75], [1.0, 0.0, 27.0, 53...."
87,0.846154,0.839555,0.839555,0.839555,"[1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, ...","[[3.0, 1.0, 27.0, 12.475], [3.0, 0.0, 42.0, 7...."
694,0.846154,0.839555,0.839555,0.839555,"[1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, ...","[[1.0, 1.0, 26.0, 78.85], [2.0, 1.0, 26.0, 26...."
573,0.846154,0.839555,0.839555,0.839555,"[0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, ...","[[1.0, 0.0, 64.0, 263.0], [3.0, 0.0, 29.0, 7.0..."
952,0.846154,0.839555,0.839555,0.839555,"[0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ...","[[3.0, 0.0, 26.0, 7.8875], [1.0, 1.0, 21.0, 77..."
...,...,...,...,...,...,...
283,0.818182,0.811364,0.807426,0.809216,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, ...","[[3.0, 1.0, 18.0, 7.775], [2.0, 0.0, 25.0, 13...."
309,0.818182,0.811364,0.807426,0.809216,"[0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, ...","[[1.0, 0.0, 58.0, 29.7], [3.0, 0.0, 2.0, 21.07..."
829,0.818182,0.811364,0.807426,0.809216,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, ...","[[1.0, 0.0, 0.92, 151.55], [2.0, 1.0, 13.0, 19..."
314,0.818182,0.811364,0.807426,0.809216,"[0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, ...","[[3.0, 0.0, 28.0, 9.5], [1.0, 0.0, 50.0, 55.9]..."
