# Reading of Dataset

In [1]:
#Import Dataset
#==============================================
import pandas as pd
URL = "../data/processed/healthcare-dataset-stroke-data-T.csv"
df = pd.read_csv(URL)
df.head()

Unnamed: 0.1,Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,9046,1,67.0,0,1,1,1,1,228.69,36.6,2,1
1,2,31112,1,80.0,0,1,1,1,0,105.92,32.5,1,1
2,3,60182,0,49.0,0,0,1,1,1,171.23,34.4,3,1
3,4,1665,0,79.0,1,0,1,2,0,174.12,24.0,1,1
4,5,56669,1,81.0,0,0,1,1,1,186.21,29.0,2,1


# Training

In [2]:
# Split the data into features (X) and labels (y)
X = df.drop('stroke', axis=1)
y = df['stroke']

In [3]:
# Initialize a StratifiedKFold object
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [4]:
# Initialize lists to store the evaluation metrics
accuracies = []
precisions = []
recalls = []
f1s = []

In [5]:
# Loop through the folds in the StratifiedKFold object
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the Decision Tree Classifier on the training data
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)

    # Predict the labels on the test data
    y_pred = dt_clf.predict(X_test)

    # Evaluate the performance of the Decision Tree Classifier on the test data
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Append the evaluation metrics to the lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

In [6]:
# Compute the average of the evaluation metrics across all folds
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1 = sum(f1s) / len(f1s)

In [7]:
# Print the metrics as percentages
print(f'Accuracy: {avg_accuracy * 100:.2f}%')
print(f'Precision: {avg_precision * 100:.2f}%')
print(f'Recall: {avg_recall * 100:.2f}%')
print(f'F1 Score: {avg_f1 * 100:.2f}%')

Accuracy: 99.96%
Precision: 99.53%
Recall: 99.51%
F1 Score: 99.52%


In [8]:
# Visualize the final decision tree
import graphviz
from sklearn.tree import export_graphviz
dot_data = export_graphviz(dt_clf, out_file=None, feature_names=X.columns, class_names=['Negative', 'Positive'], filled=True, rounded=True)
graph = graphviz.Source(dot_data)
graph.render("heart_strike_tree")

'heart_strike_tree.pdf'

# Saving with pickle

In [10]:
import pickle
# Dump the trained decision tree classifier with Pickle
decision_tree_pkl_filename = '../models/decision_tree_classifier.pkl'
# Open the file to save as pkl file
with open(decision_tree_pkl_filename,"wb") as f:
    pickle.dump(dt_clf,f)