<a href="https://colab.research.google.com/github/Pawlik-Lukasz/Heart_model/blob/main/Heart_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>




## My dataset contains patients' medical measurements, including cholesterol levels, blood pressure, and chest pain type, along with demographic information such as age and sex.
* ## First, I built a baseline model using a decision tree classifier.
* ## Comparing the Gini and entropy split criteria with cross-validation, I observed that entropy yielded better results.
* ## Next, I pruned the decision tree by grid-searching its size limits, using entropy as the criterion.
* ## Finally, I selected the number of trees for the forest and built a random forest model based on it.





In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Baseline model: a single decision tree with default hyper-parameters.
# 'heart.csv' must be present in the working directory (on Colab, upload it
# first); otherwise pd.read_csv raises FileNotFoundError.
df = pd.read_csv('heart.csv')
# Columns used as model inputs; 'target' is the label column.
feature_columns = ['cp', 'sex', 'age', 'trestbps', 'chol',
                   'thalach', 'fbs', 'restecg', 'exang', 'oldpeak',
                   'slope', 'ca', 'thal']
X = df[feature_columns].values
y = df['target'].values
# Hold out a test set (default split proportions) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)
# Seed the tree too: scikit-learn permutes the features at every split, so an
# unseeded tree can give a different accuracy on each run of this cell.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

FileNotFoundError: [Errno 2] No such file or directory: 'heart.csv'

In [None]:
# Compare the Gini and entropy split criteria with 5-fold cross-validation.
# random_state pins the shuffle so that every kf.split(X) call produces the
# same folds. Without it each criterion is scored on a different partition of
# the data, so the comparison is neither like-for-like nor reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for criterion in ['gini', 'entropy']:
    print(f"Decision Tree - {criterion}")
    # Per-fold scores for the current criterion
    accuracy = []
    precision = []
    recall = []
    f1 = []

    # NOTE: this rebinds the notebook-level X_train/X_test/y_train/y_test
    # (and y_pred) to the current fold.
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        dt = DecisionTreeClassifier(criterion=criterion, random_state=42)
        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)

        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))

    # Report the mean of each metric over the five folds
    print("accuracy:", np.mean(accuracy))
    print("precision:", np.mean(precision))
    print("recall:", np.mean(recall))
    print("f1:", np.mean(f1), '\n')

# In the author's run the entropy criterion scored better, so entropy is used
# as the criterion in the following cells.
# NOTE(review): re-check this conclusion against the output above whenever the
# data or the seed changes.

In [None]:
# Prune the decision tree: grid-search the size limits of the tree and pick
# the combination with the best cross-validated F1 score.
param_grid = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 3],
    'max_leaf_nodes': [10, 20, 35, 50],
}
# Entropy is used because it scored better than Gini in the criterion
# comparison. The tree is seeded so that the selected parameters and score
# are the same on every run (an unseeded tree may break split ties
# differently each time).
dt = DecisionTreeClassifier(criterion="entropy", random_state=42)
gs = GridSearchCV(dt, param_grid, scoring='f1', cv=5)
gs.fit(X, y)
print("best params:", gs.best_params_)
print("best score:", gs.best_score_)

# In the author's run the best tree had max_depth 5, min_samples_leaf 2 and
# max_leaf_nodes 10.
# NOTE(review): the authoritative values are the ones printed above; confirm
# they still match after re-running.

In [None]:
# Draw the best tree found by the grid search with matplotlib.
plt.figure(figsize=(15, 10))
plot_tree(
    gs.best_estimator_,
    filled=True,
    feature_names=['cp', 'sex', 'age', 'trestbps', 'chol',
                   'thalach', 'fbs', 'restecg', 'exang', 'oldpeak',
                   'slope', 'ca', 'thal'],
    class_names=['0', '1'],
    rounded=True,
)
# Show the figure explicitly, as the other plotting cells do. Without this a
# script run never displays the tree, and the next plt.plot call would draw
# onto this still-open figure.
plt.show()

In [None]:
# Random forest model: fresh train/test split and the base estimator that the
# grid search in the next cell tunes.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)
# Seed the forest so the cross-validation curve is reproducible; 111 matches
# the seed used for the final forest in the feature-importance cell.
rf_model = RandomForestClassifier(random_state=111)

In [None]:
# Tune the forest size: cross-validate every tree count from 1 to 100 and
# plot the mean accuracy against the number of trees.
n_estimators = [*range(1, 101)]
# Single-parameter search space for the grid search
param_grid = dict(n_estimators=n_estimators)

# NOTE: this rebinds `gs`, which the decision-tree cells used for their own
# grid search.
gs = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                  scoring='accuracy', cv=5)
gs.fit(X_train, y_train)

# Mean cross-validated accuracy, one entry per candidate tree count
scores = gs.cv_results_['mean_test_score']

# Accuracy curve over the candidate forest sizes
plt.title("Impact of Number of Estimators on Model Accuracy")
plt.xlabel("Number of Estimators")
plt.ylabel("Mean Accuracy")
plt.plot(n_estimators, scores)
plt.show()

# In the author's run the curve levels off at around 20 trees, so 20 is taken
# as the number of estimators for the final forest.

In [None]:

# Rank the input features by their importance in a 20-tree random forest
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)
rf = RandomForestClassifier(random_state=111, n_estimators=20)
rf.fit(X_train, y_train)

# Label each importance with its column name, then order from most to least
# important
column_names = [
    'cp', 'sex', 'age', 'trestbps', 'chol', 'thalach', 'fbs',
    'restecg', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]
feature_importances = pd.Series(rf.feature_importances_, index=column_names)
feature_importances = feature_importances.sort_values(ascending=False)

# Features with an importance above the 0.05 threshold count as significant
above_threshold = feature_importances > 0.05
significant_features = feature_importances.loc[above_threshold].index.tolist()
print(f"Significant features:\n {significant_features}")

# Bar chart of all feature importances
plt.figure(figsize=(10, 6))
feature_importances.plot.bar()
plt.ylabel("Importance")
plt.xlabel("Feature")
plt.title("Feature Importance in Random Forest Model")
plt.show()