# Data Pre-processing

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Read the heart dataset CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='/content/heart.csv')
df.head(n=5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.loc[~df.duplicated()]

In [None]:
# Print a summary of the frame: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    int64  
 3   trtbps    302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    int64  
 7   thalachh  302 non-null    int64  
 8   exng      302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slp       302 non-null    int64  
 11  caa       302 non-null    int64  
 12  thall     302 non-null    int64  
 13  output    302 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 35.4 KB


In [None]:
# Separate the target column ('output') from the predictor columns.
y = df['output']
X = df.drop(columns=['output'])

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# Stratifying on y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Candidate forest sizes to compare
estimator_values = [50, 100, 150, 200, 250]

# Compare the candidates with 5-fold cross-validation on the TRAINING split only,
# so the held-out test rows play no part in choosing the model and the later
# test-set metrics remain an unbiased estimate.
for estimators in estimator_values:
    # A fixed random_state makes the reported scores reproducible across runs;
    # without it each run builds different trees and prints different numbers.
    model = RandomForestClassifier(n_estimators=estimators, random_state=42)

    # Accuracy on each of the 5 folds of the training data
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

    # Report the mean accuracy across folds for this forest size
    print("Number of estimators: %d, Average Accuracy: %.4f" % (estimators, scores.mean()))


Number of estimators: 50, Average Accuracy: 0.8209
Number of estimators: 100, Average Accuracy: 0.8210
Number of estimators: 150, Average Accuracy: 0.8176
Number of estimators: 200, Average Accuracy: 0.8275
Number of estimators: 250, Average Accuracy: 0.8309


In [None]:
# Fit the final forest on the training split. The fixed random_state makes the
# fitted trees, and every metric and prediction derived from them, reproducible
# under Restart & Run All.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test rows.
y_pred4 = rf.predict(X_test)

In [None]:
# Test-set accuracy, scaled to a percentage for display.
acc4 = 100 * accuracy_score(y_test, y_pred4)
print("Accuracy of Random Forest: {:.2f}%".format(acc4))

Accuracy of Random Forest: 78.69%


In [None]:
# Precision, recall and F1 of the test-set predictions.
recall4 = recall_score(y_test, y_pred4)
precision4 = precision_score(y_test, y_pred4)
# Use scikit-learn's f1_score (already imported) instead of a hand-written
# harmonic mean: the manual formula divides by precision + recall and breaks
# (NaN or ZeroDivisionError) when both are 0, whereas f1_score handles that case.
f4 = f1_score(y_test, y_pred4)
print("Random Forest Performance Metrics:")
# NOTE: acc4 was scaled to a percentage (0-100) when it was computed; the
# other three metrics are fractions in [0, 1].
print("Accuracy:", acc4)
print("Precision:", precision4)
print("Recall:", recall4)
print("F1-score:", f4)

Random Forest Performance Metrics:
Accuracy: 78.68852459016394
Precision: 0.75
Recall: 0.9090909090909091
F1-score: 0.821917808219178


# Prepare a predictive system for the app


In [None]:
## One observation copied from heart.csv (features only, target omitted)
input_data = (52,1,2,172,199,1,1,162,0,0.5,2,0,3)

# Turn the tuple into a numeric numpy vector
convert_input = np.asarray(input_data)

# Wrap it as a single-row DataFrame with one named column per feature
df_input = pd.DataFrame(
    convert_input.reshape(1, -1),
    columns=[
        'age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg',
        'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall',
    ],
)

# Ask the trained forest for the class label of this row
prediction = rf.predict(df_input)

# Label 0 means no disease; any other label is reported as early signs
print(
    'You do not have heart disease.'
    if prediction[0] == 0
    else 'You have early signs of heart disease.'
)

You have early signs of heart disease.


Here is the full row, taken directly from the dataset:
(52,1,2,172,199,1,1,162,0,0.5,2,0,3,1). The final value, 1, is the true `output` label, so the model has correctly classified this person's health status.

# Save The Model

In [None]:
from joblib import dump, load

# Persist the fitted forest to disk, then read it straight back so the
# reloaded copy is available as `loaded_model`.
model_path = 'heart_pred_model.joblib'
dump(rf, model_path)
loaded_model = load(model_path)

In [None]:
## Check the model reloaded from disk on another observation from the dataset
input_data = (43,1,0,110,211,0,1,161,0,0,2,0,3)

# Convert them into numpy array so it's more convenient for the model
convert_input = np.asarray(input_data)

# Convert it into pandas dataframe and provide feature names
df_input = pd.DataFrame([convert_input], columns = ['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall'])

# Predict with `loaded_model` (the copy read back from heart_pred_model.joblib),
# not the in-memory `rf`: only this verifies that the saved file actually works.
prediction = loaded_model.predict(df_input)

if prediction[0] == 0:
    print('You do not have heart disease.')
else:
    print('You have early signs of heart disease.')

You have early signs of heart disease.


So the model has been saved and works successfully.

In [None]:
# Print the installed library versions so the environment that produced the
# saved model can be reproduced.
# NOTE(review): xgboost is not used in any cell visible in this notebook;
# confirm whether it is still needed (e.g. by the app that loads the model).
import xgboost as xgb
print(xgb.__version__)

# joblib is the library used to dump/load the model file
import joblib
print(joblib.__version__)

# scikit-learn provides the RandomForestClassifier that was saved
import sklearn
print(sklearn.__version__)

1.7.6
1.2.0
1.2.2
