In [2]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Load heart.csv (resolved relative to the working directory) into a DataFrame.
# NOTE: the frame is named `df_loans` even though it is read from heart.csv;
# the name is kept because later cells reference it.
file_path = Path("heart.csv")
df_loans = pd.read_csv(file_path)
# Preview the first five rows.
df_loans.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
# Swap the terse column codes for human-readable labels.
# "target" is not part of the mapping, so it keeps its original name.
readable_column_names = {
    'age': 'Age',
    'sex': 'Sex',
    'cp': 'Chest Pain',
    'trestbps': 'Resting Blood Pressure',
    'chol': 'Cholesterol',
    'fbs': 'Fasting Blood Sugar',
    'restecg': 'Resting EKG',
    'thalach': 'Max Heart Rate',
    'exang': 'Exercise Angina',
    'oldpeak': 'Exercise ST Depression',
    'slope': 'Peak ST Slope',
    'ca': 'Fluoro Vessels Occluded',
    'thal': 'Type of Defect',
}
df_loans = df_loans.rename(columns=readable_column_names)
df_loans.head()

Unnamed: 0,Age,Sex,Chest Pain,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Angina,Exercise ST Depression,Peak ST Slope,Fluoro Vessels Occluded,Type of Defect,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [5]:
# Feature matrix: every column except the "target" label.
# `drop` returns a new frame, so `df_loans` itself is left untouched.
X = df_loans.drop(columns=["target"])
X.head()

Unnamed: 0,Age,Sex,Chest Pain,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Angina,Exercise ST Depression,Peak ST Slope,Fluoro Vessels Occluded,Type of Defect
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2


In [6]:
# Define the target set as a 1-D NumPy array of labels.
# `Series.to_numpy()` replaces `Series.ravel()`, which pandas deprecated in
# 2.2 (a Series is already one-dimensional, so there is nothing to flatten).
y = df_loans["target"].to_numpy()
# Peek at the first five labels.
y[:5]

array([0, 0, 0, 0, 0])

In [7]:
# Split features and labels into train and test subsets.
# `random_state=78` fixes the shuffle so the split is repeatable; no
# `test_size` is given, so scikit-learn's default split ratio applies.
# NOTE(review): `stratify=y` is not passed — confirm the class balance of
# each subset is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [8]:
# Report the dimensions of each split, one per line, in the order
# X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(768, 13)
(257, 13)
(768,)
(257,)


In [9]:
# Create the StandardScaler instance; it is fitted in a later cell.
scaler = StandardScaler()

In [10]:
# Fit the StandardScaler on the training features only, so the test set
# does not contribute to the scaling statistics.
X_scaler = scaler.fit(X_train)

In [11]:
# Apply the scaler (fitted on the training features) to both subsets so
# train and test are transformed with the same parameters.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Configure the random forest classifier: 500 trees, with a fixed seed so
# repeated runs build the same forest.
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [13]:
# Train the random forest on the scaled training features and their labels.
# `fit` returns the fitted estimator, which is re-bound to `rf_model`.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [14]:
# Predict class labels for the scaled test features.
predictions = rf_model.predict(X_test_scaled)
# Display the full array of predicted labels.
predictions

array([1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1])

In [15]:
# Compute the confusion matrix and wrap it in a labelled DataFrame
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test, predictions)
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,126,0
Actual 1,0,131


In [16]:
# Calculate the accuracy score: the fraction of test labels predicted correctly.
# NOTE(review): if this comes out as a perfect score on held-out data, check
# heart.csv for duplicate rows that could land in both the train and test
# subsets before trusting the result.
acc_score = accuracy_score(y_test, predictions)
acc_score

1.0

In [17]:
# Display all evaluation results together: confusion matrix, accuracy,
# and the per-class classification report.
print("Confusion Matrix")
# `display` renders the DataFrame with its rich notebook representation.
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,126,0
Actual 1,0,131


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       126
           1       1.00      1.00      1.00       131

    accuracy                           1.00       257
   macro avg       1.00      1.00      1.00       257
weighted avg       1.00      1.00      1.00       257



In [18]:
# Read the per-feature importance scores from the fitted random forest.
# The array has one entry per feature column, in the model's input order.
importances = rf_model.feature_importances_
importances

array([0.08684497, 0.03861003, 0.14750267, 0.06933079, 0.08042383,
       0.01056312, 0.02216443, 0.10869306, 0.04986035, 0.11515328,
       0.04221472, 0.11727152, 0.11136724])

In [19]:
# Pair each importance score with its feature name, then order the pairs
# from most to least important.
RFV1_df = list(zip(rf_model.feature_importances_, X.columns))
RFV1_df.sort(reverse=True)

In [28]:
# Convert the sorted (importance, feature name) pairs into a DataFrame.
# No column names are supplied, so pandas labels the columns 0 and 1.
# NOTE: this rebinds `RFV1_df` from a list to a DataFrame.
RFV1_df = pd.DataFrame(RFV1_df)
RFV1_df

Unnamed: 0,0,1
0,0.147503,Chest Pain
1,0.117272,Fluoro Vessels Occluded
2,0.115153,Exercise ST Depression
3,0.111367,Type of Defect
4,0.108693,Max Heart Rate
5,0.086845,Age
6,0.080424,Cholesterol
7,0.069331,Resting Blood Pressure
8,0.04986,Exercise Angina
9,0.042215,Peak ST Slope


In [64]:
# Give the two columns descriptive names. `rename` returns a new frame, so
# `RFV1_df` keeps its original 0/1 column labels.
# NOTE(review): the values under '% Risk' are the model's feature
# importances — confirm that label is the intended wording.
RFV1_df_new = RFV1_df.rename(columns={0: '% Risk', 1: 'Risk Factors'})
RFV1_df_new


Unnamed: 0,% Risk,Risk Factors
0,0.147503,Chest Pain
1,0.117272,Fluoro Vessels Occluded
2,0.115153,Exercise ST Depression
3,0.111367,Type of Defect
4,0.108693,Max Heart Rate
5,0.086845,Age
6,0.080424,Cholesterol
7,0.069331,Resting Blood Pressure
8,0.04986,Exercise Angina
9,0.042215,Peak ST Slope


In [65]:
# Formatting floats: express each importance as a percentage string with
# one decimal place.
# The values are derived from the raw importances in `RFV1_df` (column 0)
# rather than from the '% Risk' column itself, so this cell is idempotent.
# Scaling and formatting '% Risk' in place would, on a second run, operate
# on the already-formatted strings instead of numbers and break.
RFV1_df_new = RFV1_df_new.assign(
    **{'% Risk': (RFV1_df[0] * 100).map('{:.1f}'.format)}
)

RFV1_df_new

Unnamed: 0,% Risk,Risk Factors
0,14.8,Chest Pain
1,11.7,Fluoro Vessels Occluded
2,11.5,Exercise ST Depression
3,11.1,Type of Defect
4,10.9,Max Heart Rate
5,8.7,Age
6,8.0,Cholesterol
7,6.9,Resting Blood Pressure
8,5.0,Exercise Angina
9,4.2,Peak ST Slope


In [66]:
# Write the formatted feature-importance table to a CSV file in the working
# directory; `index=False` omits the row index from the output.
output_file_path = "RFV1_data.csv"
RFV1_df_new.to_csv(output_file_path, index=False)