In [None]:
Build a random forest classifier to predict the risk of heart disease based on a dataset of patient
information. The dataset contains 303 instances with 14 features, including age, sex, chest pain type,
resting blood pressure, serum cholesterol, and maximum heart rate achieved.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
df = pd.read_csv('dataset.csv')

In [9]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [14]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df['oldpeak']=encoder.fit_transform(df['oldpeak'])

In [15]:
df['oldpeak'].value_counts()

0     99
12    17
10    14
6     14
14    13
8     13
2     12
16    11
17    10
4      9
19     9
1      7
26     6
25     6
15     5
28     5
18     5
5      5
33     4
21     4
23     3
9      3
31     3
35     3
3      3
22     2
30     2
24     2
36     2
11     2
29     1
7      1
32     1
39     1
13     1
38     1
27     1
20     1
34     1
37     1
Name: oldpeak, dtype: int64

In [16]:
X=df.drop(labels=['oldpeak'],axis=1)
y=df['oldpeak']

In [17]:
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,0,0,1,1
1,37,1,2,130,250,0,1,187,0,0,0,2,1
2,41,0,1,130,204,0,0,172,0,2,0,2,1
3,56,1,1,120,236,0,1,178,0,2,0,2,1
4,57,0,0,120,354,0,1,163,1,2,0,2,1


In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.70,random_state=30)

In [19]:
X_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,slope,ca,thal,target
259,38,1,3,120,231,0,1,182,1,1,0,3,0
226,62,1,1,120,281,0,0,103,0,1,1,3,0
261,52,1,0,112,230,0,1,160,0,2,1,2,0
124,39,0,2,94,199,0,1,179,0,2,0,2,1
233,64,1,0,120,246,0,0,96,1,0,1,2,0


In [20]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import OneHotEncoder## handling Categorical features
from sklearn.preprocessing import StandardScaler## Feature scaling
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [21]:
## Feature Engineering Automation
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')), ##missing values
        ('scaler',StandardScaler())## feature scaling 
    ]

)

#categorical Pipeline
cat_pipeline=Pipeline(
                steps=[
                ('imputer',SimpleImputer(strategy='most_frequent')), ## handling Missing values
                ('onehotencoder',OneHotEncoder()) ## Categorical features to numerical
                ]

            )  


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [25]:
models={
    'Random Forest':RandomForestClassifier(),
    'Decision Tree':DecisionTreeClassifier(),
    'SVC':SVC()

}

In [26]:
from sklearn.metrics import accuracy_score

In [28]:
def evaluate_model(X_train,y_train,X_test,y_test,models):
    
    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        # Train model
        model.fit(X_train,y_train)

            

        # Predict Testing data
        y_test_pred =model.predict(X_test)

        # Get accuracy for test data prediction
       
        test_model_score = accuracy_score(y_test,y_test_pred)

        report[list(models.keys())[i]] =  test_model_score
            

            
    return report

In [29]:
evaluate_model(X_train,y_train,X_test,y_test,models)

{'Random Forest': 0.28169014084507044,
 'Decision Tree': 0.19718309859154928,
 'SVC': 0.3145539906103286}

In [30]:
classifier=RandomForestClassifier()

In [31]:
params={'max_depth':[3,5,10,None],
              'n_estimators':[100,200,300],
               'criterion':['gini','entropy']
              }

In [32]:
from sklearn.model_selection import RandomizedSearchCV

In [33]:
cv=RandomizedSearchCV(classifier,param_distributions=params,scoring='accuracy',cv=5,verbose=3)
cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.333 total time=   0.2s




[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.333 total time=   0.2s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.333 total time=   0.2s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.389 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.389 total time=   0.2s
[CV 1/5] END criterion=entropy, max_depth=3, n_estimators=300;, score=0.333 total time=   0.5s
[CV 2/5] END criterion=entropy, max_depth=3, n_estimators=300;, score=0.389 total time=   0.5s
[CV 3/5] END criterion=entropy, max_depth=3, n_estimators=300;, score=0.333 total time=   0.5s
[CV 4/5] END criterion=entropy, max_depth=3, n_estimators=300;, score=0.333 total time=   0.5s
[CV 5/5] END criterion=entropy, max_depth=3, n_estimators=300;, score=0.333 total time=   0.5s
[CV 1/5] END criterion=gini, max_depth=10, n_estimators=200;, score=0.333 total time=   0.3s
[CV 2/5] END criterion=gini, max_depth=10, n_estimat

In [34]:
cv.best_params_

{'n_estimators': 100, 'max_depth': 3, 'criterion': 'entropy'}