## code

In [4]:
import seaborn as sns
import pandas as pd

df = pd.read_csv('dataset.csv')

In [5]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
df['target'].unique()

array([1, 0])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [8]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

In [9]:
df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [10]:
x = df.drop(labels=['target'],axis=1)
y = df['target']

In [11]:
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [12]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=42)

In [13]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [14]:
numerical_cols = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal']

In [15]:
num_pipeline = Pipeline(
       steps = [
         ('imputer', SimpleImputer(strategy ='median')),
        ('scelar' , StandardScaler())
       ]
)

In [16]:
preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols)
])

In [17]:
X_train=preprocessor.fit_transform(x_train)
X_test=preprocessor.transform(x_test)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [19]:
models = {
    
    'Random Forest': RandomForestClassifier(),
    'Decision Tree' : DecisionTreeClassifier(),
    'SVC' : SVC()
}

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
def evaluate_model(X_train,X_test,y_train,y_test,models):
    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        model.fit(X_train,y_train)
        
        
        y_test_pred = model.predict(X_test)
        
        test_model_score = accuracy_score(y_test,y_test_pred)
        
        report[list(models.keys())[i]] = test_model_score
        
    return report

In [22]:
evaluate_model(X_train,X_test,y_train,y_test,models)

{'Random Forest': 0.8524590163934426,
 'Decision Tree': 0.8360655737704918,
 'SVC': 0.8688524590163934}

In [23]:
classifier = RandomForestClassifier()

In [25]:
params = {'max_depth':[3,5,10,'none'], 'n_estimators' : [100,200,300], 'criterion' : ['gini' , 'entropy'] }

In [26]:
from sklearn.model_selection import RandomizedSearchCV

In [30]:
cv = RandomizedSearchCV(classifier, param_distributions = params, cv = 5, scoring = 'accuracy', verbose = 3)

In [32]:
cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=none, n_estimators=100;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=none, n_estimators=100;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=none, n_estimators=100;, score=nan total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=none, n_estimators=100;, score=nan total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=none, n_estimators=100;, score=nan total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.816 total time=   0.4s
[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.837 total time=   0.4s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.792 total time=   0.4s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.792 total time=   0.4s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.771 total time=   0.4s

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 341, in fit
    self._validate_params()
  File "/opt/conda/lib/python3.10/site-packages/sklearn/base.py", line 570, in _validate_params
    validate_parameter_constraints(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
skl

In [33]:
cv.best_params_

{'n_estimators': 200, 'max_depth': 3, 'criterion': 'gini'}