In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
x = df.drop(columns = ['output'],axis = 1)

In [5]:
y = df['output']

In [6]:
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size = 0.3,random_state = 101)

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
clf = RandomForestClassifier(n_estimators = 100,max_depth = 4,max_features = 3,bootstrap = True,random_state = 1)

In [9]:
model = clf.fit(X_train,Y_train)

# Features Importance

In [65]:
print(sorted(zip(model.feature_importances_,X_train.columns),reverse = True)[0:10])
print(df.columns)

[(0.1956905110385614, 'cp'), (0.13102800576886103, 'oldpeak'), (0.12327003246572837, 'thall'), (0.1105138238050897, 'caa'), (0.09803750315897482, 'thalachh'), (0.07229441235387528, 'age'), (0.06116916082120443, 'exng'), (0.060550225710073666, 'slp'), (0.04954731031750216, 'chol'), (0.04522517636608698, 'trtbps')]
Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')


In [10]:
Y_pred = model.predict(X_test)

In [11]:
Y_pred

array([0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0], dtype=int64)

In [40]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score

c_m = confusion_matrix(Y_test,Y_pred)
precision,recall,fscore,support = score(Y_test,Y_pred,pos_label = 1,average = 'binary')
print("Confusion_matrix: {}".format(c_m))
print("precision: {}".format(precision))
print('recall: {}'.format(recall))
print('Accuracy: {}'.format((Y_test==Y_pred).sum()/len(Y_test)))

Confusion_matrix: [[36  8]
 [ 6 41]]
precision: 0.8367346938775511
recall: 0.8723404255319149
Accuracy: 0.8461538461538461


# GridSearchCv

In [44]:
grid = {
    'n_estimators':[100,150,200,250,300,350,400,450,500],
    'max_features':['sqrt','log2'],
    'max_depth':[4,5,6,7,8],
    'criterion':['gini','entropy'],
    'random_state':[18]
}

In [47]:
from sklearn.model_selection import GridSearchCV

In [50]:
from sklearn.model_selection import KFold

k_fold = KFold(n_splits = 5)

In [57]:
rf_cv = GridSearchCV(estimator = RandomForestClassifier(),param_grid = grid,cv = k_fold,n_jobs = -1)
rf_cv.fit(X_train,Y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['sqrt', 'log2'],
                         'n_estimators': [100, 150, 200, 250, 300, 350, 400,
                                          450, 500],
                         'random_state': [18]})

In [52]:
rf_cv.best_params_

{'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 'sqrt',
 'n_estimators': 150,
 'random_state': 18}

In [54]:
prediction=rf_cv.predict(X_test)
prediction

array([0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0], dtype=int64)

In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,prediction)

0.8351648351648352

In [59]:
from sklearn.model_selection import cross_val_score
cross_val_score(rf_cv,x,y,cv = k_fold)

array([0.7704918 , 0.67213115, 0.85245902, 0.7       , 0.51666667])

In [22]:
from sklearn import metrics
print(dir(metrics))

['ConfusionMatrixDisplay', 'DetCurveDisplay', 'PrecisionRecallDisplay', 'RocCurveDisplay', 'SCORERS', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_base', '_classification', '_pairwise_fast', '_plot', '_ranking', '_regression', '_scorer', 'accuracy_score', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'auc', 'average_precision_score', 'balanced_accuracy_score', 'brier_score_loss', 'calinski_harabasz_score', 'check_scoring', 'classification_report', 'cluster', 'cohen_kappa_score', 'completeness_score', 'confusion_matrix', 'consensus_score', 'coverage_error', 'davies_bouldin_score', 'dcg_score', 'det_curve', 'euclidean_distances', 'explained_variance_score', 'f1_score', 'fbeta_score', 'fowlkes_mallows_score', 'get_scorer', 'hamming_loss', 'hinge_loss', 'homogeneity_completeness_v_measure', 'homogeneity_score', 'jaccard_score', 'label_ranking_average_precision_score', 'label_ranking_loss', 'log_los