In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../ML/heart_cleveland_upload.csv')

In [3]:
df


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


In [4]:
X = df.drop('condition', axis=1)
y = df['condition']

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1) 

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
def printMetrics(y_pred, y_actual):
    print('Accuracy is ' + str(accuracy_score(y_pred, y_actual)))
    print('Confusion matrix ')
    print(confusion_matrix(y_pred, y_actual))
    print('Classification Report')
    print(classification_report(y_pred, y_actual))

In [9]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
pred_dt = dt.predict(X_test)

printMetrics(pred_dt, y_test)

Accuracy is 0.8333333333333334
Confusion matrix 
[[14  0]
 [ 5 11]]
Classification Report
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        14
           1       1.00      0.69      0.81        16

    accuracy                           0.83        30
   macro avg       0.87      0.84      0.83        30
weighted avg       0.88      0.83      0.83        30



In [10]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

printMetrics(rf_pred, y_test)

Accuracy is 0.8666666666666667
Confusion matrix 
[[17  2]
 [ 2  9]]
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.89      0.89        19
           1       0.82      0.82      0.82        11

    accuracy                           0.87        30
   macro avg       0.86      0.86      0.86        30
weighted avg       0.87      0.87      0.87        30



In [11]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

param_grid = {
    'criterion' : ['gini', 'entropy'],
    'max_depth':[None, 2,4,6,8,10,15,5,7],
    'min_samples_split':[2,5,10,3,4,7],
    'min_samples_leaf':[1,2,3,4,10,15]}

dt = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dt, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs = -1, verbose= 2)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 648 candidates, totalling 3240 fits


In [12]:
grid_search.best_score_

0.7791055206149545

In [13]:
new_model =DecisionTreeClassifier(**grid_search.best_params_)
new_model.fit(X_train, y_train)

In [14]:
grid_prediction = new_model.predict(X_test)
printMetrics(grid_prediction, y_test)

Accuracy is 0.9333333333333333
Confusion matrix 
[[18  1]
 [ 1 10]]
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        19
           1       0.91      0.91      0.91        11

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30



In [15]:
random_search = RandomizedSearchCV(estimator=dt, param_distributions=param_grid, n_iter=10, cv=5, scoring = 'accuracy', random_state=42, n_jobs=-1, verbose =2)
random_search.fit(X_train, y_train)
print('Best params(randomized) :', random_search.best_params_)
print('Best score(validation) :', random_search.best_score_)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best params(randomized) : {'min_samples_split': 5, 'min_samples_leaf': 10, 'max_depth': 5, 'criterion': 'gini'}
Best score(validation) : 0.7716981132075471


In [16]:
new_model2 = DecisionTreeClassifier(**random_search.best_params_)

In [18]:
random_pred = random_search.predict(X_test)
printMetrics(random_pred, y_test)

Accuracy is 0.9333333333333333
Confusion matrix 
[[18  1]
 [ 1 10]]
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        19
           1       0.91      0.91      0.91        11

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30



In [5]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.0.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.1-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.0/150.0 MB 5.6 MB/s eta 0:00:27
    --------------------------------------- 3.4/150.0 MB 8.8 MB/s eta 0:00:17
   - -------------------------------------- 4.7/150.0 MB 8.1 MB/s eta 0:00:18
   - -------------------------------------- 6.8/150.0 MB 8.4 MB/s eta 0:00:18
   -- ------------------------------------- 9.2/150.0 MB 9.1 MB/s eta 0:00:16
   --- ------------------------------------ 11.5/150.0 MB 9.5 MB/s eta 0:00:15
   --- ------------------------------------ 13.9/150.0 MB 9.7 MB/s eta 0:00:15
   ---- ----------------------------------- 16.3/150.0 MB 10.0 MB/s eta 0:00:14
   -

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier