In [45]:
import numpy as np 
import pandas as pd 
import sklearn

In [47]:
# This piece of code is just to avoid unnecessary warning - can be copy pasted
import os
import warnings
warnings.filterwarnings('ignore')

In [48]:
path = (r"C:\Users\Anjali.Rajvanshi\Desktop\Data\heart-disease-uci\heart.csv")
df = pd.read_csv(path)
df.shape

(303, 14)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [None]:
'''
Only 14 attributes used:
1. age - age in years
2. sex - 1 = male; 0 = female
3. cp -  chest pain type
         Value 1: typical angina
         Value 2: atypical angina
         Value 3: non-anginal pain
         Value 4: asymptomatic
4. trestbps - resting blood pressure (in mm Hg on admission to the hospital)
5. chol - serum cholestoral in mg/dl
6. fbs - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
7. restecg - resting electrocardiographic results
             Value 0: normal
             Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
             Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
8. thalach - maximum heart rate achieved
9. exang - exercise induced angina (1 = yes; 0 = no)
10. oldpeak -  ST depression induced by exercise relative to rest
11. slope - the slope of the peak exercise ST segment
            Value 1: upsloping
            Value 2: flat
            Value 3: downsloping
12. ca - number of major vessels (0-3) colored by flourosopy
13. thal - 3 = normal; 6 = fixed defect; 7 = reversable defect
14. target - diagnosis of heart disease (angiographic disease status)
             Value 0: < 50% diameter narrowing
             Value 1: > 50% diameter narrowing
            (in any major vessel: attributes 59 through 68 are vessels)
'''

In [6]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
# Convert the following values to ccategorical variables
# sex, cp, restecg, exang, slope, thal, target

In [49]:
df['sex']=df['sex'].astype(str)
df['cp']=df['cp'].astype(str)
df['restecg']=df['restecg'].astype(str)
df['exang']=df['exang'].astype(str)
df['slope']=df['slope'].astype(str)
df['thal']=df['thal'].astype(str)
df['target']=df['target'].astype(str)

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null object
cp          303 non-null object
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null object
thalach     303 non-null int64
exang       303 non-null object
oldpeak     303 non-null float64
slope       303 non-null object
ca          303 non-null int64
thal        303 non-null object
target      303 non-null object
dtypes: float64(1), int64(6), object(7)
memory usage: 33.3+ KB


In [52]:
# Create dummies for the object datatype except for the response variable target
df = pd.get_dummies(df, prefix=['sex', 'cp', 'restecg', 'exang', 'slope', 'thal'], 
               columns=['sex', 'cp', 'restecg', 'exang', 'slope', 'thal'])

In [54]:
df.head()
df.shape

(303, 26)

In [7]:
# We have 26 columns and 303 rows
# No missing values
# Since we are considering ensembles using tree as the base classifier, handling outliers is not necessary
# This is a classification problem

In [None]:
# Start with the model building process

In [55]:
df.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,oldpeak,ca,target,sex_0,sex_1,...,restecg_2,exang_0,exang_1,slope_0,slope_1,slope_2,thal_0,thal_1,thal_2,thal_3
0,63,145,233,1,150,2.3,0,1,0,1,...,0,1,0,1,0,0,0,1,0,0
1,37,130,250,0,187,3.5,0,1,0,1,...,0,1,0,1,0,0,0,0,1,0
2,41,130,204,0,172,1.4,0,1,1,0,...,0,1,0,0,0,1,0,0,1,0
3,56,120,236,0,178,0.8,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0
4,57,120,354,0,163,0.6,0,1,1,0,...,0,0,1,0,0,1,0,0,1,0


In [56]:
X = df.drop('target', axis=1)
y = df[['target']]

In [14]:
from sklearn.model_selection import train_test_split

In [57]:
# split data into train and test/validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [58]:
X_train.shape

(242, 25)

In [59]:
X_test.shape

(61, 25)

In [60]:
# Adaboost

In [61]:
from sklearn.ensemble import AdaBoostClassifier

In [62]:
adaClassifier = AdaBoostClassifier()

In [63]:
adaClassifier

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [65]:
adaClassifier.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [68]:
y_pred = adaClassifier.predict_proba(X_test)[:,1]
#adaClassifier.classes_

In [69]:
from sklearn import metrics

In [70]:
metrics.roc_auc_score(y_test, y_pred)

0.9161290322580645

In [74]:
help(adaClassifier)

Help on AdaBoostClassifier in module sklearn.ensemble.weight_boosting object:

class AdaBoostClassifier(BaseWeightBoosting, sklearn.base.ClassifierMixin)
 |  AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
 |  
 |  An AdaBoost classifier.
 |  
 |  An AdaBoost [1] classifier is a meta-estimator that begins by fitting a
 |  classifier on the original dataset and then fits additional copies of the
 |  classifier on the same dataset but where the weights of incorrectly
 |  classified instances are adjusted such that subsequent classifiers focus
 |  more on difficult cases.
 |  
 |  This class implements the algorithm known as AdaBoost-SAMME [2].
 |  
 |  Read more in the :ref:`User Guide <adaboost>`.
 |  
 |  Parameters
 |  ----------
 |  base_estimator : object, optional (default=None)
 |      The base estimator from which the boosted ensemble is built.
 |      Support for sample weighting is required, as well as proper
 

In [73]:
#adaClassifier.predict_proba(X_test)

In [75]:
# Adaboost model with parameter tuning

In [76]:
# Prepare the parameter grid
# parameter grid
param_grid = {"base_estimator__max_depth" : [2, 5],
              "learning_rate": [0.1, 0.3, 0.5, 0.6, 0.8],
              "n_estimators": [200, 400, 600]}

In [93]:
param_grid

{'base_estimator__max_depth': [2, 5],
 'learning_rate': [0.1, 0.3, 0.5, 0.6, 0.8],
 'n_estimators': [200, 400, 600]}

In [77]:
from sklearn.tree import DecisionTreeClassifier

In [78]:
ABC = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())

In [79]:
from sklearn.model_selection import GridSearchCV

In [81]:
folds = 3
grid_search_ABC = GridSearchCV(estimator=ABC, cv = 3, param_grid=param_grid, scoring = 'roc_auc', verbose=3)

In [82]:
grid_search_ABC.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] base_estimator__max_depth=2, learning_rate=0.1, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  base_estimator__max_depth=2, learning_rate=0.1, n_estimators=200, score=0.867, total=   0.6s
[CV] base_estimator__max_depth=2, learning_rate=0.1, n_estimators=200 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  base_estimator__max_depth=2, learning_rate=0.1, n_estimators=200, score=0.791, total=   0.7s
[CV] base_estimator__max_depth=2, learning_rate=0.1, n_estimators=200 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV]  base_estimator__max_depth=2, learning_rate=0.1, n_estimators=200, score=0.820, total=   0.7s
[CV] base_estimator__max_depth=2, learning_rate=0.1, n_estimators=400 
[CV]  base_estimator__max_depth=2, learning_rate=0.1, n_estimators=400, score=0.865, total=   1.2s
[CV] base_estimator__max_depth=2, learning_rate=0.1, n_estimators=400 
[CV]  base_estimator__max_depth=2, learning_rate=0.1, n_estimators=400, score=0.791, total=   1.2s
[CV] base_estimator__max_depth=2, learning_rate=0.1, n_estimators=400 
[CV]  base_estimator__max_depth=2, learning_rate=0.1, n_estimators=400, score=0.829, total=   1.1s
[CV] base_estimator__max_depth=2, learning_rate=0.1, n_estimators=600 
[CV]  base_estimator__max_depth=2, learning_rate=0.1, n_estimators=600, score=0.841, total=   1.8s
[CV] base_estimator__max_depth=2, learning_rate=0.1, n_estimators=600 
[CV]  base_estimator__max_depth=2, learning_rate=0.1, n_estimators=600, score=0.784, total=   1.8s
[CV] base_estimator__max_depth=2, learning_rate=0.1

[CV]  base_estimator__max_depth=5, learning_rate=0.1, n_estimators=600, score=0.821, total=   2.0s
[CV] base_estimator__max_depth=5, learning_rate=0.1, n_estimators=600 
[CV]  base_estimator__max_depth=5, learning_rate=0.1, n_estimators=600, score=0.762, total=   1.8s
[CV] base_estimator__max_depth=5, learning_rate=0.1, n_estimators=600 
[CV]  base_estimator__max_depth=5, learning_rate=0.1, n_estimators=600, score=0.797, total=   2.2s
[CV] base_estimator__max_depth=5, learning_rate=0.3, n_estimators=200 
[CV]  base_estimator__max_depth=5, learning_rate=0.3, n_estimators=200, score=0.764, total=   0.7s
[CV] base_estimator__max_depth=5, learning_rate=0.3, n_estimators=200 
[CV]  base_estimator__max_depth=5, learning_rate=0.3, n_estimators=200, score=0.793, total=   0.7s
[CV] base_estimator__max_depth=5, learning_rate=0.3, n_estimators=200 
[CV]  base_estimator__max_depth=5, learning_rate=0.3, n_estimators=200, score=0.707, total=   0.8s
[CV] base_estimator__max_depth=5, learning_rate=0.3

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  1.9min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=DecisionTreeClassifier(class_weight=None,
                                                                                criterion='gini',
                                                                                max_depth=None,
                                                                                max_features=None,
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=1,
                                                                                

In [85]:
cv_results = pd.DataFrame(grid_search_ABC.cv_results_)


In [87]:
cv_results.loc[cv_results['rank_test_score']==1,:]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_base_estimator__max_depth,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
28,1.18641,0.019561,0.082005,0.009574,5,0.8,400,"{'base_estimator__max_depth': 5, 'learning_rat...",0.854938,0.824074,0.840635,0.839879,0.012637,1


In [88]:
ABC = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5), learning_rate=0.8, n_estimators=400)

In [89]:
ABC.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=5,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [91]:
predictions = ABC.predict_proba(X_test)[:,1]

In [92]:
metrics.roc_auc_score(y_test, predictions)

0.9064516129032258