In [1]:
# data wrangling:
import pandas as pd
import numpy as np

# ML stuff
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from xgboost import XGBRegressor, XGBClassifier, plot_importance
from tpot import TPOTClassifier

# plotting and images:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import Image

# Import warnings
import warnings
warnings.filterwarnings('ignore')

---
### Import data:

In [2]:
# Load the heart-disease dataset (path is relative to the notebook's
# working directory) and preview the first rows.
heart_df = pd.read_csv('data/heart_disease.csv')
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


#### Inspect for missing values:

In [3]:
# Keep only the rows that contain at least one NaN; an empty result means
# no row has a missing value.
heart_df[heart_df.isna().any(axis=1)]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target


No missing values!

---
### Remove target column and split into train/test datasets:

In [4]:
# set target:
y = heart_df['target']

# predictors: every column except the target. Selecting by name (rather than
# "all but the last column") keeps this correct even if the column order of
# the CSV changes.
x = heart_df.drop(columns='target')

# split:
# random_state is fixed so the same rows land in train/test on every run;
# the test fraction is left at train_test_split's default.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2)

---
### Train Decision Tree classifier:

In [5]:
# set model:
# baseline decision tree with default hyperparameters; random_state is fixed
# so repeated runs build the same tree
model = DecisionTreeClassifier(random_state=2)

# define function:
def fit_and_evaluate_model(classifier, predictors_df, truth_df, num_splits):
    """Cross-validate ``classifier`` and print the per-fold and mean scores.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator handed to ``cross_val_score``.
    predictors_df : array-like
        Feature matrix.
    truth_df : array-like
        Target values aligned with ``predictors_df``.
    num_splits : int or CV splitter
        Forwarded as ``cv`` to ``cross_val_score``.

    Returns
    -------
    None
        Results are printed, not returned. No ``scoring`` is passed, so the
        estimator's default scorer is used (labelled "Accuracy" in the output).
    """
    # one score per cross-validation fold
    fold_scores = cross_val_score(classifier, predictors_df, truth_df, cv=num_splits)

    # report both figures rounded to two decimals
    rounded_folds = np.round(fold_scores, 2)
    rounded_mean = np.round(fold_scores.mean(), 2)
    print(f'Accuracy: {rounded_folds}')
    print(f'Accuracy mean: {rounded_mean}')
    
# fit and evaluate:
# 5-fold cross-validation of the baseline tree on the full dataset (x, y)
fit_and_evaluate_model(classifier=model, predictors_df=x, truth_df=y, num_splits=5)

Accuracy: [0.74 0.85 0.77 0.73 0.7 ]
Accuracy mean: 0.76


---
### Define and implement Random Search CV for hyperparameter tuning:

In [6]:
def randomized_search_clf(params,  x_train, x_test, y_train, y_test, runs=20, clf=DecisionTreeClassifier(random_state=2)):
    """Tune ``clf`` with a randomized hyperparameter search and score it.

    Parameters
    ----------
    params : dict
        Maps hyperparameter names to the candidate values to sample from.
    x_train, y_train : array-like
        Data the 5-fold randomized search is fitted on.
    x_test, y_test : array-like
        Held-out data used only for the final accuracy figure.
    runs : int, default 20
        Number of parameter settings sampled (``n_iter``).
    clf : estimator, default ``DecisionTreeClassifier(random_state=2)``
        Base estimator to tune.

    Returns
    -------
    estimator
        The search's ``best_estimator_``.

    Notes
    -----
    The value printed as "Training score" is the search's ``best_score_``,
    i.e. the mean cross-validated score of the best candidate, not a score
    computed on the full training set.
    """
    # random search over `params`; seeded so the sampled candidates repeat
    search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=params,
        n_iter=runs,
        cv=5,
        n_jobs=-1,
        random_state=2,
    )
    search.fit(x_train, y_train)

    # best cross-validated score found during the search
    print(f'Training score: {search.best_score_}')

    # accuracy of the winning candidate on the held-out data
    tuned_model = search.best_estimator_
    test_accuracy = accuracy_score(y_test, tuned_model.predict(x_test))
    print(f'Test score: {test_accuracy}')

    return tuned_model

#### Define search params dictionary:

In [7]:
# Candidate values for each DecisionTreeClassifier hyperparameter sampled by
# the randomized search.
search_params = {
    'criterion': ['entropy', 'gini'],
    'splitter': ['random', 'best'],
    # This key used to appear twice in the literal; Python keeps only the
    # last value for a repeated key, so the shorter first list was silently
    # discarded. Only the list that was actually in effect is kept.
    'min_weight_fraction_leaf': [0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05],
    'min_samples_split': [2, 3, 4, 5, 6, 8, 10],
    # ints are absolute sample counts, floats are fractions of the samples
    'min_samples_leaf': [1, 0.01, 0.02, 0.03, 0.04],
    'min_impurity_decrease': [0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
    'max_leaf_nodes': [10, 15, 20, 25, 30, 35, 40, 45, 50, None],
    # 'sqrt' replaces the deprecated 'auto' alias, which meant sqrt(n_features)
    # for tree classifiers and is rejected by newer scikit-learn releases.
    'max_features': ['sqrt', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
    'max_depth': [None, 2, 4, 6, 8],
}

#### Call search function:

In [8]:
# first run:
# broad search: 20 sampled settings from search_params; the returned best
# estimator is displayed as the cell output (it is not stored in a variable)
randomized_search_clf(search_params,  x_train, x_test, y_train, y_test, runs=20, clf=DecisionTreeClassifier(random_state=2))

Training score: 0.7977777777777777
Test score: 0.8552631578947368


DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=0.8,
                       max_leaf_nodes=45, min_samples_leaf=0.04,
                       min_samples_split=10, min_weight_fraction_leaf=0.05,
                       random_state=2)

In [9]:
# second run
# narrower grid centred on the values chosen by the first search
search_params2 = {
    'max_depth': [None, 6, 7],
    # 'sqrt' replaces the deprecated 'auto' alias, which meant sqrt(n_features)
    # for tree classifiers and is rejected by newer scikit-learn releases.
    'max_features': ['sqrt', 0.78],
    'max_leaf_nodes': [45, None],
    'min_samples_leaf': [1, 0.035, 0.04, 0.045, 0.05],
    'min_samples_split': [2, 9, 10],
    'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],
}

In [10]:
# refined search: 100 sampled settings from the narrower search_params2 grid;
# the returned best estimator is displayed as the cell output
randomized_search_clf(search_params2,  x_train, x_test, y_train, y_test, runs=100, clf=DecisionTreeClassifier(random_state=2))

Training score: 0.8022222222222222
Test score: 0.868421052631579


DecisionTreeClassifier(max_depth=7, max_features=0.78, max_leaf_nodes=45,
                       min_samples_leaf=0.045, min_samples_split=9,
                       min_weight_fraction_leaf=0.06, random_state=2)

**Note:** as we see above, the second run with the new parameters performs better than the first run, so we'll choose this model and run it in our cross-validation function.

---
### Run the champion model on the CV function:

In [11]:
# define new model:
# hyperparameters are written out explicitly, one per line
champ = DecisionTreeClassifier(
    max_depth=7,
    max_features=0.78,
    max_leaf_nodes=45,
    min_samples_leaf=0.045,
    min_samples_split=9,
    min_weight_fraction_leaf=0.06,
    random_state=2,
)

# fit and evaluate:
# 5-fold cross-validation of the tuned tree on the full dataset (x, y)
fit_and_evaluate_model(
    classifier=champ,
    predictors_df=x,
    truth_df=y,
    num_splits=5,
)

Accuracy: [0.82 0.9  0.8  0.8  0.78]
Accuracy mean: 0.82


---
### Fit the champion model on the entire set and get feature importances:

In [12]:
# fit:
# train the champion on every row (x, y) so the importances read below
# reflect the whole dataset
champ.fit(x, y)

DecisionTreeClassifier(max_depth=7, max_features=0.78, max_leaf_nodes=45,
                       min_samples_leaf=0.045, min_samples_split=9,
                       min_weight_fraction_leaf=0.06, random_state=2)

In [13]:
# get feature importances:
# one value per column of x, in column order
champ.feature_importances_

array([0.04826754, 0.04081653, 0.48409586, 0.00568635, 0.        ,
       0.        , 0.        , 0.00859483, 0.        , 0.02690379,
       0.        , 0.18069065, 0.20494446])

In [14]:
# zip the importances to the feature names:
# relies on feature_importances_ being ordered like x.columns
feature_dict = dict(zip(x.columns, champ.feature_importances_))

In [15]:
# rank the features by importance, highest first, and keep the top three as
# (name, importance) tuples; a lambda key avoids a mid-notebook import
sorted(feature_dict.items(), key=lambda item: item[1], reverse=True)[:3]

[('cp', 0.4840958610240171),
 ('thal', 0.20494445570568706),
 ('ca', 0.18069065321397942)]

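**Note:** the numbers above are impurity-based feature importances: each value is the feature's share of the total impurity reduction achieved by the tree's splits, normalized so that all values sum to 1. So 'cp' accounts for about 48% of the tree's total impurity reduction, which is not the same as 48% of the observed variance.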

&nbsp;

---
### Auto ML with TPOT:

In [16]:
# Configure TPOT's automated pipeline search, scored on accuracy.
# NOTE(review): max_time_mins=360 presumably caps the search at 6 hours,
# which makes a full "Restart & Run All" very slow - confirm this is intended.
pipeline_optimizer = TPOTClassifier(
    scoring='accuracy',
    max_time_mins=360,
    random_state=42,
    verbosity=2
)

# Fit on the training split only; the test split is used afterwards for scoring.
pipeline_optimizer.fit(x_train, y_train)

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8106280193236716

Generation 2 - Current best internal CV score: 0.8236714975845411

Generation 3 - Current best internal CV score: 0.8236714975845411

Generation 4 - Current best internal CV score: 0.8236714975845411

Generation 5 - Current best internal CV score: 0.8236714975845411

Generation 6 - Current best internal CV score: 0.8236714975845411

Generation 7 - Current best internal CV score: 0.8236714975845411

Generation 8 - Current best internal CV score: 0.8285024154589372

Generation 9 - Current best internal CV score: 0.8285024154589372

Generation 10 - Current best internal CV score: 0.8285024154589372

Generation 11 - Current best internal CV score: 0.8285024154589372

Generation 12 - Current best internal CV score: 0.8285024154589372

Generation 13 - Current best internal CV score: 0.8285024154589372

Generation 14 - Current best internal CV score: 0.8285024154589372

Generation 15 - Current best internal CV score: 0.828502

TPOTClassifier(max_time_mins=360, random_state=42, scoring='accuracy',
               verbosity=2)

&nbsp;

#### Obtain predictions and score:

In [17]:
# predict the held-out test split with the pipeline TPOT selected
tpot_preds = pipeline_optimizer.predict(x_test)

# report test accuracy once; accuracy_score is already imported in the
# notebook's first cell, so no import is needed here
print(accuracy_score(y_test, tpot_preds))

0.8421052631578947
