In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# in the order os.walk visits them, and print one path per line.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# The ML Modeling Process Basics 
In this notebook, we will go through some of the basic techniques for modeling data. This is a companion workbook for the 365 Data Science course on ML Process. This notebook only focuses on implementation. Check out the course or the documentation for in-depth explanations of each step.

In this case, we will try to predict whether a patient has a stroke using the dataset listed above. 

We will cover:
- Baseline creation
- Model selection
- Parameter tuning
     - manual
     - gridsearch
     - random search
     - Bayesian optimization
- Ensemble models

imblearn Documentation: https://imbalanced-learn.org/stable/

### On the Data 
This dataset is a good representation of real world data that can have valuable impact when analyzed. We will be exploring the accuracy of different models for predicting if someone will have a stroke or not. We will first lightly explore the data, create our train, test / validation sets, then we will create a baseline model. To get the best results we will compare other algorithms to our baseline and use various parameter tuning techniques to see which model produces the best results. At the end we will explore some ensemble models to see what produces the best results. 

The focus of this notebook is the modeling process. If you're interested in the specifics of different machine learning algorithms, check out our other course specifically on that. 

In [None]:
# Location of the stroke dataset inside the Kaggle input directory.
stroke_csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'

# Load the raw data into `df`, which the exploration and cleaning cells read from.
df = pd.read_csv(stroke_csv_path)

## Explore the data 
We will take a very high level overview of the data in this section. We mostly want to see if there are any null values. For full workbooks on exploratory data analysis and on dealing with null values, check out these notebooks:
- EDA:
- Dealing with Null Values: 
- Dealing with Imbalanced Data: https://www.kaggle.com/kenjee/dealing-with-imbalanced-data

In [None]:
# Show the column names of the raw DataFrame.
df.columns

In [None]:
# Look at basic summary statistics (count, mean, std, min, quartiles, max).
# With no arguments, describe() only reports on the numeric columns.
df.describe()

In [None]:
# Summary (count, unique, top, freq) of the text columns.
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24, so
# referencing it raises AttributeError on current NumPy. The 'object' dtype
# string selects the same columns and matches the select_dtypes call used below.
df.describe(include=['object'])

In [None]:
# Count the missing values in each column. There are few enough of them
# that the affected rows are simply removed in the cleaning step.
df.isna().sum()

In [None]:
# Print how often each level occurs in every text column, so that placeholder
# answers (nulls in disguise) stand out.
categorical_columns = df.select_dtypes(include=['object']).columns
for column_name in categorical_columns:
    level_counts = df[column_name].value_counts()
    print(level_counts)

In [None]:
# Remove rows with missing values, discard the `id` column, renumber the index,
# then one-hot encode the categorical columns.
# Open question left by the author: should the data also be scaled here?
df_cleaned = (
    df
    .dropna()
    .drop(columns='id')
    .reset_index(drop=True)
)
df_final = pd.get_dummies(data=df_cleaned)

In [None]:
# Show the column names of the one-hot encoded frame.
df_final.columns

In [None]:
#Create train test split 

from sklearn.model_selection import train_test_split

# Features are every column except the target. The target is kept as a
# one-column DataFrame because later cells call `y_train.values.ravel()`.
X = df_final.drop('stroke', axis =1)
y = df_final.loc[:,['stroke']]

# Hold out 40% of the rows for testing. The stroke class is rare, so stratify
# on the target: train and test then share the same class ratio, instead of
# leaving it to chance how many positive cases end up on each side.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

In [None]:
#balance the data (SMOTE)
from imblearn.over_sampling import SMOTE 

# sampling_strategy=1 asks for a 1:1 minority/majority ratio after resampling.
# SMOTE picks neighbours at random, so the seed is fixed to keep the scores
# reported in this notebook reproducible from run to run.
smote = SMOTE(sampling_strategy =1, random_state=42)

# Only the training split is resampled; the test split keeps its natural class ratio.
# NOTE(review): resampling before cross-validation lets synthetic points derived from
# validation-fold rows into the training folds, so the CV scores below are likely
# optimistic. Putting SMOTE inside an imblearn Pipeline would resample within each
# fold instead -- worth considering.
X_train, y_train = smote.fit_resample(X_train,y_train)

## Creating a Baseline Model
How can we tell if our machine learning models are any good? To evaluate performance, we need to benchmark against something. In this case, we will create two baselines for our model. First, we can simply look at the average of our data for a numeric value. If we were going to predict the age, we could simply guess the average age for every candidate. 

On the other hand, for a categorical variable, we could simply guess 50/50 or the ratio of the categories in the data. In this case, the stroke data is imbalanced with 3160 / 3289 samples being of the non-stroke category. That means that if we guessed that no one in the sample had a stroke, we would have a 96.1% success rate. Since this data is heavily imbalanced, this would not be a good baseline for our model.

One of the most important steps that we need to take is choosing a good evaluation metric. The notebook that covers specific evaluation metrics can be located here: 

Accuracy does not make sense because of the imbalanced nature of the data. For this example we will use F1 score as our model evaluation metric.

- F1 is calculated as $F_1 = 2 \cdot \dfrac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$

- Instead of a simple accuracy calculation, which would give us a baseline of 96.1%, the F1 score of a model that only predicted negatives is undefined: with no positive predictions, recall is 0 and precision is 0/0. (scikit-learn reports this case as 0 and emits a warning.) 

- In this case, we want to use a simple baseline model like Naive Bayes to set our baseline based on F1 score. You can use most models to create a baseline, but I like Naive Bayes because it is quick and doesn't require much parameter tuning. (Full breakdown of Naive Bayes in our Algorithms Course)


In [None]:
#import cross validation score
from sklearn.model_selection import cross_val_score

#import Naive Bayes Classifier 
from sklearn.naive_bayes import GaussianNB

#create classifier object
nb = GaussianNB()

#run cv for NB classifier
from sklearn.metrics import classification_report

# 5-fold cross-validation on the training data, once per metric.
# scoring='f1' is the binary F1 of the positive class (label 1), not a macro
# average, so the printed labels say "F1" rather than "F1_Macro".
nb_accuracy = cross_val_score(nb,X_train,y_train.values.ravel(), cv=5, scoring ='accuracy')
nb_f1 = cross_val_score(nb,X_train,y_train.values.ravel(), cv=5, scoring ='f1')

print('nb_accuracy: ' +str(nb_accuracy))
print('nb F1 Score: '+str(nb_f1))
# The F1 average was previously labelled 'lr_f1_avg' (copied from the
# logistic regression cell); it reports the Naive Bayes score.
print('nb_accuracy_avg: ' + str(nb_accuracy.mean()) +'  |  nb_f1_avg: '+str(nb_f1.mean()))


#With these F1 scores, we can begin evaluating our model. While the accuracy is lower than if we only predicted 0 every time,
# our f1 score suggests we are doing a far better job of predicting stroke outcomes. 

## Model Comparison & Selection 
After we have a baseline model to compare against, we want to evaluate how other models might perform on the same data. I like to experiment with other basic models with very little parameter tuning to see what performs well. This isn't an exact science and many people may do this step differently. After we set up the models, we can begin experimenting with parameter tuning. I find that model selection and parameter tuning is often an iterative process. For an analysis like this, trying different models, changing parameters, and experimenting with new engineered features is where I find myself spending most of my time working. 

In this section we will try:
- Logistic regression
- Decision Tree
- Support Vector Machines (SVM)
- K Nearest Neighbors (KNN)

In [None]:
#Let's now experiment with a few different basic models 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline #creates chained events 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


def _cv_accuracy_and_f1(label, estimator):
    """Cross-validate `estimator` on the training data and print its scores.

    A single 5-fold cross-validation records both metrics, so every model is
    fitted once per fold instead of once per fold per metric.

    Parameters
    ----------
    label : str
        Short model name used as the prefix of the printed lines (e.g. 'lr').
    estimator : scikit-learn classifier or pipeline
        Unfitted model to evaluate on the notebook-level X_train / y_train.

    Returns
    -------
    (accuracy, f1) : tuple of arrays
        Per-fold accuracy and per-fold binary F1 (positive class, label 1).
    """
    scores = cross_validate(estimator, X_train, y_train.values.ravel(),
                            cv=5, scoring=['accuracy', 'f1'])
    accuracy, f1 = scores['test_accuracy'], scores['test_f1']

    print(label + '_accuracy: ' + str(accuracy))
    # scoring='f1' is the binary F1, not a macro average, hence the plain label.
    print(label + ' F1 Score: ' + str(f1))
    print(label + '_accuracy_avg: ' + str(accuracy.mean())
          + '  |  ' + label + '_f1_avg: ' + str(f1.mean()))
    return accuracy, f1


## Logistic Regression
lr = LogisticRegression(random_state=32)
lr_accuracy, lr_f1 = _cv_accuracy_and_f1('lr', lr)
print()  # blank line between models

## Decision Tree
# Seeded so the comparison is reproducible (the tuned tree later on is seeded too).
dt = DecisionTreeClassifier(random_state=42)
dt_accuracy, dt_f1 = _cv_accuracy_and_f1('dt', dt)
print()

## SVM - Requires feature scaling (more on features scaling in this notebook: )
# The scaler lives inside the pipeline, so each fold fits it on its own training part.
svc = Pipeline([('scale',StandardScaler()), ('svc',SVC())])
svc_accuracy, svc_f1 = _cv_accuracy_and_f1('svc', svc)
print()

## KNN 
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
knn_accuracy, knn_f1 = _cv_accuracy_and_f1('knn', knn)


## Model Comparison 
Our baseline gives us a useful reference point, but it does not hold up against the other models: every new model outperforms it in F1 score, which is what we care about most for this analysis. Let's look at how everything stacks up. 

|Model          | F1 Score      |
| :------------ | :-----------: |
| **Baseline Naive Bayes**  | **81.0%**     |
| K Nearest Neighbors | **97.2%**     |
| Support Vector Clf | **95.8%**     |
| Decision Tree  | **95.1%**     |
| Logistic Regression  | **94.0%**     |


While all of our models outperformed our baseline, we still can do better. We can now parameter tune! That means that we make adjustments to the model parameter inputs to better compensate for our specific data. One of the drawbacks of Naive Bayes is that it has virtually no parameters that we can tune, so our initial results are about the best we will get with it without making changes to our data. 



## Manual Parameter Tuning
Let's try to do some parameter tuning with a few of these models:

Let's start with K Nearest Neighbors, which has a few parameters we can adjust, one of them being the value of K. K is how many neighboring datapoints the model uses to make its classification. If k = 3, it looks at the sample's 3 closest neighbors and assigns the most common class among them. If k = 5, it uses the 5 closest datapoints. Let's change the value of k and see if that changes our results. 

In [None]:
#Knn Model Comparison 

# Sweep the neighbour count from 1 to 19 and report the mean 3-fold F1 for each value.
for k in range(1, 20):
    knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    knn_f1 = cross_val_score(knn, X_train, y_train.values.ravel(), scoring='f1', cv=3)
    mean_f1 = knn_f1.mean()
    print(''.join(['K =', str(k), ': ', str(mean_f1)]))

# Finding: k=2 is the best estimator for this specific model.
# KNN may still not be the best approach here because of the imbalanced data:
# the larger K is, the more of the majority class will automatically be included.

## Randomized Parameter Search
Since KNN may not be the best choice, let's explore the decision tree. Decision trees have a lot more parameters we can tune. We can tweak the following:
- criterion {gini, entropy, log loss}
- splitter {best, random}
- max depth {int, None}
- min_samples_split {int, None}
- min_samples_leaf {int, None}
- min_weight_fraction_leaf {float}
- max_features {int, auto, sqrt, log2, None}
- max_leaf_nodes {int, None}
- min_impurity_decrease {float}
- class_weight {dict, balanced, None}
- ccp_alpha {float}

There are a lot of parameters to tune! If there are just 2 options for each one that would be 2^11, which is 2048 total configurations. In theory, there is an infinite number of parameter configurations. How do we even get close to finding the best one? 

The answer here is randomized search. We throw in all the parameters that we are interested in searching, and the search will randomly try a subset of the combinations and return the one that produces the best results. 

Still, let's manually select a few parameters we want to evaluate on and then use randomized search:
- criterion
- split strategy
- max depth
- min_samples_split
- max features

In [None]:
from sklearn.model_selection import RandomizedSearchCV

dt = DecisionTreeClassifier(random_state = 42)

# Candidate values for each hyperparameter; the search samples combinations from these lists.
# 'auto' is no longer offered for max_features: for DecisionTreeClassifier it meant the
# same as 'sqrt', was deprecated in scikit-learn 1.1 and removed in 1.3, so on current
# versions every candidate using it fails to fit.
features = {'criterion': ['gini','entropy'],
            'splitter': ['best','random'],
           'max_depth': [2,5,10,20,40,None],
           'min_samples_split': [2,5,10,15],
           'max_features': ['sqrt','log2',None]}

# Evaluate 100 randomly drawn combinations with 3-fold CV, ranked by binary F1.
rs_dt = RandomizedSearchCV(estimator = dt, param_distributions =features, n_iter =100, cv = 3, random_state = 42, scoring ='f1')

# Pass the target as a 1-D array, as the cross_val_score calls in this notebook do.
rs_dt.fit(X_train,y_train.values.ravel())

In [None]:
# Best mean cross-validated F1 found by the randomized search and the
# parameter combination that produced it ('stcore' typo fixed in the label).
print('best score = ' + str(rs_dt.best_score_))
print('best params = ' + str(rs_dt.best_params_))

## GridsearchCV
With this we have improved our model f1 score from **95.1% to 96.0%**. This is a decent increase! We also narrowed down some of the parameter values that produced good results. We may want to try a more exhaustive search this time. Gridsearch goes through all of the possible combinations within a range and returns the best outcome. 

This time, let's do an exhaustive search of a smaller number of features and see if we can improve our results even more. 

In [None]:
from sklearn.model_selection import GridSearchCV


# Exhaustive grid over a narrower range: 20 depths x 8 split sizes, with
# criterion, splitter and max_features held fixed.
features_gs = {'criterion': ['entropy'],
            'splitter': ['random'],
           'max_depth': np.arange(30,50,1), #getting more precise within range
           'min_samples_split': [2,3,4,5,6,7,8,9],
           'max_features': [None]}

# `dt` is the seeded DecisionTreeClassifier created in the randomized-search cell.
gs_dt = GridSearchCV(estimator = dt, param_grid =features_gs, cv = 3, scoring ='f1') #we don't need random state because there isn't randomization like before

# Pass the target as a 1-D array, as the cross_val_score calls in this notebook do.
gs_dt.fit(X_train,y_train.values.ravel())

In [None]:
# Best mean cross-validated F1 found by the grid search and the parameter
# combination that produced it ('stcore' typo fixed in the label).
print('best score = ' + str(gs_dt.best_score_))
print('best params = ' + str(gs_dt.best_params_))

#looks like we can  do a little better with this gridsearch! 

## Bayesian Optimization
I wonder if we can do better than the funnel approach that we took with random search and gridsearch. What if we used a slightly smarter algorithm to help evaluate our parameters? Maybe we could explore all of the variables from the previous examples and see if our model missed something. This is where Bayesian Optimization comes in. This is an iterative process where our model improves its understanding of the parameter inputs as it goes. (Full breakdown in the video portion of the course)

Now let's try to use this with a larger parameter set on the SVC classifier. This won't guarantee a better result as it still is not an exhaustive search, but in theory it lets us cover ground in a more efficient way. 

In [None]:
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold

In [None]:

# Scale inside a pipeline: each CV fold then fits the scaler on its own training
# part only. Scaling the whole training matrix up front (as before) lets statistics
# from the validation fold leak into training.
svc_est = Pipeline([('scale', StandardScaler()), ('svc', SVC())])

# Choose cross validation method 
cv = StratifiedKFold(n_splits = 3)

# Pre-scaled copy of the training matrix. The search below no longer uses it;
# it is kept only in case other cells reference these names.
bscaler = StandardScaler().fit(X_train)
x_train_scaled = bscaler.transform(X_train)

from skopt import BayesSearchCV

# Search space. A (low, high, prior) tuple is a continuous range, a (low, high)
# tuple of ints an integer range, and a list a categorical choice.
features_svc = {
        'svc__C': (1e-3, 1e+3, 'log-uniform'), #the svc__ prefix routes the parameter to the 'svc' step of the pipeline
        'svc__gamma': (1e-3, 1e+1, 'log-uniform'),
        'svc__kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
        'svc__degree': (1, 8),  # integer valued parameter
    }

# 10 optimisation iterations, each scored by 3-fold stratified CV on binary F1.
bs_svc = BayesSearchCV(svc_est,features_svc, random_state=42, n_iter= 10, cv= cv, scoring ='f1')
 
# Fit on the unscaled training data; the pipeline takes care of the scaling.
bs_svc.fit(X_train,y_train.values.ravel())

In [None]:
# Best mean cross-validated F1 found by the Bayesian search and the parameter
# combination that produced it ('stcore' typo fixed in the label).
print('best score = ' + str(bs_svc.best_score_))
print('best params = ' + str(bs_svc.best_params_))

#while this didn't outperform our gridsearch, it is still a good approach to try when dealing with many different feature options. 
#it still did outperform our originial random search. 95.8 to 96.4%!

## Selecting a Model
We still haven't been able to do better than our baseline. In most cases, we need to tune multiple different models until we reach one that performs the best based on our evaluation criteria. We also want to use other considerations like training time, prediction time, or interpretability to select the best model for our use case. 

Since we have one tuned model, lets see if we can improve it by combining it with a few of the other models we have used. This process is called ensembling. In the case of classification, we often use a popular vote metric to select the best model. 

Let's see if an ensemble model of these three classifiers outperforms our baseline model. 

In [None]:
from sklearn.ensemble import VotingClassifier

# Hyperparameters of the tree and the SVC, held as dicts.
# The ** operator unpacks a dict into keyword arguments of the constructor.
dt_voting_params = {
    'criterion': 'entropy',
    'max_depth': 30,
    'max_features': None,
    'min_samples_split': 6,
    'splitter': 'random',
    'random_state': 42,
}
svc_voting_params = {
    'C': 1.8214548318355843,
    'gamma': 0.09679809588492402,
    'kernel': 'rbf',
}

dt_voting = DecisionTreeClassifier(**dt_voting_params)
knn_voting = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=2))
svc_voting = make_pipeline(StandardScaler(), SVC(**svc_voting_params))

# Hard voting: the ensemble predicts the class chosen by most of its members.
voting_members = [('dt', dt_voting), ('knn', knn_voting), ('svc', svc_voting)]
ens = VotingClassifier(estimators=voting_members, voting='hard')



In [None]:
# 5-fold cross-validation of the voting ensemble on the training data, once per metric.
voting_accuracy = cross_val_score(ens,X_train,y_train.values.ravel(), cv=5, scoring ='accuracy')
voting_f1 = cross_val_score(ens,X_train,y_train.values.ravel(), cv=5, scoring ='f1')

print('voting_accuracy: ' +str(voting_accuracy))
# scoring='f1' is the binary F1 of the positive class, not a macro average.
print('voting F1 Score: '+str(voting_f1))
print('voting_accuracy_avg: ' + str(voting_accuracy.mean()) +'  |  voting_f1_avg: '+str(voting_f1.mean()))

## Stacked classifier 
In the case of the voting classifier, we didn't get better performance than our baseline model. Let's now try another type of ensembling called stacking. With stacking, we use the outputs of each of our individual models as features into a new model. In this case, where we have a decision tree, a k-nearest-neighbors classifier, and an SVC classifier, these will be the three features that a new model predicts on. 

Let's try running these three through a Logistic Regression classifier and see what the results look like. 

In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners whose outputs become the input features of the final estimator;
# a logistic regression then combines them into the final prediction.
stack_members = [('dt', dt_voting), ('knn', knn_voting), ('svc', svc_voting)]
ens_stack = StackingClassifier(
    estimators=stack_members,
    final_estimator=LogisticRegression(),
)

In [None]:
# 5-fold cross-validation of the stacked ensemble on the training data, once per metric.
stack_accuracy = cross_val_score(ens_stack,X_train,y_train.values.ravel(), cv=5, scoring ='accuracy')
stack_f1 = cross_val_score(ens_stack,X_train,y_train.values.ravel(), cv=5, scoring ='f1')

print('stacking_accuracy: ' +str(stack_accuracy))
# scoring='f1' is the binary F1 of the positive class, not a macro average.
print('stacking F1 Score: '+str(stack_f1))
print('stacking_accuracy_avg: ' + str(stack_accuracy.mean()) +'  |  stack_f1_avg: '+str(stack_f1.mean()))

#in this case it didn't outperfrom, but it often does.

## Ensemble Models
The last main type of ensemble approach that we see is one that is designed that way algorithmically. Typically, random forest or gradient boosted models have ensembling built into their implementation. Let's explore random forest and see how this approach works for our data. (We have a breakdown of the main ensembling techniques in our full course on algorithms). These algorithms leverage multiple decision trees to either vote or pass information on to subsequent models. 

In [None]:
from sklearn.ensemble import RandomForestClassifier

#first let's try a non-tuned implementation 
# Default hyperparameters, but seeded: a random forest bootstraps rows and samples
# features at random, so without a seed the two runs below would be scored on
# different forests and the numbers would change on every execution.
rf = RandomForestClassifier(random_state=42)

rf_accuracy = cross_val_score(rf,X_train,y_train.values.ravel(), cv=5, scoring ='accuracy')
rf_f1 = cross_val_score(rf,X_train,y_train.values.ravel(), cv=5, scoring ='f1')

In [None]:
# Per-fold and mean scores of the untuned random forest.
print('rf_accuracy: ' +str(rf_accuracy))
# rf_f1 was computed with scoring='f1' (binary F1), so it is not a macro average.
print('rf F1 Score: '+str(rf_f1))
print('rf_accuracy_avg: ' + str(rf_accuracy.mean()) +'  |  rf_f1_avg: '+str(rf_f1.mean()))

#of course, you can tune this model like the others! 

In [None]:
from sklearn.metrics import f1_score

# Seeded so the test-set score of the forest is reproducible between runs.
rf_est = RandomForestClassifier(random_state=42)

# Refit every candidate on the whole training set.
nb.fit(X_train,y_train.values.ravel())
ens.fit(X_train,y_train.values.ravel())
dt_voting.fit(X_train,y_train.values.ravel())
ens_stack.fit(X_train,y_train.values.ravel())
rf_est.fit(X_train,y_train.values.ravel())

# Predict on the held-out test split, which none of the models were fitted on.
nb_pred = nb.predict(X_test)
ens_pred = ens.predict(X_test)
dt_pred = dt_voting.predict(X_test)
ens_stack_pred = ens_stack.predict(X_test)
rf_pred = rf_est.predict(X_test)

# Label each score; a bare list of numbers is hard to match back to its model.
test_predictions = [
    ('naive bayes', nb_pred),
    ('voting ensemble', ens_pred),
    ('decision tree', dt_pred),
    ('stacking ensemble', ens_stack_pred),
    ('random forest', rf_pred),
]
for model_name, test_pred in test_predictions:
    print(model_name + ' test F1: ' + str(f1_score(y_test, test_pred)))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, roc_curve, precision_recall_curve, auc, confusion_matrix, roc_auc_score

# Confusion matrix of the random forest's test-set predictions.
# confusion_matrix puts the true classes on the rows and the predicted classes on the columns.
cf_matrix = confusion_matrix(y_test, rf_pred)

# Draw on an explicit Axes and label both axes so the figure can be read on its own.
fig, ax = plt.subplots()
sns.heatmap(cf_matrix, annot = True, fmt = 'g', cmap = sns.cubehelix_palette(as_cmap=True), ax = ax)
ax.set_title('Confusion Matrix:')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label');

In [None]:
# make stacking classifier 
# make random forest classifier 
# record video 
# test on test set 

In [None]:
# Class counts of the training target (resampled, if the SMOTE cell has been run).
y_train.value_counts()

In [None]:

## Ensemble 
## Model Evaluation 
