### Use the preprocessed data for ML model building. 

In [31]:
# Setup
import numpy as np 
import pandas as pd 
import os
import pickle

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images/ml_modeling_images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension; also echoed in the progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str, default "png"
        File extension, also handed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Handed to ``plt.savefig`` as ``dpi``.
    """
    destination = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, dpi=resolution, format=fig_extension)

def save_model(model, folder_path="models", file_name="untitled_model.sav"):
    """Pickle ``model`` to ``folder_path/file_name``.

    Parameters
    ----------
    model : object
        Any picklable object (here: fitted estimators).
    folder_path : str, default "models"
        Destination directory; it is created if it does not exist yet.
    file_name : str, default "untitled_model.sav"
        Name of the pickle file inside ``folder_path``.
    """
    # Create the destination on demand so a fresh checkout does not fail.
    # An empty folder_path means "current directory" and needs no creation.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)
    # The context manager flushes and closes the handle even if pickling raises
    with open(os.path.join(folder_path, file_name), 'wb') as model_file:
        pickle.dump(model, model_file)

# Folder holding the preprocessed dataset
HEARTDISEASE_PATH = "dataset/afterpreprocessing"

heartdisease_csv = os.path.join(HEARTDISEASE_PATH, 'heartDisease.csv')
heartdisease_data = pd.read_csv(heartdisease_csv)
# Preview the first rows of the loaded frame
heartdisease_data.head()

Unnamed: 0,heartDisease,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity,genHealth,sleepTime,asthma,kidneyDisease,skinCancer
0,2.0,1660.0,1.0,1,2.0,3.0,30.0,2.0,2,8,1,1.0,1,2.0,5.0,1.0,2.0,1.0
1,2.0,2034.0,2.0,1,1.0,88.0,88.0,2.0,2,13,1,3.0,1,2.0,7.0,2.0,2.0,2.0
2,2.0,2658.0,1.0,1,2.0,20.0,30.0,2.0,1,10,1,1.0,1,4.0,8.0,1.0,2.0,2.0
3,2.0,2421.0,2.0,1,2.0,88.0,88.0,2.0,2,12,1,3.0,2,3.0,6.0,2.0,2.0,1.0
4,2.0,2657.0,1.0,9,2.0,15.0,10.0,2.0,2,10,2,1.0,1,4.0,6.0,2.0,2.0,2.0


In [32]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns
target = heartdisease_data['heartDisease']
features = heartdisease_data.drop(columns=['heartDisease'])

# Shuffled 80/20 hold-out split; the fixed random_state keeps the partition repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=.2,
    shuffle=True,
    random_state=44,
)

x_train, y_train

(           BMI  smoking  alcoholDrinking  stroke  physicalHealth   
 337324  2146.0      2.0                1     2.0            88.0  \
 103565  2762.0      1.0                1     2.0            88.0   
 179235  2629.0      2.0                1     2.0            88.0   
 174519  2381.0      2.0                1     2.0            88.0   
 27670   1919.0      1.0                1     2.0            88.0   
 ...        ...      ...              ...     ...             ...   
 49723   4288.0      2.0                1     2.0             1.0   
 156845  2658.0      1.0                1     2.0            88.0   
 256753  3109.0      2.0                1     2.0            88.0   
 200099  1967.0      2.0                1     2.0             1.0   
 14100   3087.0      2.0                1     2.0             3.0   
 
         mentalHealth  diffWalking  sex  ageCategory  race  diabetic   
 337324          88.0          2.0    2            9     1       4.0  \
 103565           4.0     

### Full Feature Set With Decision Tree

In [43]:
# Training and Visualizing a Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from graphviz import Source

MODELS_PATH = os.path.join(PROJECT_ROOT_DIR, "models")
DECISION_TREE_PATH = os.path.join(MODELS_PATH, 'decision_tree')

def FullDecisionTreeModel(df):
    """Fit a depth-2 decision tree on every feature of ``df``, render it, and pickle it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the label column ``heartDisease``; all remaining columns
        are used as features. The tree is fitted on every row of ``df``.

    Side effects
    ------------
    * Renders the tree as PNG to ``IMAGES_PATH/decision_tree/decision_tree``.
    * Pickles the fitted tree to
      ``DECISION_TREE_PATH/full_decision_tree_model.sav`` via ``save_model``.
    """
    inputs = df.drop(columns=['heartDisease'])  # avoid shadowing the builtin ``input``
    target = df['heartDisease']

    tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
    tree_clf.fit(inputs, target)

    dot_source = export_graphviz(
        tree_clf,
        out_file=None,
        feature_names=list(inputs.columns),
        # One label per fitted class, in the estimator's own (ascending) order.
        # Passing str(target) handed graphviz the Series repr, which it indexed
        # character by character, so the nodes never showed real class names.
        class_names=[str(label) for label in tree_clf.classes_],
        rounded=True,
        filled=True,
    )
    graph = Source(dot_source)
    graph.format = 'png'
    graph.render(os.path.join(IMAGES_PATH, 'decision_tree', 'decision_tree'), view=False)

    save_model(tree_clf, DECISION_TREE_PATH, 'full_decision_tree_model.sav')

# NOTE(review): this fits on the complete frame, i.e. including the rows that the
# train/test split above assigned to the test partition.
FullDecisionTreeModel(heartdisease_data)

### Full Feature Set with KNN

In [34]:
# KNN algorithm

from sklearn.neighbors import KNeighborsClassifier

KNN_PATH = os.path.join(MODELS_PATH, 'knn')

def FullKNNModel(x_train, y_train):
    """Fit a 5-nearest-neighbours classifier and pickle it.

    Parameters
    ----------
    x_train : training feature matrix passed to ``KNeighborsClassifier.fit``.
    y_train : training labels passed to ``KNeighborsClassifier.fit``.

    The fitted estimator is written to ``KNN_PATH/full_knn_model.sav``
    through ``save_model``; nothing is returned.
    """
    knn_clf = KNeighborsClassifier(n_neighbors=5)
    knn_clf.fit(x_train, y_train)
    save_model(knn_clf, KNN_PATH, 'full_knn_model.sav')

FullKNNModel(x_train, y_train)

### Full Feature Set with Logistic Regression

We chose the Newton-Cholesky solver for our logistic regression model because it works well when the dataset has many more samples than features and the task is binary classification.

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

LOGREG_PATH = os.path.join(MODELS_PATH, 'log_regression')

def FullLogisticRegression(x_train, y_train):
    """Fit a standardised logistic regression and pickle it.

    The previous version fitted a ``StandardScaler`` but never applied it, so
    the classifier was trained on the raw, unscaled features. Scaler and
    classifier are now chained in one pipeline: scaling is applied during
    fitting, and the pickled object applies the same scaling to whatever data
    is later passed to ``predict`` / ``predict_proba`` / ``score``.

    Parameters
    ----------
    x_train : training feature matrix.
    y_train : training labels.

    The fitted pipeline is written to
    ``LOGREG_PATH/full_logregression_model.sav`` through ``save_model``;
    nothing is returned.
    """
    # Function-scope import keeps this cell self-contained
    from sklearn.pipeline import make_pipeline

    model = make_pipeline(
        StandardScaler(),
        LogisticRegression(solver='newton-cholesky', random_state=1),
    )
    model.fit(x_train, y_train)
    # The former predict / predict_proba / score calls discarded their results
    # and had no effect, so they are not repeated here.
    save_model(model, LOGREG_PATH, 'full_logregression_model.sav')

FullLogisticRegression(x_train, y_train)

## Top Feature Selector

In [36]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def top_feature_selector(df, top_num=20):
    """Print the ``top_num`` features with the highest chi-squared score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the label column ``heartDisease``; every other column is
        scored against it.
    top_num : int, default 20
        Number of rows of the ranking to print. Values larger than the number
        of features simply print the whole ranking.

    Returns
    -------
    pandas.DataFrame
        Single-column frame listing *all* scored column names in their
        original order (not only the top ones); kept unchanged for existing
        callers.
    """
    inputs = df.drop(columns=['heartDisease'])  # avoid shadowing the builtin ``input``
    target = df['heartDisease']

    # Only the per-feature scores are read below, so no feature is filtered
    # out here. The former hard-coded k=3 ignored ``top_num`` and could fail
    # validation on frames with fewer than three features.
    scorer = SelectKBest(score_func=chi2, k='all').fit(inputs, target)

    dfcolumns = pd.DataFrame(inputs.columns)
    featureScores = pd.DataFrame({
        'Specs': list(inputs.columns),
        'Score': scorer.scores_,
    })
    print(featureScores.nlargest(top_num, 'Score'))  # the top_num best-scoring features
    print('----------------------------------------------------')
    return dfcolumns

top_feature_selector(heartdisease_data, 17)

               Specs          Score
0                BMI  129421.654142
4     physicalHealth   62981.827052
8        ageCategory   31533.017009
12         genHealth    9905.312449
5       mentalHealth    4145.620903
10          diabetic    1827.704221
13         sleepTime    1289.714567
6        diffWalking     873.817489
1            smoking     599.171503
11  physicalActivity     584.413317
9               race     540.749865
7                sex     268.426303
3             stroke     185.787347
16        skinCancer     124.094219
15     kidneyDisease      71.822225
2    alcoholDrinking      45.439215
14            asthma      32.352834
----------------------------------------------------


Unnamed: 0,0
0,BMI
1,smoking
2,alcoholDrinking
3,stroke
4,physicalHealth
5,mentalHealth
6,diffWalking
7,sex
8,ageCategory
9,race


In [37]:
# Build the reduced data set by removing five feature columns.
# NOTE(review): genHealth and sleepTime rank 4th and 7th in the chi2 table
# printed above, so "irrelevant" does not follow from that ranking - confirm
# that this selection is intentional.
columns_to_drop = ["genHealth", "sleepTime", "asthma", "kidneyDisease", "skinCancer"]
reduced_heartdisease_data = heartdisease_data.drop(columns=columns_to_drop)

In [38]:

# Label and predictors of the reduced data set
red_target = reduced_heartdisease_data['heartDisease']
red_features = reduced_heartdisease_data.drop(columns=['heartDisease'])

# Same shuffled 80/20 split settings (random_state=44) as used for the full feature set
red_x_train, red_x_test, red_y_train, red_y_test = train_test_split(
    red_features,
    red_target,
    test_size=.2,
    shuffle=True,
    random_state=44,
)

### Reduced Feature Set With Decision Tree

In [39]:
def ReducedDecisionTreeModel(df):
    """Fit a depth-2 decision tree on the reduced feature set, render it, and pickle it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the label column ``heartDisease``; all remaining columns
        are used as features. The tree is fitted on every row of ``df``.

    Side effects
    ------------
    * Renders the tree as PNG to ``IMAGES_PATH/decision_tree/reduced_decision_tree``.
    * Pickles the fitted tree to
      ``DECISION_TREE_PATH/reduced_decision_tree_model.sav`` via ``save_model``.
    """
    inputs = df.drop(columns='heartDisease')  # avoid shadowing the builtin ``input``
    target = df['heartDisease']

    tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
    tree_clf.fit(inputs, target)

    dot_source = export_graphviz(
        tree_clf,
        out_file=None,
        feature_names=list(inputs.columns),
        # One label per fitted class, in the estimator's own (ascending) order.
        # str(target) passed the Series repr, which graphviz indexed character
        # by character instead of showing class names.
        class_names=[str(label) for label in tree_clf.classes_],
        rounded=True,
        filled=True,
    )
    graph = Source(dot_source)
    graph.format = 'png'
    graph.render(os.path.join(IMAGES_PATH, 'decision_tree', 'reduced_decision_tree'), view=False)

    save_model(tree_clf, DECISION_TREE_PATH, 'reduced_decision_tree_model.sav')

ReducedDecisionTreeModel(reduced_heartdisease_data)

### Reduced Feature Set With K-Means

In [40]:
from sklearn.cluster import KMeans

KMEANS_PATH = os.path.join(MODELS_PATH, 'kmeans')

# Cluster on the reduced *training features* only. Fitting on
# reduced_heartdisease_data would feed the heartDisease label itself into the
# clustering and make the saved model expect a column that feature-only data
# (e.g. red_x_test) does not have.
kmeans_clf = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(red_x_train)

save_model(kmeans_clf, KMEANS_PATH, 'reduced_kmeans_model.sav')


### Reduced Feature Set with Random Forest

In [42]:
from sklearn.ensemble import RandomForestClassifier
import time

RANDOMFOREST_PATH = os.path.join(MODELS_PATH, 'randomforest')

def ReducedRandomForestModel(x_train, y_train):
    """Fit a 100-tree random forest, report the training time, and pickle it.

    Parameters
    ----------
    x_train : training feature matrix.
    y_train : training labels.

    Prints ``Training Time: <seconds>s`` and writes the fitted forest to
    ``RANDOMFOREST_PATH/reduced_randomforest_model.sav`` through
    ``save_model``; nothing is returned.
    """
    clf = RandomForestClassifier(n_estimators=100, random_state=40)
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    clf.fit(x_train, y_train)
    elapsed = time.perf_counter() - start_time
    print("Training Time: {:.2f}s".format(elapsed))
    save_model(clf, RANDOMFOREST_PATH, 'reduced_randomforest_model.sav')

ReducedRandomForestModel(red_x_train, red_y_train)
    

Training Time: 23.78s


## Load Models

In [None]:

def load_model(folder_path, file_name):
    """Unpickle and return the object stored at ``folder_path/file_name``.

    Counterpart of ``save_model``. ``pickle.load`` can execute arbitrary code,
    so only use it on files written by this notebook, never on untrusted ones.
    """
    # The context manager closes the file handle once the object is loaded
    with open(os.path.join(folder_path, file_name), 'rb') as model_file:
        return pickle.load(model_file)

# Load exactly the models this notebook saves. The previous cell pointed at
# file names that are never written here ("Overall_Health_*", "alzheimers_*")
# and at OneClassSVM_PATH / ISOLATION_FOREST_PATH, which are not defined in
# this notebook, so it raised NameError before loading anything.
loaded_full_decisiontree_model = load_model(DECISION_TREE_PATH, "full_decision_tree_model.sav")
loaded_reduced_decisiontree_model = load_model(DECISION_TREE_PATH, "reduced_decision_tree_model.sav")
loaded_full_knn_model = load_model(KNN_PATH, "full_knn_model.sav")
loaded_full_logregression_model = load_model(LOGREG_PATH, "full_logregression_model.sav")
loaded_kmeans_model = load_model(KMEANS_PATH, "reduced_kmeans_model.sav")
loaded_reduced_randomforest_model = load_model(RANDOMFOREST_PATH, "reduced_randomforest_model.sav")