### Use the preprocessed data for ML model building. 

In [36]:
# Setup
import numpy as np 
import pandas as pd 
import os
import pickle
import time

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images/ml_modeling_images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

def save_model(model, folder_path="models", file_name="untitled_model.sav"):
    pickle.dump(model, open(os.path.join(folder_path, file_name), 'wb'))

### Read in data

#### Full feature set

In [37]:
HEARTDISEASE_FULL_PATH = "dataset/afterpreprocessing/full"

heartdisease_data = pd.read_csv(os.path.join(HEARTDISEASE_FULL_PATH, f'heartDisease.csv'))
heartdisease_data.head()

Unnamed: 0,heartDisease,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity,genHealth,sleepTime,asthma,kidneyDisease,skinCancer
0,2,1660,1,1,2,3,30,2,2,8,1,1,1,2,5,1,2,1
1,2,2034,2,1,1,88,88,2,2,13,1,3,1,2,7,2,2,2
2,2,2658,1,1,2,20,30,2,1,10,1,1,1,4,8,1,2,2
3,2,2421,2,1,2,88,88,2,2,12,1,3,2,3,6,2,2,1
4,2,2657,1,9,2,15,10,2,2,10,2,1,1,4,6,2,2,2


In [38]:
x_train = pd.read_csv(os.path.join(HEARTDISEASE_FULL_PATH, f'heartDisease_x_train.csv'))
x_train.head()

Unnamed: 0,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity,genHealth,sleepTime,asthma,kidneyDisease,skinCancer
0,2986,2,1,2,30,88,1,1,8,2,3,1,5,6,2,2,2
1,2614,2,1,2,88,88,1,1,11,2,3,2,4,8,2,2,2
2,3082,2,1,2,2,88,2,1,10,2,1,1,2,8,2,1,2
3,2349,1,1,2,88,88,2,2,11,1,3,2,3,9,2,2,2
4,2561,1,1,1,30,88,1,2,8,6,3,2,4,6,1,2,2


In [39]:
y_train = pd.read_csv(os.path.join(HEARTDISEASE_FULL_PATH, f'heartDisease_y_train.csv'))
y_train.head()

Unnamed: 0,heartDisease
0,1
1,1
2,1
3,1
4,1


#### Reduced feature set

In [40]:
HEARTDISEASE_RED_PATH = "dataset/afterpreprocessing/reduced"

reduced_heartdisease_data = pd.read_csv(os.path.join(HEARTDISEASE_RED_PATH, f'heartDisease_red.csv'))
heartdisease_data.head()

Unnamed: 0,heartDisease,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity,genHealth,sleepTime,asthma,kidneyDisease,skinCancer
0,2,1660,1,1,2,3,30,2,2,8,1,1,1,2,5,1,2,1
1,2,2034,2,1,1,88,88,2,2,13,1,3,1,2,7,2,2,2
2,2,2658,1,1,2,20,30,2,1,10,1,1,1,4,8,1,2,2
3,2,2421,2,1,2,88,88,2,2,12,1,3,2,3,6,2,2,1
4,2,2657,1,9,2,15,10,2,2,10,2,1,1,4,6,2,2,2


In [41]:
red_x_train = pd.read_csv(os.path.join(HEARTDISEASE_RED_PATH, f'heartDisease_red_x_train.csv'))
red_x_train.head()

Unnamed: 0,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity
0,2573,2,1,2,88,88,2,1,8,1,3,1
1,3613,1,1,2,88,20,1,2,11,1,1,1
2,2568,2,1,2,88,88,2,1,6,1,3,1
3,2771,2,1,2,88,88,2,1,10,5,3,1
4,3595,1,1,2,88,88,2,1,8,1,3,2


In [42]:
red_y_train = pd.read_csv(os.path.join(HEARTDISEASE_RED_PATH, f'heartDisease_red_y_train.csv'))
red_y_train.head()

Unnamed: 0,heartDisease
0,2
1,1
2,2
3,2
4,2


### Full Feature Set With Decision Tree

In [43]:
# Training and Visualizing a Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from graphviz import Source

MODELS_PATH = os.path.join(PROJECT_ROOT_DIR, "models")
DECISION_TREE_PATH = os.path.join(MODELS_PATH, 'decision_tree')

def FullDecisionTreeModel(x_train, y_train):
    tree_clf = DecisionTreeClassifier(max_depth=5,random_state=42)
    start_time = time.time()
    tree_clf.fit(x_train, y_train)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    graph = Source(export_graphviz(tree_clf, out_file=None, feature_names=x_train.columns, class_names=str(y_train), rounded=True, filled=True))
    graph.format = 'png'
    graph.render( IMAGES_PATH + '/decision_tree/decision_tree', view=False)
    filename = 'full_decision_tree_model.sav'
    save_model(tree_clf, DECISION_TREE_PATH, filename)

FullDecisionTreeModel(x_train, y_train)


Training Time: 0.10s


### Full Feature Set with Logistic Regression
We chose Newton-Cholesky for our Logistic Regression algorithm since it works well when the dataset has more samples than features and when using binary classification.

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
LOGREG_PATH = os.path.join(MODELS_PATH, 'log_regression')

def FullLogisticRegression(x_train, y_train):
    scaler = StandardScaler()
    scaled_x = x_train.copy()
    scaler.fit(scaled_x)
    start_time = time.time()
    y_train = np.ravel(y_train)
    clf = LogisticRegression(solver='newton-cholesky', random_state=1).fit(scaled_x, y_train)
    clf.predict(scaled_x)
    clf.predict_proba(scaled_x)
    clf.score(scaled_x, y_train)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    filename = 'full_logregression_model.sav'
    save_model(clf, LOGREG_PATH, filename)

FullLogisticRegression(x_train, y_train)

Training Time: 0.09s


### Full Feature Set with KNN

In [45]:
# KNN algorthm

from sklearn.neighbors import KNeighborsClassifier

KNN_PATH = os.path.join(MODELS_PATH, 'knn')

# Building a model using KNeighborsClassifier 
def FullKNN(x_train, y_train):
    start_time = time.time()
    y_train = np.ravel(y_train)
    knn = KNeighborsClassifier(n_neighbors = 5).fit(x_train, y_train)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    filename = 'full_knn_model.sav'
    save_model(knn, KNN_PATH, filename)

FullKNN(x_train, y_train)

Training Time: 0.00s


### Full Feature Set with Gaussian Naive Bayes

In [46]:
from sklearn.naive_bayes import GaussianNB
GAUSSIANNB_PATH = os.path.join(MODELS_PATH, 'gaussianNB')

def GaussianNBModel (x_train,y_train):
    gnb = GaussianNB()
    start_time = time.time()
    y_train = np.ravel(y_train)
    gnb.fit(x_train, y_train)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    filename = 'full_gaussianNB_model.sav'
    save_model(gnb, GAUSSIANNB_PATH, filename)

GaussianNBModel(x_train, y_train)

Training Time: 0.02s


### Full Feature Set with Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier
RANDOMFOREST_PATH = os.path.join(MODELS_PATH, 'randomforest')

def FullRandomForestModel(x_train, y_train):
    clf = RandomForestClassifier(n_estimators=100, random_state=40)
    start_time = time.time()
    y_train = np.ravel(y_train)
    clf.fit(x_train, y_train)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    filename = 'full_randomforest_model.sav'
    save_model(clf, RANDOMFOREST_PATH, filename)

FullRandomForestModel(x_train, y_train)
    

Training Time: 7.81s


### Reduced Feature Set With Decision Tree

In [48]:

def ReducedDecisionTreeModel(x_train, y_train):
    tree_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
    start_time = time.time()
    tree_clf.fit(x_train, y_train)
    graph = Source(export_graphviz(tree_clf, out_file=None, feature_names=x_train.columns, class_names=str(y_train), rounded=True, filled=True))
    graph.format = 'png'
    graph.render( IMAGES_PATH + '/decision_tree/reduced_decision_tree', view=False)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    filename = 'reduced_decision_tree_model.sav'
    save_model(tree_clf, DECISION_TREE_PATH, filename)

ReducedDecisionTreeModel(red_x_train, red_y_train)

Training Time: 0.37s


### Reduced Feature Set With Logistic Regression

In [49]:
def ReducedLogisticRegression(x_train, y_train):
    scaler = StandardScaler()
    scaled_x = x_train.copy()
    scaler.fit(scaled_x)
    start_time = time.time()
    y_train = np.ravel(y_train)
    clf = LogisticRegression(solver='newton-cholesky', random_state=1).fit(scaled_x, y_train)
    clf.predict(scaled_x)
    clf.predict_proba(scaled_x)
    clf.score(scaled_x, y_train)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    filename = 'reduced_logregression_model.sav'
    save_model(clf, LOGREG_PATH, filename)

ReducedLogisticRegression(red_x_train, red_y_train)

Training Time: 0.06s


### Reduced Feature Set With KNN

In [50]:
# Building a model using KNeighborsClassifier 
def ReducedKNN(x_train, y_train):
    start_time = time.time()
    y_train = np.ravel(y_train)
    knn = KNeighborsClassifier(n_neighbors = 5).fit(x_train, y_train)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    filename = 'reduced_knn_model.sav'
    save_model(knn, KNN_PATH, filename)

ReducedKNN(red_x_train, red_y_train)

Training Time: 0.18s


### Reduced Feature Set With Gaussian Naive Bayes

In [51]:

def ReducedGaussianNBModel (x_train,y_train):
    gnb = GaussianNB()
    start_time = time.time()
    y_train = np.ravel(y_train)
    gnb.fit(x_train, y_train)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    filename = 'reduced_gaussianNB_model.sav'
    save_model(gnb, GAUSSIANNB_PATH, filename)

ReducedGaussianNBModel(red_x_train, red_y_train)

Training Time: 0.01s


### Reduced Feature Set with Random Forest

In [52]:

def ReducedRandomForestModel(x_train, y_train):
    clf = RandomForestClassifier(n_estimators=100, random_state=40)
    start_time = time.time()
    y_train = np.ravel(y_train)
    clf.fit(x_train, y_train)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    filename = 'reduced_randomforest_model.sav'
    save_model(clf, RANDOMFOREST_PATH, filename)

ReducedRandomForestModel(red_x_train, red_y_train)
    

Training Time: 4.85s


## Load Models

In [53]:
full_decisiontree_model = pickle.load(open(os.path.join(DECISION_TREE_PATH, "full_decision_tree_model.sav"), 'rb'))
reduced_decisiontree_model = pickle.load(open(os.path.join(DECISION_TREE_PATH, "reduced_decision_tree_model.sav"), 'rb'))
full_gaussianNB_model = pickle.load(open(os.path.join(GAUSSIANNB_PATH, "full_gaussianNB_model.sav"), 'rb'))
reduced_gaussianNB_model = pickle.load(open(os.path.join(GAUSSIANNB_PATH, "reduced_gaussianNB_model.sav"), 'rb'))
full_knn_model = pickle.load(open(os.path.join(KNN_PATH, "full_knn_model.sav"), 'rb'))
reduced_knn_model = pickle.load(open(os.path.join(KNN_PATH, "reduced_knn_model.sav"), 'rb'))
full_logregression_model = pickle.load(open(os.path.join(LOGREG_PATH, "full_logregression_model.sav"), 'rb'))
reduced_logregression_model = pickle.load(open(os.path.join(LOGREG_PATH, "reduced_logregression_model.sav"), 'rb'))
reduced_randomforest_model = pickle.load(open(os.path.join(RANDOMFOREST_PATH, "reduced_randomforest_model.sav"), 'rb'))
full_randomforest_model = pickle.load(open(os.path.join(RANDOMFOREST_PATH, "full_randomforest_model.sav"), 'rb'))