### Use the preprocessed data for ML model building. 

In [23]:
# Setup
import numpy as np 
import pandas as pd 
import os
import pickle

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images/ml_modeling_images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

def save_model(model, folder_path="models", file_name="untitled_model.sav"):
    pickle.dump(model, open(os.path.join(folder_path, file_name), 'wb'))

### Read in data

#### Full feature set

In [24]:
HEARTDISEASE_FULL_PATH = "dataset/afterpreprocessing/full"

heartdisease_data = pd.read_csv(os.path.join(HEARTDISEASE_FULL_PATH, f'heartDisease.csv'))
heartdisease_data.head()

Unnamed: 0,heartDisease,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity,genHealth,sleepTime,asthma,kidneyDisease,skinCancer
0,2,1660,1,1,2,3,30,2,2,8,1,1,1,2,5,1,2,1
1,2,2034,2,1,1,88,88,2,2,13,1,3,1,2,7,2,2,2
2,2,2658,1,1,2,20,30,2,1,10,1,1,1,4,8,1,2,2
3,2,2421,2,1,2,88,88,2,2,12,1,3,2,3,6,2,2,1
4,2,2657,1,9,2,15,10,2,2,10,2,1,1,4,6,2,2,2


In [25]:
x_train = pd.read_csv(os.path.join(HEARTDISEASE_FULL_PATH, f'heartDisease_x_train.csv'))
x_train.head()

Unnamed: 0,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity,genHealth,sleepTime,asthma,kidneyDisease,skinCancer
0,3515,1,1,2,2,88,2,1,11,1,3,1,3,7,2,1,2
1,3712,2,1,2,88,1,2,1,2,2,3,1,3,8,2,2,2
2,3193,1,1,2,88,5,2,1,6,1,3,1,3,7,2,2,2
3,1958,2,1,2,3,10,2,1,1,5,3,1,2,9,2,2,2
4,3328,2,1,2,1,14,2,2,2,1,3,1,2,8,2,2,2


In [26]:
y_train = pd.read_csv(os.path.join(HEARTDISEASE_FULL_PATH, f'heartDisease_y_train.csv'))
y_train.head()

Unnamed: 0,heartDisease
0,2
1,2
2,2
3,2
4,2


#### Reduced feature set

In [27]:
HEARTDISEASE_RED_PATH = "dataset/afterpreprocessing/reduced"

reduced_heartdisease_data = pd.read_csv(os.path.join(HEARTDISEASE_RED_PATH, f'heartDisease_red.csv'))
heartdisease_data.head()

Unnamed: 0,heartDisease,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity,genHealth,sleepTime,asthma,kidneyDisease,skinCancer
0,2,1660,1,1,2,3,30,2,2,8,1,1,1,2,5,1,2,1
1,2,2034,2,1,1,88,88,2,2,13,1,3,1,2,7,2,2,2
2,2,2658,1,1,2,20,30,2,1,10,1,1,1,4,8,1,2,2
3,2,2421,2,1,2,88,88,2,2,12,1,3,2,3,6,2,2,1
4,2,2657,1,9,2,15,10,2,2,10,2,1,1,4,6,2,2,2


In [28]:
red_x_train = pd.read_csv(os.path.join(HEARTDISEASE_RED_PATH, f'heartDisease_red_x_train.csv'))
red_x_train.head()

Unnamed: 0,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity
0,3515,1,1,2,2,88,2,1,11,1,3,1
1,3712,2,1,2,88,1,2,1,2,2,3,1
2,3193,1,1,2,88,5,2,1,6,1,3,1
3,1958,2,1,2,3,10,2,1,1,5,3,1
4,3328,2,1,2,1,14,2,2,2,1,3,1


In [29]:
red_y_train = pd.read_csv(os.path.join(HEARTDISEASE_RED_PATH, f'heartDisease_red_y_train.csv'))
red_y_train.head()

Unnamed: 0,heartDisease
0,2
1,2
2,2
3,2
4,2


### Full Feature Set With Decision Tree

In [30]:
# Training and Visualizing a Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from graphviz import Source

MODELS_PATH = os.path.join(PROJECT_ROOT_DIR, "models")
DECISION_TREE_PATH = os.path.join(MODELS_PATH, 'decision_tree')

def FullDecisionTreeModel(x_train, y_train):
    tree_clf = DecisionTreeClassifier(max_depth=5,random_state=42)
    tree_clf.fit(x_train, y_train)
    graph = Source(export_graphviz(tree_clf, out_file=None, feature_names=x_train.columns, class_names=str(y_train), rounded=True, filled=True))
    graph.format = 'png'
    graph.render( IMAGES_PATH + '/decision_tree/decision_tree', view=False)
    filename = 'full_decision_tree_model.sav'
    save_model(tree_clf, DECISION_TREE_PATH, filename)

FullDecisionTreeModel(x_train, y_train)


### Full Feature Set with KNN

In [31]:
# KNN algorthm

from sklearn.neighbors import KNeighborsClassifier

KNN_PATH = os.path.join(MODELS_PATH, 'knn')

# Building a model using KNeighborsClassifier 
def FullKNNModel(x_train, y_train):
    knn = KNeighborsClassifier(n_neighbors = 5).fit(x_train, y_train)
    filename = 'full_knn_model.sav'
    save_model(knn, KNN_PATH, filename)

FullKNNModel(x_train, y_train)

  return self._fit(X, y)


### Full Feature Set with Logistic Regression

We chose Newton-Cholesky for our Logistic Regression algorithm since it works well when the dataset has more samples than features and when using binary classification.

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

LOGREG_PATH = os.path.join(MODELS_PATH, 'log_regression')

def FullLogisticRegression(x_train, y_train):
    scaler = StandardScaler()
    scaled_x = x_train.copy()
    scaler.fit(scaled_x)
    clf = LogisticRegression(solver='newton-cholesky', random_state=1).fit(scaled_x, y_train)
    clf.predict(scaled_x)
    clf.predict_proba(scaled_x)
    clf.score(scaled_x, y_train)
    filename = 'full_logregression_model.sav'
    save_model(clf, LOGREG_PATH, filename)

FullLogisticRegression(x_train, y_train)

  y = column_or_1d(y, warn=True)


### Reduced Feature Set With Decision Tree

In [33]:
def ReducedDecisionTreeModel(x_train, y_train):
    tree_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
    tree_clf.fit(x_train, y_train)
    graph = Source(export_graphviz(tree_clf, out_file=None, feature_names=input.columns, class_names=str(y_train), rounded=True, filled=True))
    graph.format = 'png'
    graph.render( IMAGES_PATH + '/decision_tree/reduced_decision_tree', view=False)
    filename = 'reduced_decision_tree_model.sav'
    save_model(tree_clf, DECISION_TREE_PATH, filename)

ReducedDecisionTreeModel(x_train, y_train)

### Reduced Feature Set With K-Means

In [42]:
from sklearn.cluster import KMeans

KMEANS_PATH = os.path.join(MODELS_PATH, 'kmeans')

heartdisease_train = pd.concat([x_train, y_train], axis=1)

kmeans_clf = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(heartdisease_train)

save_model(kmeans_clf, KMEANS_PATH, 'reduced_kmeans_model.sav')


### Reduced Feature Set with Random Forest

In [45]:
from sklearn.ensemble import RandomForestClassifier
import time

RANDOMFOREST_PATH = os.path.join(MODELS_PATH, 'randomforest')

def ReducedRandomForestModel(x_train, y_train):
    clf = RandomForestClassifier(n_estimators=100, random_state=40)
    start_time = time.time()
    y_train = np.ravel(y_train)
    clf.fit(x_train, y_train)
    end_time = time.time()
    print("Training Time: {:.2f}s".format(end_time-start_time))
    filename = 'reduced_randomforest_model.sav'
    save_model(clf, RANDOMFOREST_PATH, filename)

ReducedRandomForestModel(red_x_train, red_y_train)
    

Training Time: 21.78s


## Load Models

In [46]:
full_decisiontree_model = pickle.load(open(os.path.join(DECISION_TREE_PATH, "full_decision_tree_model.sav"), 'rb'))
reduced_decisiontree_model = pickle.load(open(os.path.join(DECISION_TREE_PATH, "reduced_decision_tree_model.sav"), 'rb'))
reduced_kmeans_model = pickle.load(open(os.path.join(KMEANS_PATH, "reduced_kmeans_model.sav"), 'rb'))
full_knn_model = pickle.load(open(os.path.join(KNN_PATH, "full_knn_model.sav"), 'rb'))
full_logregression_model = pickle.load(open(os.path.join(LOGREG_PATH, "full_logregression_model.sav"), 'rb'))
reduced_randomforest_model = pickle.load(open(os.path.join(RANDOMFOREST_PATH, "reduced_randomforest_model.sav"), 'rb'))