# Install Dependent Libraries (Databricks)

Note: If running this notebook in Databricks, you will need the following libraries. If these libraries are not installed on your Databricks Cluster, you can simply uncomment and run the following cell to install those libraries in the notebook before you import the dependencies.

Libraries needed:
- koalas
- mlflow
- tensorflow
- imblearn

In [1]:
# dbutils.library.installPyPI("koalas")
# dbutils.library.installPyPI("mlflow")
# dbutils.library.installPyPI("tensorflow")
# dbutils.library.installPyPI("imblearn")
# dbutils.library.restartPython()

# Import Dependencies

In [2]:
# import databricks.koalas as ks
import pandas as pd

import numpy as np
import gzip

In [3]:
import mlflow.sklearn

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.preprocessing import StandardScaler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

In [7]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, utils

# Create a Keras model that's compatible with scikit-learn
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Connecting to the AWS S3 Mount

# Read the CSV and Perform Basic Data Cleaning

In [9]:
# # Read CSVs
# df = pd.read_csv("/dbfs/mnt/%s/Project 3 Stuff/cod_clean.csv.gz" % MOUNT_NAME, compression="gzip")

In [10]:
# Read CSVs
df = pd.read_csv("../data/cod_clean.csv.gz", compression="gzip")

# Select your Features and Labels

In [11]:
# Drop unnecessary column
df = df.drop(columns="ICD Code")
df.head()

Unnamed: 0,Education Level,Month of Death,Sex/Gender,Age Groups,Marital Status,Year,Cause of Death,Race,Cause of Death Category
0,8th grade or less,June,M,85 years and over,Married,2005,All other forms of chronic ischemic heart dise...,White,Diseases of the circulatory system
1,"9 - 12th grade, no diploma",January,F,45 - 54 years,Married,2005,Other chronic obstructive pulmonary disease,White,Diseases of the respiratory system
2,high school graduate or GED completed,January,F,65 - 74 years,Widowed,2005,"Of trachea, bronchus and lung",White,Neoplasms
3,high school graduate or GED completed,January,M,55 - 64 years,Married,2005,Intentional self-harm,White,External causes of morbidity and mortality
4,high school graduate or GED completed,January,M,75 - 84 years,Married,2005,"Stroke, not specified as hemorrhage or infarct...",White,Diseases of the circulatory system


In [12]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,Education Level,Month of Death,Sex/Gender,Age Groups,Marital Status,Year,Cause of Death,Race,Cause of Death Category
0,8th grade or less,June,M,85 years and over,Married,2005,All other forms of chronic ischemic heart dise...,White,Diseases of the circulatory system
1,"9 - 12th grade, no diploma",January,F,45 - 54 years,Married,2005,Other chronic obstructive pulmonary disease,White,Diseases of the respiratory system
2,high school graduate or GED completed,January,F,65 - 74 years,Widowed,2005,"Of trachea, bronchus and lung",White,Neoplasms
3,high school graduate or GED completed,January,M,55 - 64 years,Married,2005,Intentional self-harm,White,External causes of morbidity and mortality
4,high school graduate or GED completed,January,M,75 - 84 years,Married,2005,"Stroke, not specified as hemorrhage or infarct...",White,Diseases of the circulatory system


In [13]:
cleanup_df = df[["Cause of Death", "Cause of Death Category", "Year", "Month of Death", "Sex/Gender",
                        "Marital Status", "Age Groups", "Education Level", "Race"]]

In [14]:
cleanup_df = cleanup_df.loc[(cleanup_df["Cause of Death"] == "Other cerebrovascular diseases and their sequelae ") | 
                            (cleanup_df["Cause of Death"] == "All other diseases of respiratory system ") |
                            (cleanup_df["Cause of Death Category"] == "External causes of morbidity and mortality") |
                            (cleanup_df["Cause of Death"] == "Alzheimer's disease ") |
                            (cleanup_df["Cause of Death"] == "Diabetes mellitus ") |
                            (cleanup_df["Cause of Death"] == "All other symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified ")
                            ]
cleanup_df = cleanup_df.reset_index(drop=True)

In [15]:
cleanup_df = cleanup_df.replace({
    "Endocrine, nutritional and metabolic diseases": "Diabetes mellitus",
    "Diseases of the nervous system": "Alzheimer's Disease",
    "Diseases of the circulatory system": "Cerebrovascular Diseases",
    "Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified": "Other"
})

In [16]:
selected_features = cleanup_df[["Cause of Death Category", "Year", "Month of Death", "Sex/Gender",
                        "Marital Status", "Age Groups", "Education Level", "Race"]]
selected_features.head()

Unnamed: 0,Cause of Death Category,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
0,External causes of morbidity and mortality,2005,January,M,Married,55 - 64 years,high school graduate or GED completed,White
1,Cerebrovascular Diseases,2005,January,F,Widowed,75 - 84 years,8th grade or less,White
2,Alzheimer's Disease,2005,January,M,Widowed,85 years and over,"some college credit, but no degree",White
3,Alzheimer's Disease,2005,January,M,Married,75 - 84 years,"some college credit, but no degree",White
4,Diabetes mellitus,2005,January,M,Married,65 - 74 years,Master’s degree,White


In [17]:
for x in selected_features["Cause of Death Category"].unique():
    print(x)

External causes of morbidity and mortality
Cerebrovascular Diseases
Alzheimer's Disease
Diabetes mellitus
Diseases of the respiratory system
Other


# Select Labels for Test and Control Groups

In [18]:
selected_features = selected_features.loc[(selected_features["Cause of Death Category"] == "Diabetes mellitus") |
                                         (selected_features["Cause of Death Category"] == "Other")]
selected_features = selected_features.reset_index(drop=True)

In [19]:
selected_features

Unnamed: 0,Cause of Death Category,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
0,Diabetes mellitus,2005,January,M,Married,65 - 74 years,Master’s degree,White
1,Diabetes mellitus,2005,January,M,Divorced,45 - 54 years,Associate degree,White
2,Diabetes mellitus,2005,January,F,Widowed,85 years and over,Associate degree,White
3,Diabetes mellitus,2005,January,M,Married,65 - 74 years,Associate degree,White
4,Diabetes mellitus,2005,January,F,Widowed,75 - 84 years,"some college credit, but no degree",White
...,...,...,...,...,...,...,...,...
675674,Diabetes mellitus,2015,December,M,Widowed,75 - 84 years,8th grade or less,White
675675,Diabetes mellitus,2015,December,M,"Never married, single",55 - 64 years,high school graduate or GED completed,White
675676,Diabetes mellitus,2015,December,F,Widowed,55 - 64 years,8th grade or less,Black
675677,Diabetes mellitus,2015,December,M,Married,55 - 64 years,Bachelor’s degree,Black


# Preview Data Distribution by Features and Classes

In [20]:
cod = selected_features.groupby("Cause of Death Category").count()
cod.sort_values(by="Year", ascending=False)

Unnamed: 0_level_0,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
Cause of Death Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Diabetes mellitus,562396,562396,562396,562396,562396,562396,562396
Other,113283,113283,113283,113283,113283,113283,113283


In [21]:
age = selected_features.groupby("Age Groups").count()
age.sort_values(by="Year", ascending=False)

Unnamed: 0_level_0,Cause of Death Category,Year,Month of Death,Sex/Gender,Marital Status,Education Level,Race
Age Groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
85 years and over,185126,185126,185126,185126,185126,185126,185126
75 - 84 years,178279,178279,178279,178279,178279,178279,178279
65 - 74 years,137228,137228,137228,137228,137228,137228,137228
55 - 64 years,100060,100060,100060,100060,100060,100060,100060
45 - 54 years,47949,47949,47949,47949,47949,47949,47949
35 - 44 years,16604,16604,16604,16604,16604,16604,16604
25 - 34 years,6676,6676,6676,6676,6676,6676,6676
15 - 24 years,2573,2573,2573,2573,2573,2573,2573
5 - 14 years,518,518,518,518,518,518,518
Under 1 year (includes not stated infant ages),379,379,379,379,379,379,379


In [22]:
race = selected_features.groupby("Race").count()
race.sort_values(by="Year", ascending=False)

Unnamed: 0_level_0,Cause of Death Category,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level
Race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
White,546060,546060,546060,546060,546060,546060,546060
Black,102629,102629,102629,102629,102629,102629,102629
Asian or Pacific Islander,18885,18885,18885,18885,18885,18885,18885
American Indian,8105,8105,8105,8105,8105,8105,8105


# Apply One-Hot Encoding

In [23]:
label_encoder = LabelEncoder()
onehotencoder = OneHotEncoder()

### Encode X data (features)

In [24]:
column_list = ["Month of Death", "Age Groups", "Education Level", "Sex/Gender", "Marital Status", "Race"]

In [25]:
for column in column_list:
    # Reshape column data; fit to the one-hot-encoder (expands columns)
    X = onehotencoder.fit_transform(selected_features[column].values.reshape(-1,1)).toarray()
    
    # Send the one-hot-encoded information from that column to a new dataframe
    dfOneHot = pd.DataFrame(X, columns = [column+str(int(i)) for i in range(X.shape[1])])
    
    # Merge the one-hot-encoded dataframe to the master dataframe
    selected_features = selected_features.merge(dfOneHot, how="right", right_index=True, left_index=True)
    
    # Drop the column selected (no longer needed)
    selected_features = selected_features.drop([column], axis=1)

selected_features.head()

Unnamed: 0,Cause of Death Category,Year,Month of Death0,Month of Death1,Month of Death2,Month of Death3,Month of Death4,Month of Death5,Month of Death6,Month of Death7,...,Sex/Gender1,Marital Status0,Marital Status1,Marital Status2,Marital Status3,Marital Status4,Race0,Race1,Race2,Race3
0,Diabetes mellitus,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Diabetes mellitus,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Diabetes mellitus,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,Diabetes mellitus,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Diabetes mellitus,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Encode Y data (labels/categories)

In [26]:
selected_features["Cause of Death Category"] = label_encoder.fit_transform(selected_features["Cause of Death Category"])
selected_features.head()

Unnamed: 0,Cause of Death Category,Year,Month of Death0,Month of Death1,Month of Death2,Month of Death3,Month of Death4,Month of Death5,Month of Death6,Month of Death7,...,Sex/Gender1,Marital Status0,Marital Status1,Marital Status2,Marital Status3,Marital Status4,Race0,Race1,Race2,Race3
0,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Select Data Values

In [27]:
data = selected_features.values
X = data[:, 1:]
y = data[:, 0]


In [28]:
X.shape

(675679, 44)

# Make a Deep Learning Classifier

In [29]:
def build_DL_classifier():
    classifier = models.Sequential()
    number_inputs = 44
    first_hidden_layer = 60
    second_hidden_layer = 30
    third_hidden_layer = 24
    fourth_hidden_layer = 12
    number_classes = 2
    
    classifier.add(layers.Dense(units=first_hidden_layer, activation='relu', input_dim=number_inputs))
    classifier.add(layers.Dense(units=second_hidden_layer, activation='relu'))
    classifier.add(layers.Dense(units=third_hidden_layer, activation='relu'))
    classifier.add(layers.Dense(units=fourth_hidden_layer, activation='relu'))
    classifier.add(layers.Dense(units=number_classes, activation='softmax'))
    classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return classifier

# keras_DL_classifier = KerasClassifier(build_DL_classifier, epochs=150, shuffle=True, verbose=2, callbacks=[EarlyStopping(monitor='accuracy', patience=20, verbose=2)])
keras_DL_classifier = KerasClassifier(build_DL_classifier, epochs=150, shuffle=True, verbose=2, callbacks=[EarlyStopping(monitor='loss', patience=20, verbose=2)])

# Create a Train Test Split

In [30]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Preprocess the Data and Adjust Imbalanced Data

In [31]:
deep_model = make_pipeline_imb(StandardScaler(), RandomUnderSampler(), keras_DL_classifier)

In [32]:
deep_model.fit(X_train, y_train)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


  tensor_proto.tensor_content = nparray.tostring()
  if not isinstance(wrapped_dict, collections.Mapping):
  if not isinstance(values, collections.Sequence):
  tensor_proto.tensor_content = nparray.tostring()


Train on 169810 samples
Epoch 1/150
169810/169810 - 5s - loss: 0.6107 - acc: 0.6768
Epoch 2/150
169810/169810 - 5s - loss: 0.6031 - acc: 0.6827
Epoch 3/150
169810/169810 - 5s - loss: 0.6014 - acc: 0.6841
Epoch 4/150
169810/169810 - 5s - loss: 0.6008 - acc: 0.6843
Epoch 5/150
169810/169810 - 5s - loss: 0.6001 - acc: 0.6855
Epoch 6/150
169810/169810 - 5s - loss: 0.5995 - acc: 0.6856
Epoch 7/150
169810/169810 - 5s - loss: 0.5991 - acc: 0.6861
Epoch 8/150
169810/169810 - 5s - loss: 0.5985 - acc: 0.6859
Epoch 9/150
169810/169810 - 5s - loss: 0.5980 - acc: 0.6867
Epoch 10/150
169810/169810 - 5s - loss: 0.5977 - acc: 0.6866
Epoch 11/150
169810/169810 - 5s - loss: 0.5974 - acc: 0.6868
Epoch 12/150
169810/169810 - 5s - loss: 0.5969 - acc: 0.6865
Epoch 13/150
169810/169810 - 5s - loss: 0.5966 - acc: 0.6873
Epoch 14/150
169810/169810 - 5s - loss: 0.5961 - acc: 0.6881
Epoch 15/150
169810/169810 - 5s - loss: 0.5959 - acc: 0.6880
Epoch 16/150
169810/169810 - 5s - loss: 0.5954 - acc: 0.6880
Epoch 17/

Epoch 135/150
169810/169810 - 5s - loss: 0.5802 - acc: 0.6995
Epoch 136/150
169810/169810 - 5s - loss: 0.5802 - acc: 0.6993
Epoch 137/150
169810/169810 - 5s - loss: 0.5800 - acc: 0.6988
Epoch 138/150
169810/169810 - 5s - loss: 0.5801 - acc: 0.6997
Epoch 139/150
169810/169810 - 5s - loss: 0.5802 - acc: 0.6991
Epoch 140/150
169810/169810 - 5s - loss: 0.5800 - acc: 0.6991
Epoch 141/150
169810/169810 - 5s - loss: 0.5800 - acc: 0.6996
Epoch 142/150
169810/169810 - 5s - loss: 0.5799 - acc: 0.6993
Epoch 143/150
169810/169810 - 5s - loss: 0.5798 - acc: 0.6995
Epoch 144/150
169810/169810 - 5s - loss: 0.5801 - acc: 0.6993
Epoch 145/150
169810/169810 - 5s - loss: 0.5795 - acc: 0.6997
Epoch 146/150
169810/169810 - 5s - loss: 0.5796 - acc: 0.6997
Epoch 147/150
169810/169810 - 5s - loss: 0.5799 - acc: 0.6985
Epoch 148/150
169810/169810 - 5s - loss: 0.5797 - acc: 0.6997
Epoch 149/150
169810/169810 - 5s - loss: 0.5795 - acc: 0.6999
Epoch 150/150
169810/169810 - 4s - loss: 0.5795 - acc: 0.6999


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomundersampler', RandomUnderSampler()),
                ('kerasclassifier',
                 <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x0000023A2BB1D630>)])

In [33]:
print(f"DL Training Data Score: {deep_model.score(X_train, y_train)}")
print(f"DL Testing Data Score: {deep_model.score(X_test, y_test)}")

506759/506759 - 9s - loss: 0.6206 - acc: 0.6790
DL Training Data Score: 0.6789993047714233
168920/168920 - 3s - loss: 0.6388 - acc: 0.6672
DL Testing Data Score: 0.667203426361084


# Quantify the Models

In [34]:
deep_model_accuracy = deep_model.score(X_test, y_test)

168920/168920 - 3s - loss: 0.6388 - acc: 0.6672


In [35]:
print(f"DL Accuracy: {deep_model_accuracy}")

DL Accuracy: 0.667203426361084


# Save the Model

In [36]:
from joblib import dump, load
dump(model, "Neural_Net_Model_4_Diabetes.pkl")

  from collections import Container


NameError: name 'model' is not defined