# Install Dependent Libraries (Databricks)

Note: If you are running this notebook in Databricks, you will need the libraries listed below. If they are not already installed on your Databricks cluster, uncomment and run the following cell to install them in the notebook before importing the dependencies.

Libraries needed:
- koalas
- mlflow
- tensorflow
- imblearn

In [1]:
# dbutils.library.installPyPI("koalas")
# dbutils.library.installPyPI("mlflow")
# dbutils.library.installPyPI("tensorflow")
# dbutils.library.installPyPI("imblearn")
# dbutils.library.restartPython()

# Import Dependencies

In [2]:
# import databricks.koalas as ks
import pandas as pd

import numpy as np
import gzip

In [3]:
import mlflow.sklearn

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.preprocessing import StandardScaler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

In [7]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, utils

# Create a Keras model that's compatible with scikit-learn
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Read the CSV and Perform Basic Data Cleaning

In [10]:
# Read the cleaned cause-of-death records from the gzip-compressed CSV.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("../data/cod_clean.csv.gz", compression="gzip")

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
# Remove the "icd_code_10" column, then preview the first rows
df = df.drop("icd_code_10", axis="columns")
df.head()

Unnamed: 0,Education Level,Month of Death,Sex/Gender,Age Groups,Marital Status,Year,ICD Code,Race,Cause of Death
0,8th grade or less,June,M,85 years and over,Married,2005,I251,White,Diseases of the circulatory system
1,"9 - 12th grade, no diploma",January,F,45 - 54 years,Married,2005,J449,White,Diseases of the respiratory system
2,high school graduate or GED completed,January,F,65 - 74 years,Widowed,2005,C349,White,Neoplasms
3,high school graduate or GED completed,January,M,55 - 64 years,Married,2005,X72,White,
4,high school graduate or GED completed,January,M,75 - 84 years,Married,2005,I64,White,Diseases of the circulatory system


In [12]:
# First discard columns that hold no data at all, then discard every row
# that still has at least one missing value
df = df.dropna(axis="columns", how="all").dropna()
df.head()

Unnamed: 0,Education Level,Month of Death,Sex/Gender,Age Groups,Marital Status,Year,ICD Code,Race,Cause of Death
0,8th grade or less,June,M,85 years and over,Married,2005,I251,White,Diseases of the circulatory system
1,"9 - 12th grade, no diploma",January,F,45 - 54 years,Married,2005,J449,White,Diseases of the respiratory system
2,high school graduate or GED completed,January,F,65 - 74 years,Widowed,2005,C349,White,Neoplasms
4,high school graduate or GED completed,January,M,75 - 84 years,Married,2005,I64,White,Diseases of the circulatory system
5,high school graduate or GED completed,January,F,65 - 74 years,Widowed,2005,I269,White,Diseases of the circulatory system


# Select your Features and Labels

In [13]:
# The label ("Cause of Death") comes first, followed by the candidate features
feature_columns = [
    "Cause of Death",
    "Year",
    "Month of Death",
    "Sex/Gender",
    "Marital Status",
    "Age Groups",
    "Education Level",
    "Race",
]
selected_features = df[feature_columns]
selected_features.head()

Unnamed: 0,Cause of Death,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
0,Diseases of the circulatory system,2005,June,M,Married,85 years and over,8th grade or less,White
1,Diseases of the respiratory system,2005,January,F,Married,45 - 54 years,"9 - 12th grade, no diploma",White
2,Neoplasms,2005,January,F,Widowed,65 - 74 years,high school graduate or GED completed,White
4,Diseases of the circulatory system,2005,January,M,Married,75 - 84 years,high school graduate or GED completed,White
5,Diseases of the circulatory system,2005,January,F,Widowed,65 - 74 years,high school graduate or GED completed,White


In [14]:
# Keep only the rows whose cause of death is one of the five categories studied
causes_of_interest = [
    "Diseases of the respiratory system",
    "External causes of morbidity and mortality",
    "Diseases of the nervous system",
    "Endocrine, nutritional and metabolic diseases",
    "Mental and behavioural disorders",
]
is_cause_of_interest = selected_features["Cause of Death"].isin(causes_of_interest)
selected_features = selected_features.loc[is_cause_of_interest]

# Renumber the rows 0..n-1 so the index has no gaps left by the filter
selected_features = selected_features.reset_index(drop=True)

In [15]:
# Preview the filtered rows (the index was reset in the previous cell)
selected_features.head()

Unnamed: 0,Cause of Death,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
0,Diseases of the respiratory system,2005,January,F,Married,45 - 54 years,"9 - 12th grade, no diploma",White
1,Diseases of the respiratory system,2005,January,F,Widowed,85 years and over,"some college credit, but no degree",White
2,Diseases of the respiratory system,2005,January,F,Widowed,75 - 84 years,high school graduate or GED completed,White
3,Mental and behavioural disorders,2005,January,F,Widowed,75 - 84 years,high school graduate or GED completed,White
4,Diseases of the nervous system,2005,January,M,Widowed,85 years and over,high school graduate or GED completed,White


# Change Labels to "Binary" Categories

In [16]:
# Collapse every non-respiratory category into a single negative class,
# leaving "Diseases of the respiratory system" as the positive class
non_respiratory_causes = [
    "External causes of morbidity and mortality",
    "Diseases of the nervous system",
    "Endocrine, nutritional and metabolic diseases",
    "Mental and behavioural disorders",
]
negative_label = "Not Diseases of the respiratory system"
selected_features = selected_features.replace(
    {cause: negative_label for cause in non_respiratory_causes}
)

In [17]:
# Show the relabelled frame (pandas truncates the display to the first and last rows)
selected_features

Unnamed: 0,Cause of Death,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
0,Diseases of the respiratory system,2005,January,F,Married,45 - 54 years,"9 - 12th grade, no diploma",White
1,Diseases of the respiratory system,2005,January,F,Widowed,85 years and over,"some college credit, but no degree",White
2,Diseases of the respiratory system,2005,January,F,Widowed,75 - 84 years,high school graduate or GED completed,White
3,Not Diseases of the respiratory system,2005,January,F,Widowed,75 - 84 years,high school graduate or GED completed,White
4,Not Diseases of the respiratory system,2005,January,M,Widowed,85 years and over,high school graduate or GED completed,White
...,...,...,...,...,...,...,...,...
4384068,Not Diseases of the respiratory system,2015,December,F,"Never married, single",45 - 54 years,high school graduate or GED completed,Black
4384069,Not Diseases of the respiratory system,2015,December,F,Married,55 - 64 years,8th grade or less,Asian or Pacific Islander
4384070,Not Diseases of the respiratory system,2015,December,M,"Never married, single",Under 1 year (includes not stated infant ages),8th grade or less,Black
4384071,Not Diseases of the respiratory system,2015,December,M,"Never married, single",65 - 74 years,"9 - 12th grade, no diploma",Black


# Preview Data Distribution by Features and Classes

In [18]:
# Class balance: count() reports non-null values per column for each label.
# Null rows were dropped earlier, so every column shows the same count;
# "Year" is used as the sort key, largest class first.
cod = selected_features.groupby("Cause of Death").count()
cod.sort_values(by="Year", ascending=False)

Unnamed: 0_level_0,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
Cause of Death,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Not Diseases of the respiratory system,2751705,2751705,2751705,2751705,2751705,2751705,2751705
Diseases of the respiratory system,1632368,1632368,1632368,1632368,1632368,1632368,1632368


In [19]:
# Record counts per age group, largest group first ("Year" is the sort key)
age = selected_features.groupby("Age Groups").count()
age.sort_values(by="Year", ascending=False)

Unnamed: 0_level_0,Cause of Death,Year,Month of Death,Sex/Gender,Marital Status,Education Level,Race
Age Groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
85 years and over,1706088,1706088,1706088,1706088,1706088,1706088,1706088
75 - 84 years,1199076,1199076,1199076,1199076,1199076,1199076,1199076
65 - 74 years,633425,633425,633425,633425,633425,633425,633425
55 - 64 years,379306,379306,379306,379306,379306,379306,379306
45 - 54 years,203452,203452,203452,203452,203452,203452,203452
35 - 44 years,93971,93971,93971,93971,93971,93971,93971
25 - 34 years,71219,71219,71219,71219,71219,71219,71219
15 - 24 years,69447,69447,69447,69447,69447,69447,69447
5 - 14 years,13679,13679,13679,13679,13679,13679,13679
Under 1 year (includes not stated infant ages),7223,7223,7223,7223,7223,7223,7223


In [20]:
# Record counts per race, largest group first ("Year" is the sort key)
race = selected_features.groupby("Race").count()
race.sort_values(by="Year", ascending=False)

Unnamed: 0_level_0,Cause of Death,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level
Race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
White,3878168,3878168,3878168,3878168,3878168,3878168,3878168
Black,385827,385827,385827,385827,385827,385827,385827
Asian or Pacific Islander,88545,88545,88545,88545,88545,88545,88545
American Indian,31533,31533,31533,31533,31533,31533,31533


# Send Dataframe information to CSV

In [21]:
# selected_features.iloc[0:5000].head()

In [22]:
# month = selected_features["Month of Death"].unique()
# sex = selected_features["Sex/Gender"].unique()
# marital = selected_features["Marital Status"].unique()
# age_groups = selected_features["Age Groups"].unique()
# education = selected_features["Education Level"].unique()
# race = selected_features["Race"].unique()

# features_list = [month, sex, marital, age_groups, education, race]

In [23]:
# month[0]
# january = selected_features.loc[selected_features["Month of Death"] == month[0]]
# new_df = pd.DataFrame(january.iloc[0])
# new_df = new_df.T #.append(january.iloc[1])
# new_df = new_df.append(january.iloc[1])
# new_df

In [24]:
# features_for_CSV = pd.DataFrame(selected_features.iloc[0]).T
# selected_features[column_list[0]]
# selected_features["Month of Death"][0]
# features_list[0][0]
# selection = selected_features.loc[selected_features[column_list[0]][0]]

# for i in column_list:
#     selection = selected_features.loc[selected_features[column_list[0]] == selected_features[column_list[i]][features_list[0][0]]]
#     new_df = pd.DataFrame(selection.iloc[0])
#     features_for_CSV.append(new_df)
# features_for_CSV

# Apply One-Hot Encoding

In [25]:
# Encoder instances used by the cells below: label_encoder for the
# "Cause of Death" target, onehotencoder for the categorical feature columns
label_encoder = LabelEncoder()
onehotencoder = OneHotEncoder()

### Encode X data (features)

In [26]:
# Categorical feature columns to expand into one-hot indicator columns.
# "Year" is not listed, so it stays as a single numeric column.
column_list = ["Month of Death", "Age Groups", "Education Level", "Sex/Gender", "Marital Status", "Race"]

In [27]:
for column in column_list:
    # Fit the encoder on this column alone and materialise a dense 0/1 matrix
    column_values = selected_features[column].values.reshape(-1, 1)
    encoded = onehotencoder.fit_transform(column_values).toarray()

    # One indicator column per category, named "<column><category position>"
    indicator_names = [f"{column}{position}" for position in range(encoded.shape[1])]
    dfOneHot = pd.DataFrame(encoded, columns=indicator_names)

    # Attach the indicators by row index, then discard the original text column
    selected_features = (
        selected_features
        .merge(dfOneHot, how="right", left_index=True, right_index=True)
        .drop(columns=[column])
    )

selected_features.head()

Unnamed: 0,Cause of Death,Year,Month of Death0,Month of Death1,Month of Death2,Month of Death3,Month of Death4,Month of Death5,Month of Death6,Month of Death7,...,Sex/Gender1,Marital Status0,Marital Status1,Marital Status2,Marital Status3,Marital Status4,Race0,Race1,Race2,Race3
0,Diseases of the respiratory system,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Diseases of the respiratory system,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,Diseases of the respiratory system,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,Not Diseases of the respiratory system,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,Not Diseases of the respiratory system,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Encode Y data (labels/categories)

In [28]:
# Replace the text labels with integer class codes. In the preview below,
# "Diseases of the respiratory system" becomes 0 and the "Not ..." class becomes 1.
selected_features["Cause of Death"] = label_encoder.fit_transform(selected_features["Cause of Death"])
selected_features.head()

Unnamed: 0,Cause of Death,Year,Month of Death0,Month of Death1,Month of Death2,Month of Death3,Month of Death4,Month of Death5,Month of Death6,Month of Death7,...,Sex/Gender1,Marital Status0,Marital Status1,Marital Status2,Marital Status3,Marital Status4,Race0,Race1,Race2,Race3
0,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Select Data Values

In [29]:
# Convert the frame to a NumPy array. Column 0 is the encoded label
# ("Cause of Death"); the remaining columns (Year plus the one-hot
# indicators) are the features.
data = selected_features.values
X = data[:, 1:]
y = data[:, 0]


In [30]:
# Sanity check: (rows, feature columns). The second value must match the
# input width the classifier is built with.
X.shape

(4384073, 44)

# Make a Deep Learning Classifier

In [31]:
def build_DL_classifier(number_inputs=44, hidden_layers=(60, 30, 30, 60), number_classes=2):
    """Build and compile a fully connected Keras classifier.

    Parameters
    ----------
    number_inputs : int, default 44
        Number of feature columns fed to the first layer.
    hidden_layers : sequence of int, default (60, 30, 30, 60)
        Unit count of each ReLU hidden layer, in order.
    number_classes : int, default 2
        Unit count of the softmax output layer.

    Returns
    -------
    A compiled ``Sequential`` model using the Adam optimizer,
    categorical cross-entropy loss and the accuracy metric.
    """
    classifier = models.Sequential()

    # Hidden layers use ReLU; the final layer is a softmax over the classes
    layer_specs = [(units, 'relu') for units in hidden_layers]
    layer_specs.append((number_classes, 'softmax'))

    for position, (units, activation) in enumerate(layer_specs):
        # Only the first layer needs to be told the width of the input
        input_kwargs = {'input_dim': number_inputs} if position == 0 else {}
        classifier.add(layers.Dense(units=units, activation=activation, **input_kwargs))

    classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return classifier

# Wrap the builder so the network can be used as a scikit-learn estimator.
# number_inputs is taken from the feature matrix instead of a hard-coded 44,
# so the network stays in sync if the encoded column count changes.
# Early stopping watches the training loss. NOTE(review): the training log
# reports accuracy under the key "acc", which is presumably why an earlier
# monitor='accuracy' variant was abandoned - confirm before switching back.
keras_DL_classifier = KerasClassifier(
    build_DL_classifier,
    number_inputs=X.shape[1],
    epochs=150,
    shuffle=True,
    verbose=2,
    callbacks=[EarlyStopping(monitor='loss', patience=20, verbose=2)],
)

# Create a Train Test Split

In [32]:
# Hold out 25% of the rows for testing (scikit-learn's default proportion);
# the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Preprocess the Data and Adjust Imbalanced Data

In [33]:
# Pipeline: standardise the features, balance the classes by randomly
# under-sampling, then train the Keras classifier. The sampler is seeded so
# the rows it keeps are the same on every run, in line with the seeded
# train/test split.
# NOTE(review): network weight initialisation is not seeded here, so the
# trained scores can still vary slightly between runs.
deep_model = make_pipeline_imb(
    StandardScaler(),
    RandomUnderSampler(random_state=42),
    keras_DL_classifier,
)

In [34]:
# Fit the whole pipeline on the training split. The training log reports
# fewer samples than X_train has rows because the under-sampler runs
# before the network is trained.
deep_model.fit(X_train, y_train)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


  tensor_proto.tensor_content = nparray.tostring()
  if not isinstance(wrapped_dict, collections.Mapping):
  if not isinstance(values, collections.Sequence):
  tensor_proto.tensor_content = nparray.tostring()


Train on 2449068 samples
Epoch 1/150
2449068/2449068 - 61s - loss: 0.6599 - acc: 0.6001
Epoch 2/150
2449068/2449068 - 62s - loss: 0.6588 - acc: 0.6013
Epoch 3/150
2449068/2449068 - 61s - loss: 0.6585 - acc: 0.6016
Epoch 4/150
2449068/2449068 - 60s - loss: 0.6584 - acc: 0.6018
Epoch 5/150
2449068/2449068 - 59s - loss: 0.6583 - acc: 0.6017
Epoch 6/150
2449068/2449068 - 60s - loss: 0.6582 - acc: 0.6019
Epoch 7/150
2449068/2449068 - 60s - loss: 0.6582 - acc: 0.6017
Epoch 8/150
2449068/2449068 - 60s - loss: 0.6582 - acc: 0.6021
Epoch 9/150
2449068/2449068 - 59s - loss: 0.6581 - acc: 0.6022
Epoch 10/150
2449068/2449068 - 60s - loss: 0.6581 - acc: 0.6021
Epoch 11/150
2449068/2449068 - 59s - loss: 0.6581 - acc: 0.6024
Epoch 12/150
2449068/2449068 - 60s - loss: 0.6580 - acc: 0.6024
Epoch 13/150
2449068/2449068 - 59s - loss: 0.6580 - acc: 0.6025
Epoch 14/150
2449068/2449068 - 60s - loss: 0.6580 - acc: 0.6024
Epoch 15/150
2449068/2449068 - 60s - loss: 0.6579 - acc: 0.6025
Epoch 16/150
2449068/244

Epoch 129/150
2449068/2449068 - 59s - loss: 0.6575 - acc: 0.6032
Epoch 130/150
2449068/2449068 - 59s - loss: 0.6574 - acc: 0.6032
Epoch 131/150
2449068/2449068 - 59s - loss: 0.6575 - acc: 0.6033
Epoch 132/150
2449068/2449068 - 60s - loss: 0.6574 - acc: 0.6035
Epoch 133/150
2449068/2449068 - 60s - loss: 0.6575 - acc: 0.6032
Epoch 134/150
2449068/2449068 - 59s - loss: 0.6575 - acc: 0.6031
Epoch 135/150
2449068/2449068 - 59s - loss: 0.6576 - acc: 0.6033
Epoch 136/150
2449068/2449068 - 60s - loss: 0.6574 - acc: 0.6033
Epoch 137/150
2449068/2449068 - 60s - loss: 0.6574 - acc: 0.6034
Epoch 138/150
2449068/2449068 - 60s - loss: 0.6575 - acc: 0.6033
Epoch 139/150
2449068/2449068 - 60s - loss: 0.6576 - acc: 0.6032
Epoch 140/150
2449068/2449068 - 59s - loss: 0.6575 - acc: 0.6033
Epoch 141/150
2449068/2449068 - 59s - loss: 0.6575 - acc: 0.6032
Epoch 142/150
2449068/2449068 - 59s - loss: 0.6575 - acc: 0.6032
Epoch 143/150
2449068/2449068 - 59s - loss: 0.6574 - acc: 0.6032
Epoch 144/150
2449068/244

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomundersampler', RandomUnderSampler()),
                ('kerasclassifier',
                 <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x00000124FE252F28>)])

In [35]:
# Evaluate and report each split in turn (score() returns accuracy here)
train_score = deep_model.score(X_train, y_train)
print(f"DL Training Data Score: {train_score}")

test_score = deep_model.score(X_test, y_test)
print(f"DL Testing Data Score: {test_score}")

3288054/3288054 - 53s - loss: 0.6595 - acc: 0.5897
DL Training Data Score: 0.5897381901741028
1096019/1096019 - 19s - loss: 0.6606 - acc: 0.5878
DL Testing Data Score: 0.5878419876098633


# Quantify the Models

In [36]:
# Store the test-set accuracy for reporting. This re-runs the same
# evaluation as the "DL Testing Data Score" printed in the previous cell.
deep_model_accuracy = deep_model.score(X_test, y_test)

1096019/1096019 - 18s - loss: 0.6606 - acc: 0.5878


In [37]:
# Report the stored test-set accuracy
print(f"DL Accuracy: {deep_model_accuracy}")

DL Accuracy: 0.5878419876098633


# Save the Model