# Install Dependent Libraries (Databricks)

Note: If running this notebook in Databricks, you will need the following libraries. If these libraries are not installed on your Databricks Cluster, you can simply uncomment and run the following cell to install those libraries in the notebook before you import the dependencies.

Libraries needed:
- koalas
- mlflow
- tensorflow
- imblearn

In [None]:
# dbutils.library.installPyPI("koalas")
# dbutils.library.installPyPI("mlflow")
# dbutils.library.installPyPI("tensorflow")
# dbutils.library.installPyPI("imblearn")
# dbutils.library.restartPython()

# Import Dependencies

In [1]:
# import databricks.koalas as ks
import pandas as pd

import numpy as np
import gzip

In [2]:
import mlflow.sklearn
import mlflow.keras

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.preprocessing import StandardScaler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

In [6]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, utils

# Create a Keras model that's compatible with scikit-learn
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

import pickle
import tempfile
from tensorflow.keras.models import Sequential, load_model, save_model, Model
from tensorflow.keras.layers import Dense

# Connect to the AWS S3 Mount and Read CSV (Databricks only)

In [None]:
# ACCESS_KEY = "ENTER_YOUR_KEY_HERE" # dbutils.secrets.get(scope = "aws", key = "aws-access-key")
# SECRET_KEY = "ENTER_YOUR_KEY_HERE" # dbutils.secrets.get(scope = "aws", key = "aws-secret-key")
# ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
# AWS_BUCKET_NAME = "ENTER_YOUR_BUCKET_HERE" #Or the bucket you saved your data to
# MOUNT_NAME = "mnt_s3"
# s3_uri = f"s3a://{ACCESS_KEY}:{ENCODED_SECRET_KEY}@{AWS_BUCKET_NAME}"
# mount_uri = f"/mnt/{MOUNT_NAME}"
# display(dbutils.fs.ls(mount_uri))

In [None]:
# # Read CSVs
# df = pd.read_csv("/dbfs/mnt/%s/Project 3 Stuff/cod_clean.csv.gz" % MOUNT_NAME, compression="gzip")

# Read the CSV (Local Jupyter Notebook only)

In [7]:
# Read CSVs
df = pd.read_csv("../data/cod_clean.csv.gz", compression="gzip")

# Select your Features and Labels

In [8]:
# Drop unnecessary column
df = df.drop(columns="ICD Code")
df.head()

Unnamed: 0,Education Level,Month of Death,Sex/Gender,Age Groups,Marital Status,Day of Week,Year,Cause of Death,Race,Hispanic Origin,Cause of Death Category
0,8th grade or less,June,M,85 years and over,Married,Saturday,2005,All other forms of chronic ischemic heart dise...,White,Mexican,Diseases of the circulatory system
1,"9 - 12th grade, no diploma",January,F,45 - 54 years,Married,Saturday,2005,Other chronic obstructive pulmonary disease,White,Non - Hispanic white,Diseases of the respiratory system
2,high school graduate or GED completed,January,F,65 - 74 years,Widowed,Sunday,2005,"Of trachea, bronchus and lung",White,Non - Hispanic white,Neoplasms
3,high school graduate or GED completed,January,M,55 - 64 years,Married,Monday,2005,Intentional self-harm,White,Non - Hispanic white,External causes of morbidity and mortality
4,high school graduate or GED completed,January,M,75 - 84 years,Married,Sunday,2005,"Stroke, not specified as hemorrhage or infarct...",White,Non - Hispanic white,Diseases of the circulatory system


In [9]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,Education Level,Month of Death,Sex/Gender,Age Groups,Marital Status,Day of Week,Year,Cause of Death,Race,Hispanic Origin,Cause of Death Category
0,8th grade or less,June,M,85 years and over,Married,Saturday,2005,All other forms of chronic ischemic heart dise...,White,Mexican,Diseases of the circulatory system
1,"9 - 12th grade, no diploma",January,F,45 - 54 years,Married,Saturday,2005,Other chronic obstructive pulmonary disease,White,Non - Hispanic white,Diseases of the respiratory system
2,high school graduate or GED completed,January,F,65 - 74 years,Widowed,Sunday,2005,"Of trachea, bronchus and lung",White,Non - Hispanic white,Neoplasms
3,high school graduate or GED completed,January,M,55 - 64 years,Married,Monday,2005,Intentional self-harm,White,Non - Hispanic white,External causes of morbidity and mortality
4,high school graduate or GED completed,January,M,75 - 84 years,Married,Sunday,2005,"Stroke, not specified as hemorrhage or infarct...",White,Non - Hispanic white,Diseases of the circulatory system


In [10]:
# Select desired features
cleanup_df = df[["Cause of Death", "Cause of Death Category", "Year", "Month of Death", "Sex/Gender",
                        "Marital Status", "Age Groups", "Education Level", "Race"]]

In [11]:
df2 = cleanup_df.loc[cleanup_df['Cause of Death Category'].isin(['Diseases of the nervous system','Neoplasms', 'Diseases of the respiratory system', 'External causes of morbidity and mortality'])]

In [12]:
df2["Cause of Death Category"] = 'Other'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
df2["Cause of Death Category"].value_counts()

Other    9078564
Name: Cause of Death Category, dtype: int64

In [14]:
df3 = cleanup_df.loc[(cleanup_df["Cause of Death Category"] == "Diseases of the circulatory system")]

In [15]:
df3["Cause of Death Category"].value_counts()

Diseases of the circulatory system    6276232
Name: Cause of Death Category, dtype: int64

In [16]:
df4 = pd.concat([df2, df3])

In [17]:
# Select desired labels (5 causes of death, plus a "control group")
cleanup_df = cleanup_df.loc[(cleanup_df["Cause of Death Category"] == "Other cerebrovascular diseases and their sequelae ") | 
                            (cleanup_df["Cause of Death Category"] == "All other diseases of respiratory system ") |
                            (cleanup_df["Cause of Death Category"] == "External causes of morbidity and mortality") |
                            (cleanup_df["Cause of Death Category"] == "Alzheimer's disease ") |
                            (cleanup_df["Cause of Death Category"] == "Diabetes mellitus ") |
                            (cleanup_df["Cause of Death Category"] == "All other symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified ")
                            ]
cleanup_df = cleanup_df.reset_index(drop=True)

In [None]:
# Standardize values
cleanup_df = cleanup_df.replace({
    "Endocrine, nutritional and metabolic diseases": "Diabetes mellitus",
    "Diseases of the nervous system": "Alzheimer's Disease",
    "Diseases of the circulatory system": "Cerebrovascular Diseases",
    "Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified": "Other"
})

In [18]:
# Arrange final columns
selected_features = df4[["Cause of Death Category", "Year", "Month of Death", "Sex/Gender",
                        "Marital Status", "Age Groups", "Education Level", "Race"]]
selected_features.head()

Unnamed: 0,Cause of Death Category,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
1,Other,2005,January,F,Married,45 - 54 years,"9 - 12th grade, no diploma",White
2,Other,2005,January,F,Widowed,65 - 74 years,high school graduate or GED completed,White
3,Other,2005,January,M,Married,55 - 64 years,high school graduate or GED completed,White
7,Other,2005,January,F,Widowed,85 years and over,"some college credit, but no degree",White
11,Other,2005,January,F,Widowed,75 - 84 years,high school graduate or GED completed,White


In [None]:
selected_features

In [None]:
# Print labels
for x in selected_features["Cause of Death Category"].unique():
    print(x)

# Select Labels for Test and Control Groups

In [None]:
selected_features = selected_features.loc[(cleanup_df["Cause of Death Category"] == "External causes of morbidity and mortality"), 
                            (cleanup_df["Cause of Death Category"] == "All other diseases of respiratory system"),
                            (cleanup_df["Cause of Death Category"] == "External causes of morbidity and mortality"),
                            (cleanup_df["Cause of Death Category"] == "Alzheimer's disease"),
                            (cleanup_df["Cause of Death Category"] == "Diabetes mellitus"),
                            (cleanup_df["Cause of Death Category"] == "All other symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified")]
selected_features = selected_features.reset_index(drop=True)

In [None]:
selected_features

# Preview Data Distribution by Features and Classes

In [None]:
cod = selected_features.groupby("Cause of Death Category").count()
cod.sort_values(by="Year", ascending=False)

In [None]:
age = selected_features.groupby("Age Groups").count()
age.sort_values(by="Year", ascending=False)

In [None]:
race = selected_features.groupby("Race").count()
race.sort_values(by="Year", ascending=False)

In [None]:
selected_features["Marital Status"].value_counts()

In [19]:
selected_features = selected_features.drop(selected_features.index[selected_features['Marital Status'] == 'Marital Status unknown'])

In [20]:
selected_features = selected_features.drop(selected_features.index[selected_features['Education Level'] == 'Unknown'])

In [21]:
selected_features = selected_features.drop(selected_features.index[selected_features['Age Groups'] == 'Age not stated'])

In [31]:
selected_features["Race"].value_counts()

White                        12851884
Black                         1517056
Asian or Pacific Islander      359199
American Indian                 91015
Name: Race, dtype: int64

# Apply One-Hot Encoding

In [32]:
label_encoder = LabelEncoder()
onehotencoder = OneHotEncoder()

### Encode X data (features)

In [33]:
column_list = ["Month of Death", "Age Groups", "Education Level", "Sex/Gender", "Marital Status", "Race"]

In [None]:
for column in column_list:
    # Reshape column data; fit to the one-hot-encoder (expands columns)
    X = onehotencoder.fit_transform(selected_features[column].values.reshape(-1,1)).toarray()
    
    # Send the one-hot-encoded information from that column to a new dataframe
    dfOneHot = pd.DataFrame(X, columns = [column+str(int(i)) for i in range(X.shape[1])])
    
    # Merge the one-hot-encoded dataframe to the master dataframe
    selected_features = selected_features.merge(dfOneHot, how="right", right_index=True, left_index=True)
    
    # Drop the column selected (no longer needed)
    selected_features = selected_features.drop([column], axis=1)

selected_features.head()

### Encode Y data (labels/categories)

In [None]:
selected_features["Cause of Death Category"] = label_encoder.fit_transform(selected_features["Cause of Death Category"])
np.save('../Neural_Network_Trained_Models/saved_model/model_1_classes.npy', label_encoder.classes_)
selected_features.head()

### Select Data Values

In [None]:
X = selected_features.iloc[:, 1:45]
y = selected_features.iloc[:, 0]

# Create a Train Test Split

In [None]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Perform Random Under Sampling and Standard Scaling
data_transform = make_pipeline_imb(StandardScaler(), RandomUnderSampler())

X_train_resample, y_train_resample = RandomUnderSampler().fit_resample(X_train, y_train)
# X_train_resample = StandardScaler().fit_transform(X_train_resample)

X_test_resample, y_test_resample = RandomUnderSampler().fit_resample(X_test, y_test)
# X_test_resample = StandardScaler().fit_transform(X_test_resample)

In [None]:
X_train_resample

# Make Keras Pickle-able

Boiler plate code found at https://github.com/tensorflow/tensorflow/issues/34697

In [None]:
# Hotfix function
def make_keras_picklable():
    def __getstate__(self):
        model_str = ""
        with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd:
            save_model(self, fd.name, overwrite=True)
            model_str = fd.read()
        d = {'model_str': model_str}
        return d

    def __setstate__(self, state):
        with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd:
            fd.write(state['model_str'])
            fd.flush()
            model = load_model(fd.name)
        self.__dict__ = model.__dict__


    cls = Model
    cls.__getstate__ = __getstate__
    cls.__setstate__ = __setstate__

# Run the function
make_keras_picklable()

# Make a Keras Deep Learning Classifier

In [None]:
classifier = models.Sequential()
number_inputs = 44
first_hidden_layer = 60
second_hidden_layer = 48
third_hidden_layer = 36
fourth_hidden_layer = 24
fifth_hidden_layer = 12
sixth_hidden_layer = 6
number_classes = 2

classifier.add(layers.Dense(units=first_hidden_layer, activation='relu', input_dim=number_inputs))
classifier.add(layers.Dense(units=second_hidden_layer, activation='relu'))
classifier.add(layers.Dense(units=third_hidden_layer, activation='relu'))
classifier.add(layers.Dense(units=fourth_hidden_layer, activation='relu'))
classifier.add(layers.Dense(units=fifth_hidden_layer, activation='relu'))
classifier.add(layers.Dense(units=sixth_hidden_layer, activation='relu'))
classifier.add(layers.Dense(units=number_classes, activation='relu'))
classifier.compile(optimizer='adam', loss='categorical_hinge', metrics=['accuracy'])

# Fit Model

In [None]:
classifier.fit(X_train_resample, y_train_resample, epochs=80)

In [None]:
classifier.summary()

# Perform Predictions

In [None]:
results = classifier.evaluate(X_test_resample, y_test_resample)

In [None]:
print(f"Model Accuracy: {results[1]*100}%")

In [None]:
# Import User Input
user_input = pd.read_csv("sample.csv")

In [None]:
predictions = classifier.predict(user_input)
predicted_class_num = classifier.predict_classes(user_input)
predicted_class_string = label_encoder.inverse_transform(predicted_class_num)
predicted_accuracy = predictions[0,1]*100

In [None]:
print(f"Class: {predicted_class_num}")
print(f"Class: {predicted_class_string}")
print(f"Probability: {predicted_accuracy}")

In [None]:
y_test_resample.head(10)

# Save the Model

In [None]:
classifier.save("saved_model/Model_1_External_Causes.h5")

In [None]:
External_Causes_Model = tf.keras.models.load_model("saved_model/Model_1_External_Causes.h5")

In [None]:
test_df = pd.DataFrame(X_test_resample.iloc[0, :])
test_df = test_df.T

In [None]:
X_test.iloc[0].T.to_csv("sample.csv")