# Install Dependent Libraries (Databricks)

Note: If running this notebook in Databricks, you will need the following libraries. If these libraries are not installed on your Databricks Cluster, you can simply uncomment and run the following cell to install those libraries in the notebook before you import the dependencies.

Libraries needed:
- koalas
- mlflow
- tensorflow
- imblearn

In [None]:
# dbutils.library.installPyPI("koalas")
# dbutils.library.installPyPI("mlflow")
# dbutils.library.installPyPI("tensorflow")
# dbutils.library.installPyPI("imblearn")
# dbutils.library.restartPython()

# Install Dependent Libraries (Python)

In [None]:
#pip install --upgrade xgboost

# Import Dependencies

In [1]:
# import databricks.koalas as ks
import pandas as pd

import numpy as np
import gzip

In [2]:
import numpy
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Connect to the AWS S3 Mount and Read CSV (Databricks only)

In [None]:
# ACCESS_KEY = "ENTER_YOUR_KEY_HERE" # dbutils.secrets.get(scope = "aws", key = "aws-access-key")
# SECRET_KEY = "ENTER_YOUR_KEY_HERE" # dbutils.secrets.get(scope = "aws", key = "aws-secret-key")
# ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
# AWS_BUCKET_NAME = "ENTER_YOUR_BUCKET_HERE" #Or the bucket you saved your data to
# MOUNT_NAME = "mnt_s3"
# s3_uri = f"s3a://{ACCESS_KEY}:{ENCODED_SECRET_KEY}@{AWS_BUCKET_NAME}"
# mount_uri = f"/mnt/{MOUNT_NAME}"
# display(dbutils.fs.ls(mount_uri))

In [None]:
# # Read CSVs
# df = pd.read_csv("/dbfs/mnt/%s/Project 3 Stuff/cod_clean.csv.gz" % MOUNT_NAME, compression="gzip")

# Read the CSV (Local Jupyter Notebook only)

In [3]:
# Load the gzip-compressed CSV from the local data directory into a DataFrame
cod_csv_path = "../data/cod_clean.csv.gz"
df = pd.read_csv(cod_csv_path, compression="gzip")

In [4]:
# Remove the columns that are not needed for the rest of the notebook
unused_columns = ["ICD Code", "Year", "Cause of Death"]
df = df.drop(unused_columns, axis=1)

In [5]:
# Discard columns that contain no data at all
df = df.dropna(axis='columns', how='all')

# Keep only rows where none of these fields carries its "unknown" placeholder
keep_rows = (
    df["Education Level"].ne("Unknown")
    & df["Age Groups"].ne("Age not stated")
    & df["Marital Status"].ne("Marital Status unknown")
    & df["Day of Week"].ne("Unknown")
)
df = df[keep_rows]

In [6]:
# Restrict the data to the three cause-of-death categories used as classes
target_categories = [
    'Diseases of the circulatory system',
    'Diseases of the nervous system',
    'Neoplasms',
]
in_target_category = df['Cause of Death Category'].isin(target_categories)
df2 = df[in_target_category]

In [7]:
# Show the filtered frame as the cell output (the rendering is truncated for large frames)
df2

Unnamed: 0,Education Level,Month of Death,Sex/Gender,Age Groups,Marital Status,Day of Week,Race,Hispanic Origin,Cause of Death Category
0,8th grade or less,June,M,85 years and over,Married,Saturday,White,Mexican,Diseases of the circulatory system
2,high school graduate or GED completed,January,F,65 - 74 years,Widowed,Sunday,White,Non - Hispanic white,Neoplasms
4,high school graduate or GED completed,January,M,75 - 84 years,Married,Sunday,White,Non - Hispanic white,Diseases of the circulatory system
5,high school graduate or GED completed,January,F,65 - 74 years,Widowed,Saturday,White,Non - Hispanic white,Diseases of the circulatory system
9,high school graduate or GED completed,January,F,75 - 84 years,Widowed,Sunday,White,Non - Hispanic white,Diseases of the circulatory system
...,...,...,...,...,...,...,...,...,...
19431017,Associate degree,December,F,65 - 74 years,"Never married, single",Tuesday,Black,Non - Hispanic black,Diseases of the circulatory system
19431022,high school graduate or GED completed,December,M,55 - 64 years,Divorced,Thursday,Black,Non - Hispanic black,Neoplasms
19431026,8th grade or less,December,M,65 - 74 years,Divorced,Tuesday,White,Puerto Rican,Diseases of the nervous system
19431029,Master’s degree,December,M,75 - 84 years,Widowed,Friday,White,Non - Hispanic white,Neoplasms


In [8]:
# Build the model inputs from the filtered frame.
# NOTE(review): the positional slices assume df2 holds exactly eight feature
# columns followed by the target column -- confirm if the CSV layout changes.
dataset = df2.values
# split data into X and y
X = dataset[:, 0:8].astype(str)
Y = dataset[:, 8]

# One-hot encode every categorical feature in a single pass.
# - OneHotEncoder accepts string columns directly and orders each column's
#   categories by sorted value, which is the same order the previous
#   LabelEncoder -> OneHotEncoder chain produced, so the resulting matrix has
#   the same columns in the same order.
# - The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed
#   in 1.4; relying on the default sparse output and densifying with
#   `.toarray()` works on both old and new releases.
# - Fitting one encoder avoids re-copying the growing matrix with
#   numpy.concatenate once per feature.
onehot_encoder = OneHotEncoder(categories='auto')
encoded_x = onehot_encoder.fit_transform(X).toarray()
print("X shape: : ", encoded_x.shape)

X shape: :  (11584585, 57)


In [9]:
# encode string class values as integers
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(Y)

# split data into train and test sets (fixed seed so the split is repeatable)
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    encoded_x, label_encoded_y, test_size=test_size, random_state=seed
)

# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)

# make predictions for test data
y_pred = model.predict(X_test)
# XGBClassifier.predict returns class labels, not probabilities, so the
# per-element Python round() loop was a no-op that only added a slow pass
# over every test row. `predictions` is kept as a name for later cells.
predictions = y_pred

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



KeyboardInterrupt: 

In [None]:
# Persist the trained classifier to a file in the working directory
import joblib
import xgboost as xgb

model_path = 'global.model'
bst = model
bst.save_model(model_path)