# Install Dependent Libraries (Databricks)

Note: If running this notebook in Databricks, you will need the following libraries. If these libraries are not installed on your Databricks Cluster, you can simply uncomment and run the following cell to install those libraries in the notebook before you import the dependencies.

Libraries needed:
- koalas
- mlflow
- tensorflow
- imblearn

In [None]:
# dbutils.library.installPyPI("koalas")
# dbutils.library.installPyPI("mlflow")
# dbutils.library.installPyPI("tensorflow")
# dbutils.library.installPyPI("imblearn")
# dbutils.library.restartPython()

# Import Dependencies

In [2]:
# import databricks.koalas as ks
import pandas as pd

import numpy as np
import gzip

In [3]:
import mlflow.sklearn
import mlflow.keras

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.preprocessing import StandardScaler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

In [7]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, utils

# Create a Keras model that's compatible with scikit-learn
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

import pickle
import tempfile
from tensorflow.keras.models import Sequential, load_model, save_model, Model
from tensorflow.keras.layers import Dense

# Connect to the AWS S3 Mount and Read CSV (Databricks only)

In [7]:
# ACCESS_KEY = "ENTER_YOUR_KEY_HERE" # dbutils.secrets.get(scope = "aws", key = "aws-access-key")
# SECRET_KEY = "ENTER_YOUR_KEY_HERE" # dbutils.secrets.get(scope = "aws", key = "aws-secret-key")
# ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
# AWS_BUCKET_NAME = "ENTER_YOUR_BUCKET_HERE" #Or the bucket you saved your data to
# MOUNT_NAME = "mnt_s3"
# s3_uri = f"s3a://{ACCESS_KEY}:{ENCODED_SECRET_KEY}@{AWS_BUCKET_NAME}"
# mount_uri = f"/mnt/{MOUNT_NAME}"
# display(dbutils.fs.ls(mount_uri))

In [8]:
# # Read CSVs
# df = pd.read_csv("/dbfs/mnt/%s/Project 3 Stuff/cod_clean.csv.gz" % MOUNT_NAME, compression="gzip")

# Read the CSV (Local Jupyter Notebook only)

In [8]:
# Read CSVs
df = pd.read_csv("../data/cod_clean.csv.gz", compression="gzip")

In [9]:
import numpy
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
pip install --upgrade xgboost

In [10]:
# Drop unnecessary column
df = df.drop(columns=["ICD Code", "Year", "Cause of Death"])
df.head()

Unnamed: 0,Education Level,Month of Death,Sex/Gender,Age Groups,Marital Status,Day of Week,Race,Hispanic Origin,Cause of Death Category
0,8th grade or less,June,M,85 years and over,Married,Saturday,White,Mexican,Diseases of the circulatory system
1,"9 - 12th grade, no diploma",January,F,45 - 54 years,Married,Saturday,White,Non - Hispanic white,Diseases of the respiratory system
2,high school graduate or GED completed,January,F,65 - 74 years,Widowed,Sunday,White,Non - Hispanic white,Neoplasms
3,high school graduate or GED completed,January,M,55 - 64 years,Married,Monday,White,Non - Hispanic white,External causes of morbidity and mortality
4,high school graduate or GED completed,January,M,75 - 84 years,Married,Sunday,White,Non - Hispanic white,Diseases of the circulatory system


In [11]:
df = df.dropna(axis='columns', how='all')

In [12]:
df = df[df["Education Level"] != "Unknown"]

In [13]:
df = df[df["Age Groups"] != "Age not stated"]

In [14]:
df = df[df["Marital Status"] != "Marital Status unknown"]

In [15]:
df = df[df["Day of Week"] != "Unknown"]

In [None]:
df

In [16]:
df8 = df.loc[(df["Cause of Death Category"] == "Diseases of the circulatory system")]

In [17]:
df["Cause of Death Category"].value_counts()

Diseases of the circulatory system                                                                     6045221
Neoplasms                                                                                              4432371
Diseases of the respiratory system                                                                     1827302
External causes of morbidity and mortality                                                             1406382
Diseases of the nervous system                                                                         1106993
Mental, behavioral and neurodevelopmental disorders                                                     899612
Endocrine, nutritional and metabolic diseases                                                           798858
Diseases of the digestive system                                                                        704731
Certain infectious and parasitic diseases                                                               500454
D

In [18]:
df8["Cause of Death Category"] = 'Yes'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
df9 = df.loc[df['Cause of Death Category'].isin(['Diseases of the nervous system','Neoplasms'])]

In [20]:
#, 'Diseases of the respiratory system', 'External causes of morbidity and mortality'

In [21]:
df9["Cause of Death Category"] = 'No'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
df9["Cause of Death Category"].value_counts()

No    5539364
Name: Cause of Death Category, dtype: int64

In [23]:
df10 = pd.concat([df8, df9])

In [24]:
dataset = df10.values
# split data into X and y
X = dataset[:,0:8]
X = X.astype(str)
Y = dataset[:,8]
# encode string input values as integers
encoded_x = None
for i in range(0, X.shape[1]):
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(X[:,i])
    feature = feature.reshape(X.shape[0], 1)
    onehot_encoder = OneHotEncoder(sparse=False, categories='auto')
    feature = onehot_encoder.fit_transform(feature)
    if encoded_x is None:
        encoded_x = feature
    else:
        encoded_x = numpy.concatenate((encoded_x, feature), axis=1)
print("X shape: : ", encoded_x.shape)

X shape: :  (11584585, 57)


In [26]:
# encode string class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(encoded_x, label_encoded_y, test_size=test_size, random_state=seed)
# fit model no training data
model1 = XGBClassifier()
model1.fit(X_train, y_train)
print(model1)
# make predictions for test data
y_pred = model1.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

XGBClassifier()
Accuracy: 59.85%


In [None]:
model

In [27]:
#pickle model
import pickle
file_name = "binary_xgb_reg.pkl"

# save pickle model 
pickle.dump(model1, open(file_name, "wb"))

In [28]:
#jolib model
import xgboost as xgb
import joblib
bst = model1
# filename = 'global.model'

bst.save_model('binary_global.model')

In [29]:
# # to save the model
# joblib.dump(bst, open(filename, 'wb'))

# # to load the saved model
# bst = joblib.load(open(filename, 'rb'))

In [30]:
test_df = pd.DataFrame(X_test[0, :])
test_df = test_df.T
test_df.to_csv("binarysample3.csv", index=False)

In [None]:
user_input = pd.read_csv("sample3.csv")
user_input

In [None]:
y_pred = xgb_model_loaded.predict(user_input.values)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test[[1]], predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
#Load pickle model 
xgb_model_loaded = pickle.load(open(file_name, "rb"))
#Predict values 
xgb_model_loaded.predict(user_input.values)

In [None]:
import xgboost as xgb

xgb.__version__

In [None]:
pip uninstall xgboost

In [None]:
pip install xgboost

In [None]:

xgb.__version__

In [None]:
dataset = df4.values
# split data into X and y
X = dataset[:,1:6]
X = X.astype(str)
Y = dataset[:,6]
# encode string input values as integers
encoded_x = None
for i in range(0, X.shape[1]):
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(X[:,i])
    feature = feature.reshape(X.shape[0], 1)
    onehot_encoder = OneHotEncoder(sparse=False, categories='auto')
    feature = onehot_encoder.fit_transform(feature)
    if encoded_x is None:
        encoded_x = feature
    else:
        encoded_x = numpy.concatenate((encoded_x, feature), axis=1)
print("X shape: : ", encoded_x.shape)
# encode string class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(encoded_x, label_encoded_y, test_size=test_size, random_state=seed)
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
loaded_model = pickle.load(open("pima.pickle.dat", "rb"))
# make predictions for test data
y_pred = loaded_model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
df5 = df.loc[(df["Cause of Death Category"] == "Neoplasms")]

In [None]:
df6 = df.loc[df['Cause of Death Category'].isin(['Diseases of the nervous system','Diseases of the circulatory system', 'Diseases of the respiratory system', 'External causes of morbidity and mortality'])]

In [None]:
df5["Cause of Death Category"] = 'Yes'

In [None]:
df6["Cause of Death Category"] = 'No'

In [None]:
df7 = pd.concat([df5, df6])

In [None]:
dataset = df7.values
# split data into X and y
X = dataset[:,1:6]
X = X.astype(str)
Y = dataset[:,6]
# encode string input values as integers
encoded_x = None
for i in range(0, X.shape[1]):
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(X[:,i])
    feature = feature.reshape(X.shape[0], 1)
    onehot_encoder = OneHotEncoder(sparse=False, categories='auto')
    feature = onehot_encoder.fit_transform(feature)
    if encoded_x is None:
        encoded_x = feature
    else:
        encoded_x = numpy.concatenate((encoded_x, feature), axis=1)
print("X shape: : ", encoded_x.shape)
# encode string class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(encoded_x, label_encoded_y, test_size=test_size, random_state=seed)
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
df10 = df.loc[(df["Cause of Death Category"] == "Diseases of the nervous system")]

In [None]:
df11 = df.loc[df['Cause of Death Category'].isin(['Diseases of the circulatory system','Neoplasms', 'Diseases of the respiratory system', 'External causes of morbidity and mortality'])]

In [None]:
df10["Cause of Death Category"] = 'Yes'

In [None]:
df11["Cause of Death Category"] = 'No'

In [None]:
df12 = pd.concat([df10, df11])

In [None]:
dataset = df12.values
# split data into X and y
X = dataset[:,1:6]
X = X.astype(str)
Y = dataset[:,6]
# encode string input values as integers
encoded_x = None
for i in range(0, X.shape[1]):
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(X[:,i])
    feature = feature.reshape(X.shape[0], 1)
    onehot_encoder = OneHotEncoder(sparse=False, categories='auto')
    feature = onehot_encoder.fit_transform(feature)
    if encoded_x is None:
        encoded_x = feature
    else:
        encoded_x = numpy.concatenate((encoded_x, feature), axis=1)
print("X shape: : ", encoded_x.shape)
# encode string class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(encoded_x, label_encoded_y, test_size=test_size, random_state=seed)
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
column_list = ["Education Level", "Month of Death", "Sex/Gender", "Age Groups", "Marital Status", "Race"]

In [None]:
df88 = df.loc[df['Cause of Death Category'].isin(["Diseases of the circulatory system", 'Diseases of the nervous system','Neoplasms', 'Diseases of the respiratory system', 'External causes of morbidity and mortality'])]

In [None]:
label_encoder = LabelEncoder()
onehotencoder = OneHotEncoder()

In [None]:
df88["Education Level"].value_counts()

In [None]:
df88 = df88[df88["Age Groups"] != "Age not stated"]

In [None]:
df88 = df88[df88["Marital Status"] != "Marital Status unknown"]

In [None]:
df88 = df88.dropna(thresh=2)

In [None]:
df88

In [None]:
for column in column_list:
    # Reshape column data; fit to the one-hot-encoder (expands columns)
    X = onehotencoder.fit_transform(df88[column].values.reshape(-1,1)).toarray()
    
    # Send the one-hot-encoded information from that column to a new dataframe
    dfOneHot = pd.DataFrame(X, columns = [column+str(int(i)) for i in range(X.shape[1])])
    
    # Merge the one-hot-encoded dataframe to the master dataframe
    df88 = df88.merge(dfOneHot, how="right", right_index=True, left_index=True)
    
    # Drop the column selected (no longer needed)
    selected_features = df88.drop([column], axis=1)

selected_features.head()

In [None]:
df88

In [None]:
# Import User Input
user_input = pd.read_csv("sample2.csv")
user_input

In [None]:
y_pred = model.predict(user_input)
predictions = [round(value) for value in y_pred]
predictions