# In [None]:  (Jupyter notebook cell marker left over from export; kept as a comment so the script parses)
#########################################################
### Analysis of Readmission Data for 10,000 Diabetics ###
#########################################################


# Load libraries
library(RODBC) # for rodbc queries
library(pandas)
library(plotly) # for plotting
library(tidyverse) # for data manipulation and graphing
library(data.table)
library(prettyR)
library(DataExplorer) # data exploration
library(caret)
library(tictoc) #measures time
library(xgboost)
library(Matrix)
library(fairml) # for bias and fairness assessment. Not used in final model.
library(datarobot) # used to import model into Data Robot for bias/fairness


# Load data
# The file is read with base R (the original lines here were Python/pandas
# syntax and could not run in an R session). The object is named `readmit_dat`
# because that is the name the rest of the script refers to.
readmit_dat <- read.csv("readmissions.csv")

# Format data

## No manipulation or transformation is done in code; all cleaning was done
## manually in Excel before the CSV was exported.

# Columns excluded from the design matrix: the first column and the `testset`
# split flag (the flag is used later for the train/test split, not as a
# feature).
# NOTE(review): column 1 is presumably a row/patient identifier -- confirm.
drop_cols <- c(1, which(colnames(readmit_dat) %in% c("testset")))

# One-hot encode the predictors into a sparse matrix. `+ 0` removes the
# intercept column so every factor level gets its own indicator.
readmit_dat_sm <- sparse.model.matrix(
  IS_READMIT ~ . + 0,
  data = readmit_dat[, -drop_cols, drop = FALSE]
)

# Split data sets
# Rows are assigned to train/test using the `testset` flag stored in
# `readmit_dat`. The model matrix must have one row per source row for the
# logical mask to line up (rows with NA values can be dropped while the model
# matrix is built), so that is checked before subsetting.
stopifnot(nrow(readmit_dat_sm) == nrow(readmit_dat))

is_test <- as.logical(readmit_dat$testset)
stopifnot(!anyNA(is_test))

train_xgboost_dat <- readmit_dat_sm[!is_test, , drop = FALSE]
test_xgboost_dat <- readmit_dat_sm[is_test, , drop = FALSE]

# Outcome labels for each partition.
# NOTE(review): assumes IS_READMIT is coded 0/1 or TRUE/FALSE; a factor or
# character column would need explicit recoding before as.numeric() -- confirm.
readmit_label <- as.numeric(readmit_dat$IS_READMIT)
train_xgboost_label <- readmit_label[!is_test]
test_xgboost_label <- readmit_label[is_test]

# Create and store XGB matrices
# The test matrix is built from the held-out rows and labels (previously it was
# a copy of the training data, so every "test" metric was a training metric).
train_xgboost <- xgb.DMatrix(data = train_xgboost_dat, label = train_xgboost_label)
test_xgboost <- xgb.DMatrix(data = test_xgboost_dat, label = test_xgboost_label)

# Parameter list for the XGBoost model
# Gradient-boosted trees with a logistic objective for the binary readmission
# outcome.
params <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eta = 0.1,              # learning rate
  gamma = 0,              # minimum loss reduction required to split
  max_depth = 5,
  min_child_weight = 1,
  subsample = 0.7,        # row subsampling per tree
  colsample_bytree = 0.3, # column subsampling per tree (was misspelled, so it was not applied)
  scale_pos_weight = 1.0  # no re-weighting of the positive class
)

# Train model
# tic()/toc() bracket the fit so the elapsed training time is printed.
tic("get")
xgb_model <- xgb.train(
  params = params,
  data = train_xgboost,
  nrounds = 200,
  # Both partitions are monitored; AUC is printed every 25 rounds.
  # NOTE(review): recent xgboost releases rename `watchlist` to `evals` --
  # adjust if the installed version rejects this argument.
  watchlist = list(train = train_xgboost, test = test_xgboost),
  print_every_n = 25,
  # Stop when the monitored metric has not improved for 10 rounds
  # (the argument name was misspelled, so early stopping never engaged).
  early_stopping_rounds = 10,
  # AUC is a higher-is-better metric, so early stopping must maximise it.
  maximize = TRUE,
  eval_metric = "auc"
)

toc() # gets time elapsed

# Plotting
# Feature importance
# Importance is computed from the fitted model, with feature names taken from
# the training design matrix columns.
xgb_importance <- xgb.importance(
  feature_names = colnames(train_xgboost_dat),
  model = xgb_model
)

# Show at most the 20 most important features. `top_n` is used instead of
# `[1:20]` so that models with fewer than 20 features do not produce NA rows.
xgb.plot.importance(importance_matrix = xgb_importance, top_n = 20)

# Importance top 10 (by Gain), drawn as a horizontal bar chart
xgb_importance %>%
  top_n(10, Gain) %>%
  ggplot(aes(x = reorder(Feature, Gain), y = Gain)) +
  geom_bar(stat = "identity", fill = "#001111", alpha = 0.7) +
  xlab("") +
  scale_y_continuous(breaks = seq(0, .3, .05)) +
  coord_flip()

# Retrieve predictions
# Predicted probabilities for the test matrix from the fitted model
# (`xgb_model`; the original referenced the non-existent name `xgbmodel`).
xgbpred <- predict(xgb_model, test_xgboost)
head(xgbpred)

# Observed labels stored in the test DMatrix, in the same row order as xgbpred.
xgblabel <- getinfo(test_xgboost, "label")
head(xgblabel)

# ROC and AUC
# PRROC takes the scores of the two classes separately: `scores.class0` are the
# scores of the positive (foreground) class, `scores.class1` those of the
# negative (background) class. These vectors were never defined in the original.
fg <- xgbpred[xgblabel == 1]
bg <- xgbpred[xgblabel == 0]

# ROC curve: print the AUC summary, then plot the curve.
rocauc3 <- PRROC::roc.curve(scores.class0 = fg, scores.class1 = bg, curve = TRUE)
print(rocauc3)
plot(rocauc3)

# Precision-recall curve: print the summary, then plot the curve.
# The function is namespaced because PRROC is not attached with library().
ra3 <- PRROC::pr.curve(scores.class0 = fg, scores.class1 = bg, curve = TRUE)
print(ra3)
plot(ra3)


## Performance measurements
# Confusion matrix and derived statistics at a series of probability cut-offs.
# Both the predicted and observed labels are turned into factors with fixed
# levels 0/1 so the table is always 2x2; otherwise a high cut-off with no
# predicted positives yields a one-row table and confusionMatrix() fails.
thresholds <- c(.5, .6, .7, .8, .9, .92)
xgblabel_fct <- factor(xgblabel, levels = c(0, 1))

for (cutoff in thresholds) {
  xgbpred_label <- factor(if_else(xgbpred > cutoff, 1, 0), levels = c(0, 1))
  cat("\nThreshold:", cutoff, "\n")
  # print() is required: results are not auto-printed inside a loop.
  print(
    confusionMatrix(
      table(xgbpred_label = xgbpred_label, xgblabel = xgblabel_fct),
      positive = "1"
    )
  )
}

##################################################
## This is run within the datarobot environment ##
##################################################


# Import into DataRobot for bias and fairness.
# NOTE: from here on the statements are Python (DataRobot Python client), not R;
# they cannot be sourced as part of the R script above.
import datarobot as dr

# Use the execution_environment created earlier.
# NOTE(review): `execution_environment` is not defined anywhere in this file;
# it must already exist in the session -- confirm where it is created.

# Create a new version of that execution environment from the given docker
# context directory.
environment_version = dr.ExecutionEnvironmentVersion.create(
    execution_environment.id,
    docker_context_path="datarobot-user-models/public_dropin_environments/python3_pytorch",
    max_wait=None,  # set None to not block execution on this method
)

# Notebook-style inspection of the new version's id (a bare expression; it only
# displays a value in an interactive session).
environment_version.id

# Build status immediately after the create call.
environment_version.build_status


# After some time: re-fetch the version and look at the build status again.
environment_version.refresh()
environment_version.build_status


# Import model into DataRobot: register a custom inference model for a binary
# target named 'readmitted'.
# NOTE(review): positive_class_label is 'False' and negative_class_label is
# 'True', which looks inverted for a readmission target -- confirm intent.
# NOTE(review): the name and description mention PyTorch, while the model
# trained earlier in this file is XGBoost -- confirm the text is intended.
custom_model = dr.CustomInferenceModel.create(
    name='Python 3 PyTorch Custom Model',
    target_type=dr.TARGET_TYPE.BINARY,
    target_name='readmitted',
    positive_class_label='False',
    negative_class_label='True',
    description='This is a Python3-based custom model. It has a simple PyTorch model built on 10k_diabetes dataset',
    language='Python 3'
)

# Notebook-style inspection of the created model's id.
custom_model.id

#### from here, the original code above is run in the datarobot environment
#### syntax was updated in some cases
####
#
#
#
# Remaining work to tune was done within datarobot GUI
# all work on bias and fairness was done in datarobot GUI (no coding options available)
# 
#
#

