# Bank Marketing in R using h2o AutoML

More info on H2O AutoML at https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

Install and load required packages

In [None]:
# Install any package that is missing, then attach h2o explicitly.
# `require()` only attaches a package that is already installed, so on a fresh
# machine the original pattern installed h2o but never loaded it, and the later
# h2o.* calls failed. getPass is called as getPass::getPass() and therefore
# only needs to be installed, not attached.
for (pkg in c("h2o", "h2o4gpu", "getPass")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library("h2o")

In [None]:
library('tidyverse')
library('caret')
library("DBI")
library('dplyr')
library('dbplyr')
library('tdplyr')

Replace the host, user and database values below with the details of your own cluster. The password is prompted for at run time.

In [None]:
# Connection settings: replace host, user and database with your own values.
host <- "3.238.151.85"
user <- "AOA_DEMO"
database <- "AOA_DEMO"

# Prompt for the password so it is never written into the notebook.
password <- getPass::getPass("Enter the password: ")

# Pass the connection parameters as named arguments instead of a hand-built
# JSON string: a password containing a double quote or a backslash would
# otherwise produce malformed JSON and the connection would fail.
con <- DBI::dbConnect(
  teradatasql::TeradataDriver(),
  host = host,
  user = user,
  password = password,
  database = database
)

Load the dataset

In [None]:
# Pull the whole table into a local data frame.
bank <- DBI::dbGetQuery(con, "select * from bank_marketing_data")

# Columns converted to factors and to integers, respectively.
factor_cols <- c(
  "job", "marital", "education", "default", "housing",
  "loan", "contact", "month", "poutcome", "y"
)
integer_cols <- c("balance", "day", "pdays")

# Convert each listed column in place; all other columns are left as returned
# by the query.
bank[factor_cols] <- lapply(bank[factor_cols], as.factor)
bank[integer_cols] <- lapply(bank[integer_cols], as.integer)

Let's inspect the dataset

In [None]:
# Compact overview of the data frame: one line per column with its type and
# the first few values.
str(bank)

In [None]:
# Evaluate the bare name so the notebook displays the data frame.
bank

Check for NA values

In [None]:
# Tally missing vs. present cells over the whole data frame
# (TRUE = NA, FALSE = value present).
table(is.na(bank))

Find the number of unique ages to choose histogram bin number

In [None]:
# One bin per integer age.
# With `bins = n`, ggplot2 lays n equal-width bins over the observed range, so
# n must be the age span (max - min + 1) for the bin width to be exactly 1.
# Using the number of distinct ages instead makes the bins slightly wider than
# 1 whenever some age inside the range is absent from the data; one bin then
# swallows two neighbouring ages and shows up as a spurious spike.
age_range <- range(bank$age, na.rm = TRUE)
binnum <- as.integer(diff(age_range)) + 1L

# Stacked histogram of age, coloured by the outcome column `y`.
ggplot(bank, aes(age, fill = y)) +
  geom_histogram(bins = binnum) +
  ggtitle("Age of subscribers and non-subscribers") +
  xlab("Age") +
  ylab("# of subscribers") +
  theme_bw() +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.title = element_blank())

In the histogram above we can see that there is one customer age that is significantly more prevalent than others.

Now I will visualize subscriber percentage by education level, marital status, occupation, and age:

In [None]:
### subscriber % by education

# Row count for every (education, y) pair; the result stays grouped by both.
yn <- bank %>%
  group_by(education, y) %>%
  summarise(n = n(), .groups = "keep")

# Row count per education level, used as the denominator.
yed <- bank %>%
  group_by(education) %>%
  summarise(n = n())

# After the join, n.x is the pair count and n.y the education-level total.
jed <- yn %>%
  left_join(yed, by = "education") %>%
  mutate(perc = round(100 * (n.x / n.y), digits = 0))

# Horizontal stacked bars, labelled with the rounded percentage.
ggplot(
  data = jed,
  mapping = aes(x = education, y = perc, fill = y, label = perc)
) +
  geom_bar(stat = "identity", alpha = 0.7) +
  geom_text(position = "stack", size = 6) +
  ggtitle("Percentage of subscribers by education level") +
  xlab("") +
  ylab("% subscribed") +
  scale_fill_brewer(palette = "Set3") +
  theme_bw() +
  theme(legend.title = element_blank()) +
  coord_flip()

In [None]:
### subscriber % by marital status

# Row count for every (marital, y) pair; the result stays grouped by both.
mar <- bank %>%
  group_by(marital, y) %>%
  summarise(n = n(), .groups = "keep")

# Row count per marital status, used as the denominator.
ymar <- bank %>%
  group_by(marital) %>%
  summarise(n = n())

# After the join, n.x is the pair count and n.y the marital-status total.
jmar <- mar %>%
  left_join(ymar, by = "marital") %>%
  mutate(perc = round(100 * (n.x / n.y), digits = 0))

# Horizontal stacked bars, labelled with the rounded percentage.
ggplot(
  data = jmar,
  mapping = aes(x = marital, y = perc, fill = y, label = perc)
) +
  geom_bar(stat = "identity", alpha = 0.7) +
  geom_text(position = "stack", size = 6) +
  scale_fill_brewer(palette = "Spectral") +
  ggtitle("Percentage of subscribers by marital status") +
  xlab("") +
  ylab("% subscribed") +
  theme_bw() +
  theme(legend.title = element_blank()) +
  coord_flip()

In [None]:
### subscribed by job type

# Cross-tabulate job against y and flatten to long form (job, y, Freq).
# Unlike the group_by() summaries, the table keeps combinations with a zero
# count.
ageout <- as.data.frame(table(job = bank$job, y = bank$y))

# Row count per job, used as the denominator.
jobs <- bank %>%
  group_by(job) %>%
  summarise(n = n())

# Attach the per-job total and turn the counts into rounded percentages.
aj <- ageout %>%
  left_join(jobs, by = "job") %>%
  mutate(perc = round(100 * (Freq / n), digits = 0))

# Horizontal stacked bars, labelled with the rounded percentage.
ggplot(
  data = aj,
  mapping = aes(x = job, y = perc, fill = y, label = perc)
) +
  geom_bar(stat = "identity", alpha = 0.7) +
  geom_text(position = "stack", size = 6) +
  scale_fill_brewer(palette = "Set2") +
  ggtitle("Percentage of subscribers by occupation") +
  xlab("") +
  ylab("% subscribed") +
  theme_bw() +
  theme(legend.title = element_blank()) +
  coord_flip()

In [None]:
### subscriber by age

# Row count for every (age, y) pair; the result stays grouped by both.
age <- bank %>%
  group_by(age, y) %>%
  summarise(n = n(), .groups = "keep")

# Row count per age, used as the denominator.
yage <- bank %>%
  group_by(age) %>%
  summarise(n = n())

# After the join, n.x is the pair count and n.y the per-age total; this
# chart keeps one decimal place.
jage <- age %>%
  left_join(yage, by = "age") %>%
  mutate(perc = round(100 * (n.x / n.y), digits = 1))

# Side-by-side bars per age, one bar for each value of y.
ggplot(
  data = jage,
  mapping = aes(x = age, y = perc, fill = y, label = perc)
) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.6) +
  scale_fill_brewer(palette = "Paired") +
  ggtitle("Percentage of subscribers by age") +
  xlab("Age") +
  ylab("% subscribed") +
  theme_bw() +
  theme(legend.title = element_blank())

Having completed the initial exploratory data analysis, we now upload the dataset to H2O and train models on it with AutoML. The models are trained on eight of the columns in the dataset: age, job, marital, education, default, balance, housing, loan

In [None]:
# Close whatever connection `con` currently holds before rebinding the name;
# otherwise the earlier DBI session is orphaned and stays open on the database.
# A failure to close is reported as a warning and does not stop the cell.
if (exists("con", inherits = FALSE)) {
  tryCatch(
    DBI::dbDisconnect(con),
    error = function(e) {
      warning(
        "Could not close the previous connection: ", conditionMessage(e),
        call. = FALSE
      )
    }
  )
}

# Create Vantage connection using tdplyr
con <- td_create_context(
  host = host,
  uid = user,
  pwd = password,
  dType = "native"
)

# Set connection context
td_set_context(con)

In [None]:
# Reference the remote table through the tdplyr connection.
# NOTE(review): the variable `table` has the same name as base::table();
# calls such as table(x) still resolve to the function, but a different
# name would be clearer.
table_name <- in_schema(database, "bank_marketing_data")
table <- tbl(con, table_name)

# Keep only the eight predictors plus the response and pull them into a local
# data frame.
bank <- table %>%
  select(c(
    "age", "job", "marital", "education", "default",
    "balance", "housing", "loan", "y"
  )) %>%
  as.data.frame()

# Columns converted to integers and to factors, respectively.
integer_cols <- c("age", "balance")
factor_cols <- c(
  "job", "marital", "education", "default", "housing", "loan", "y"
)
bank[integer_cols] <- lapply(bank[integer_cols], as.integer)
bank[factor_cols] <- lapply(bank[factor_cols], as.factor)

str(bank)

In [None]:
# Per-column summary of the modelling data frame.
summary(bank)

Now we must initialize h2o and prepare the training dataset

In [None]:
# Start (or connect to) the H2O instance, passing nthreads = -1.
h2o.init(nthreads = -1)

# Upload the local data frame to H2O as an H2OFrame.
train_df <- as.h2o(x = bank)

We split the dataset into train and test

In [None]:
# Split with ratio 0.75 (first part / remainder); the seed is fixed so the
# split is repeatable.
splits <- h2o.splitFrame(data = train_df, ratios = 0.75, seed = 1234)

# First element is the training part, second the held-out part.
train <- splits[[1]]
test <- splits[[2]]

Identify predictors and response

In [None]:
# Name of the response column.
y <- "y"

# Every other column of the training frame is a predictor.
x <- names(train)[!(names(train) %in% y)]

Run AutoML for 20 base models

In [None]:
# Run AutoML on the training frame, capped at 20 models; the seed is fixed
# for repeatability.
aml <- h2o.automl(
  x = x,
  y = y,
  training_frame = train,
  max_models = 20,
  seed = 1
)

View the AutoML Leaderboard

In [None]:
# Leaderboard of the models produced by the AutoML run.
lb <- aml@leaderboard

# Print every row rather than the default first six.
n_models <- nrow(lb)
print(lb, n = n_models)

In [None]:
# Take the leader model from the AutoML run.
model <- aml@leader
# Evaluate the bare name so the notebook displays the model.
model

Now we evaluate the predictions against the test dataframe.

In [None]:
# Score the held-out frame with the leader model.
pred <- h2o.predict(object = model, newdata = test)

In [None]:
# Eval performance:
# Pass the held-out frame explicitly. Without `newdata`, h2o.performance()
# returns the metrics computed on the training data, which overstate how well
# the model generalises.
perf <- h2o.performance(model, newdata = test)
perf

The h2o.predict function outputs a frame with a `predict` column holding the predicted class, plus “no” and “yes” columns holding the predicted probability of each class. With this data frame we can get the accuracy of our model with a confusion matrix.

In [None]:
library(caret)

# Bring predictions and the held-out rows back into local data frames.
pred_df <- as.data.frame(pred)
test_df <- as.data.frame(test)

# Give the predicted and actual classes the same factor levels, which
# confusionMatrix() needs in order to compare them.
actual <- as.factor(test_df$y)
predicted <- factor(pred_df$predict, levels = levels(actual))

# caret expects the predictions as `data` and the ground truth as `reference`.
# With the two swapped, accuracy is unchanged but sensitivity, specificity and
# the predictive values are reported for the wrong roles.
cm <- confusionMatrix(data = predicted, reference = actual)
cm

# Fourfold display of the 2 x 2 confusion table.
fourfoldplot(cm$table)
#png("artifacts/output/confusion_matrix.png", width = 860, height = 860)

And to get the model metrics

In [None]:
# Show the `overall` element of the confusion-matrix result.
cm$overall