In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt

In [3]:
import pandas as pd

In [4]:
import statsmodels.api as sm

# 0. Data

Source & description: https://archive.ics.uci.edu/dataset/222/bank+marketing

In [6]:
# Load the bank-marketing data set; the file uses ';' as its field separator.
df = pd.read_csv("./data/convert-data.csv", sep=";")

In [7]:
# List the column names alphabetically to see which fields are available.
sorted(df.columns)

['age',
 'balance',
 'campaign',
 'contact',
 'day',
 'default',
 'duration',
 'education',
 'housing',
 'job',
 'loan',
 'marital',
 'month',
 'pdays',
 'poutcome',
 'previous',
 'y']

In [8]:
# Show the last rows as a quick sanity check of the loaded data.
df.tail()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no
45210,37,entrepreneur,married,secondary,no,2971,no,no,cellular,17,nov,361,2,188,11,other,no


In [9]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


# 1. Target Variable

In [10]:
# Binary target: 1 where y is "yes", 0 for every other value.
df["conversion"] = df["y"].eq("yes").astype("int64")

In [11]:
# Overall conversion rate: the mean of the 0/1 flag is the share of "yes" rows.
df["conversion"].mean()

0.11698480458295547

# 2. Decision Tree / Interaction Analysis

## 2.1. Continuous Variables

In [12]:
# Numeric columns that are passed to the model as-is.
continuous_vars = [
    "age",
    "balance",
    "duration",
    "campaign",
    "previous",
]

In [13]:
# Summary statistics restricted to the continuous feature columns.
df[continuous_vars].describe()

Unnamed: 0,age,balance,duration,campaign,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,258.16308,2.763841,0.580323
std,10.618762,3044.765829,257.527812,3.098021,2.303441
min,18.0,-8019.0,0.0,1.0,0.0
25%,33.0,72.0,103.0,1.0,0.0
50%,39.0,448.0,180.0,2.0,0.0
75%,48.0,1428.0,319.0,3.0,0.0
max,95.0,102127.0,4918.0,63.0,275.0


## 2.2. Categorical Variables

Encoding Month

In [14]:
# Inspect the raw month labels before encoding them.
df['month'].unique()

array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'jan', 'feb',
       'mar', 'apr', 'sep'], dtype=object)

In [15]:
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Encode month abbreviations as calendar numbers (jan -> 1 ... dec -> 12).
# The column is overwritten in place, so only encode while it still holds the
# raw labels: without this guard, re-running the cell raises ValueError
# because the values are already integers.
if not pd.api.types.is_numeric_dtype(df['month']):
    month_numbers = {name: number for number, name in enumerate(months, start=1)}

    # Fail loudly on labels outside the twelve abbreviations instead of
    # letting them turn into NaN during the mapping.
    unexpected = set(df['month'].unique()) - set(month_numbers)
    if unexpected:
        raise ValueError(
            "unexpected month labels: %r" % sorted(unexpected, key=str)
        )

    df['month'] = df['month'].map(month_numbers)

In [16]:
# Number of rows per encoded month (non-null count of the conversion column).
df.groupby('month')['conversion'].count()

month
1      1403
2      2649
3       477
4      2932
5     13766
6      5341
7      6895
8      6247
9       579
10      738
11     3970
12      214
Name: conversion, dtype: int64

Encoding Job

In [17]:
# Inspect the distinct job labels before one-hot encoding them.
df['job'].unique()

array(['management', 'technician', 'entrepreneur', 'blue-collar',
       'unknown', 'retired', 'admin.', 'services', 'self-employed',
       'unemployed', 'housemaid', 'student'], dtype=object)

In [18]:
# One-hot encode the job labels; every indicator column is named "job_<label>".
jobs_encoded_df = pd.get_dummies(df['job'], prefix='job', prefix_sep='_')

In [19]:
# Preview the one-hot encoded job columns.
jobs_encoded_df.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,False,False,False,False,True,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,False,False
2,False,False,True,False,False,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True


In [20]:
# Attach the one-hot job columns to the main frame. Copies left over from an
# earlier run of this cell are dropped first, so re-running the cell does not
# leave duplicate job_* columns in df.
df = pd.concat(
    [df.drop(columns=jobs_encoded_df.columns, errors='ignore'), jobs_encoded_df],
    axis=1,
)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,False,False,True,False,False,False,False,False,False,False
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,False,False,False,False,False,False,False,True,False,False
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,True,False,False,False,False,False,False,False,False,False
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,False,False,False,False,False,False,False,False,False,False
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,False,False,False,False,False,False,False,False,False,True


Encoding Marital

In [21]:
# One-hot encode marital status; indicator columns are named "marital_<label>".
marital_encoded_df = pd.get_dummies(df['marital'], prefix='marital', prefix_sep='_')

In [22]:
# Preview the one-hot encoded marital-status columns.
marital_encoded_df.head()

Unnamed: 0,marital_divorced,marital_married,marital_single
0,False,True,False
1,False,False,True
2,False,True,False
3,False,True,False
4,False,False,True


In [23]:
# Attach the one-hot marital columns to the main frame. Copies left over from
# an earlier run of this cell are dropped first, so re-running the cell does
# not leave duplicate marital_* columns in df.
df = pd.concat(
    [df.drop(columns=marital_encoded_df.columns, errors='ignore'), marital_encoded_df],
    axis=1,
)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,False,False,False,False,False,False,False,False,True,False
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,False,False,False,False,True,False,False,False,False,True
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,False,False,False,False,False,False,False,False,True,False
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,False,False,False,False,False,False,False,False,True,False
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,False,False,False,False,False,False,True,False,False,True


Encoding Housing

In [24]:
# Check which values the housing column takes before encoding it.
df['housing'].unique()

array(['yes', 'no'], dtype=object)

In [25]:
# Encode housing as a 0/1 flag ('yes' -> 1, anything else -> 0).
# The column is overwritten in place, so only encode while it still holds the
# raw labels: re-running the unguarded version would compare the 0/1 values
# against 'yes' and silently set the whole column to 0.
if not pd.api.types.is_numeric_dtype(df['housing']):
    df['housing'] = df['housing'].apply(lambda x: 1 if x == 'yes' else 0)

## 2.3. Fitting Decision Tree

In [26]:
from sklearn import tree

In [27]:
# Model inputs: the continuous columns, the encoded housing and month columns,
# and every one-hot job / marital indicator column.
features = [
    *continuous_vars,
    "housing",
    "month",
    *jobs_encoded_df.columns,
    *marital_encoded_df.columns,
]

# Name of the 0/1 target column.
response_var = 'conversion'

In [28]:
# Display the final list of feature columns.
features

['age',
 'balance',
 'duration',
 'campaign',
 'previous',
 'housing',
 'month',
 'job_admin.',
 'job_blue-collar',
 'job_entrepreneur',
 'job_housemaid',
 'job_management',
 'job_retired',
 'job_self-employed',
 'job_services',
 'job_student',
 'job_technician',
 'job_unemployed',
 'job_unknown',
 'marital_divorced',
 'marital_married',
 'marital_single']

In [45]:
# Limit the tree to depth 3. random_state pins the feature permutation that
# scikit-learn performs at every split, so ties between equally good splits
# are broken the same way and the notebook yields the same tree on each run.
dt_model = tree.DecisionTreeClassifier(
    max_depth=3,
    random_state=0,
)

In [46]:
# Fit the tree on the selected feature columns against the response column.
dt_model.fit(X=df[features], y=df[response_var])

In [47]:
# Class labels the fitted tree learned from the response column.
dt_model.classes_

array([0, 1])

## 2.4. Interpreting Decision Tree

In [48]:
# Prerequisite for the cells below: conda install python-graphviz

In [49]:
import graphviz

In [50]:
# Export the fitted tree as DOT source text (out_file=None returns a string),
# labelling nodes with the feature names and the two class labels.
export_options = dict(
    out_file=None,
    feature_names=features,
    class_names=['0', '1'],
    filled=True,
    rounded=True,
    special_characters=True,
)
dot_data = tree.export_graphviz(dt_model, **export_options)

In [51]:
# Wrap the DOT source in a graphviz object configured for PNG output.
graph = graphviz.Source(dot_data, format="png")

In [52]:
# Render the tree to disk; the call returns the path of the generated file.
graph.render("conversion-dt-depth-3")

'conversion-dt-depth-3.png'