In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import altair as alt

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split


%matplotlib inline

In [2]:
df = pd.read_csv('/Users/mbusch/OneDrive - Regis University/MSDS/680/MSDS680_ncg_S8W1_18/datasets/bank-full.csv',  delimiter=';')

### Input variables:
   **bank client data:**
   
    1 - age (numeric)
    2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student",
                                       "blue-collar","self-employed","retired","technician","services") 
    3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
    4 - education (categorical: "unknown","secondary","primary","tertiary")
    5 - default: has credit in default? (binary: "yes","no")
    6 - balance: average yearly balance, in euros (numeric) 
    7 - housing: has housing loan? (binary: "yes","no")
    8 - loan: has personal loan? (binary: "yes","no")
**related with the last contact of the current campaign:**

    9 - contact: contact communication type (categorical: "unknown","telephone","cellular") 
    10 - day: last contact day of the month (numeric)
    11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
    12 - duration: last contact duration, in seconds (numeric)
**other attributes:**

    13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
    14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
    15 - previous: number of contacts performed before this campaign and for this client (numeric)
    16 - poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

**Output variable (desired target):**

    17 - y - has the client subscribed a term deposit? (binary: "yes","no")


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
df.rename(columns={'y':'class'},inplace=True)

In [6]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'class'],
      dtype='object')

In [7]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
class        object
dtype: object

check the number of levels that each of the five categorical variables have.

In [8]:
cat_list = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome' ,'class']

# for cat in ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome' ,'class']:

[print(f"{cat}: \b {df[cat].unique().size:2.2f} ") for cat in cat_list]#.format(cat, df[cat].unique().size))

job: 12.00 
marital: 3.00 
education: 4.00 
default: 2.00 
housing: 2.00 
loan: 2.00 
contact: 3.00 
month: 12.00 
poutcome: 4.00 
class: 2.00 


[None, None, None, None, None, None, None, None, None, None]

In [9]:
for cat in ['contact', 'poutcome','class', 'marital', 'default', 'housing', 'loan']:
    print("Levels for catgeory '{0}': {1}".format(cat, df[cat].unique()))

Levels for catgeory 'contact': ['unknown' 'cellular' 'telephone']
Levels for catgeory 'poutcome': ['unknown' 'failure' 'other' 'success']
Levels for catgeory 'class': ['no' 'yes']
Levels for catgeory 'marital': ['married' 'single' 'divorced']
Levels for catgeory 'default': ['no' 'yes']
Levels for catgeory 'housing': ['yes' 'no']
Levels for catgeory 'loan': ['no' 'yes']


In [10]:
df['marital']  = df['marital'].map({'married':0,'single':1,'divorced':2,'unknown':3})
df['default']  = df['default'].map({'no':0,'yes':1,'unknown':2})
df['housing']  = df['housing'].map({'no':0,'yes':1,'unknown':2})
df['loan']     = df['loan'].map({'no':0,'yes':1,'unknown':2})
df['contact']  = df['contact'].map({'telephone':0,'cellular':1})
df['poutcome'] = df['poutcome'].map({'nonexistent':0,'failure':1,'success':2})
df['class']    = df['class'].map({'no':0,'yes':1})

In [11]:
df = df.fillna(-999)
pd.isnull(df).any()

age          False
job          False
marital      False
education    False
default      False
balance      False
housing      False
loan         False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
class        False
dtype: bool

For other categorical variables, we encode the levels as digits using Scikit-learn's MultiLabelBinarizer and treat them as new features.

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

job_Trans         = mlb.fit_transform([{str(val)} for val in df['job'].values])
education_Trans   = mlb.fit_transform([{str(val)} for val in df['education'].values])
month_Trans       = mlb.fit_transform([{str(val)} for val in df['month'].values])
# day_of_week_Trans = mlb.fit_transform([{str(val)} for val in df['day_of_week'].values])


In [13]:
df_new = df.drop(['marital','default','housing','loan','contact','poutcome','class','job','education','month'], axis=1)

In [14]:
df_new = np.hstack((df_new.values, job_Trans, education_Trans, month_Trans))


In [15]:
np.isnan(df_new).any()

False

Keeping in mind that the final dataset is in the form of a numpy array, we can check the number of features in the final dataset as follows.

In [16]:
df_new[0].size

35

Finally we store the class labels, which we need to predict, in a separate variable.

In [17]:
df_class = df['class'].values


# Data Analysis using TPOT

To begin our analysis, we need to divide our training data into training and validation sets. The validation set is just to give us an idea of the test set error. The model selection and tuning is entirely taken care of by TPOT, so if we want to, we can skip creating this validation set.


In [None]:
training_indices, validation_indices = training_indices, testing_indices = train_test_split(df.index, stratify = df_class, train_size=0.75, test_size=0.25)
training_indices.size, validation_indices.size

(33908, 11303)

After that, we proceed to calling the fit(), score() and export() functions on our training dataset. An important TPOT parameter to set is the number of generations (via the generations kwarg). Since our aim is to just illustrate the use of TPOT, we assume the default setting of 100 generations, whilst bounding the total running time via the max_time_mins kwarg (which may, essentially, override the former setting). Further, we enable control for the maximum amount of time allowed for optimization of a single pipeline, via max_eval_time_mins.

On a standard laptop with 4GB RAM, each generation takes approximately 5 minutes to run. Thus, for the default value of 100, without the explicit duration bound, the total run time could be roughly around 8 hours.


In [None]:
%%time
tpot = TPOTClassifier(max_eval_time_mins=1, population_size=20, verbosity=2, n_jobs=-1) #(verbosity=2, max_time_mins=20, max_eval_time_mins=0.1, population_size=15)
tpot.fit(df_new[training_indices], df_class[training_indices])

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=120), HTML(value='')))