In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
%matplotlib inline

# Bank Marketing Data - A Decision Tree Approach

## Aim:
The aim of this attempt is to predict if the client will subscribe (yes/no) to a term deposit, by building a classification model using Decision Tree.
### Step 1: Load the data
- Load `bank.csv' data
- Check the first five observations
- Check if there are any null values

In [63]:
data = pd.read_csv('bank.csv')

In [64]:
data.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [65]:
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'deposit'],
      dtype='object')

In [66]:
#data.job=='retired'

In [67]:
data.shape

(11162, 17)

In [68]:
data.isnull().sum(axis=0)

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


## Summay of data

### Categorical Variables :
**[1] job      :** admin,technician, services, management, retired, blue-collar, unemployed, entrepreneur,
               housemaid, unknown, self-employed, student
<br>**[2] marital  :** married, single, divorced
<br>**[3] education:** secondary, tertiary, primary, unknown
<br>**[4] default  :** yes, no
<br>**[5] housing  :** yes, no
<br>**[6] loan     :** yes, no 
<br>**[7] deposit  :** yes, no ** (Dependent Variable)**
<br>**[8] contact  :** unknown, cellular, telephone
<br>**[9] month    :** jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec
<br>**[10] poutcome:** unknown, other, failure, success

### Numerical Variables:
**[1] age 
<br>[2] balance
<br>[3] day
<br>[4] duration
<br>[5] campaign
<br>[6] pdays
<br>[7] previous **

### Step 2: Transformer
- Create a trasnformer pipeline for numeric and categorical features. numerical features will be imputed and scaled. Categorical features will be imputed and encoded
- Create a Column transformer

In [18]:
data.dtypes=='object'

age          False
job           True
marital       True
education     True
default       True
balance      False
housing       True
loan          True
contact       True
day          False
month         True
duration     False
campaign     False
pdays        False
previous     False
poutcome      True
deposit       True
dtype: bool

In [19]:
data['default'].dtype

dtype('O')

In [69]:
num_features=[i for i in data.columns if data[i].dtype !='0']

In [70]:
num_features

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'deposit']

In [51]:
numerical_features

age           True
job          False
marital      False
education    False
default      False
balance       True
housing      False
loan         False
contact      False
day           True
month        False
duration      True
campaign      True
pdays         True
previous      True
poutcome     False
deposit      False
dtype: bool

In [96]:
cat_features=[i for i in data.columns if data [i].dtypes !='0']

In [23]:
#cat_features.remove('deposit_enc')

In [24]:
#cat_features.remove('deposit_enc2')

In [25]:
#numerical_features=data.dtypes!='object'

In [72]:
categorical_features=data.dtypes=='object'

In [73]:
categorical_features

age          False
job           True
marital       True
education     True
default       True
balance      False
housing       True
loan          True
contact       True
day          False
month         True
duration     False
campaign     False
pdays        False
previous     False
poutcome      True
deposit       True
dtype: bool

In [74]:
numerical_features=data.dtypes !='object'

In [75]:
num_transformer = Pipeline(steps=[
    ('imputer_n', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])



In [97]:
cat_transformer = Pipeline(steps=[
    ('imputer_c', SimpleImputer(strategy='most_frequent')),  
    ('ohe', OneHotEncoder(handle_unknown='ignore')) 
])

In [99]:
ct = ColumnTransformer(transformers = [
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

In [100]:
dt = DecisionTreeClassifier(max_depth=4, random_state=0)

In [101]:
mymodel=Pipeline(steps=[('prep',ct),('clt',dt)])

In [102]:
data.info

<bound method DataFrame.info of        age          job  marital  education default  balance housing loan  \
0       59       admin.  married  secondary      no     2343     yes   no   
1       56       admin.  married  secondary      no       45      no   no   
2       41   technician  married  secondary      no     1270     yes   no   
3       55     services  married  secondary      no     2476     yes   no   
4       54       admin.  married   tertiary      no      184      no   no   
...    ...          ...      ...        ...     ...      ...     ...  ...   
11157   33  blue-collar   single    primary      no        1     yes   no   
11158   39     services  married  secondary      no      733      no   no   
11159   32   technician   single  secondary      no       29      no   no   
11160   43   technician  married  secondary      no        0      no  yes   
11161   34   technician  married  secondary      no        0      no   no   

        contact  day month  duration  campa

In [103]:
np.unique(data.deposit,return_counts=1)

(array(['no', 'yes'], dtype=object), array([5873, 5289]))

In [104]:
le = LabelEncoder()

In [105]:
data['deposit_enc'] = le.fit_transform(data.deposit)

In [106]:
data.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit,deposit_enc,deposit_enc2
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes,1,1
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes,1,1
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes,1,1
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes,1,1
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes,1,1


In [107]:
data['deposit_enc2'] = data.deposit.map({'yes':1, 'no':0})

In [108]:
y=data.deposit_enc
X=data.drop(["deposit",'deposit_enc', 'deposit_enc2'], axis=1)

In [109]:
X.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42,stratify=y)


In [112]:
mymodel.fit(X_train, y_train)

ValueError: A given column is not a column of the dataframe

In [113]:
mymodel.score(X_train,y_train)

AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'

In [114]:
mymodel.score(X_test,y_test)

AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'

### Step 4: Model
- Create a pipeline for the decision tree classifier as well as the transformer
- Encode the target variable using `LabelEncoder`

In [115]:
X_train.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')

In [118]:
X.dtypes=='object'

age          False
job           True
marital       True
education     True
default       True
balance      False
housing       True
loan          True
contact       True
day          False
month         True
duration     False
campaign     False
pdays        False
previous     False
poutcome      True
dtype: bool

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X,y , random_state=42, test_size=0.2, stratify=y)

In [120]:
model.fit(X_train, y_train)

ValueError: A given column is not a column of the dataframe