# MLP -  Bank Marketing for Term Deposit | Adult Census – Income Prediction


## First Case Study: Bank Marketing Data Set
The data relates to direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict whether the client will subscribe to a term deposit (variable y).

URL: https://archive.ics.uci.edu/ml/datasets/bank+marketing

In [None]:
# Step 1: Import the necessary modules
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np
import pandas as pd
import math
import datetime
import platform
from sklearn import *


The data set is distributed as a file, so I downloaded it and uploaded it to the Colab environment.

In [None]:
# Step 2: Import the Bank Marketing Data Set
# The file is parsed with ';' as the field separator (sep=";").
df = pd.read_csv("/content/bank-full.csv",sep=";")
# Print column dtypes and non-null counts, then display the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


Check the class distribution of the target: the share of clients who subscribed to a term deposit.

In [None]:
# Relative frequency of each target class: per-class count divided by the number of rows.
df.y.value_counts()/len(df.y)

no     0.883015
yes    0.116985
Name: y, dtype: float64

The data is imbalanced: only about 12% of the clients subscribed.


In [None]:
# Feature frame: every column of df except the target "y" (df itself is left unchanged).
X = df.drop(columns=["y"])

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each variable so the indicator columns are not redundant.
X = pd.get_dummies(X, drop_first=True)
# Binary target as a NumPy array: 1 where y == "yes", 0 otherwise.
y = (df["y"] == "yes").astype(int).to_numpy()

Data split between train (70%) and test (30%)

In [None]:
# Step 3: Split the data set into training (70%) and test (30%) subsets
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Standardize features by removing the mean and scaling to unit variance.
# The statistics are learned from the training split only and reused on the test split.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Convert the integer class vectors into binary class matrices (one column per class).
y_train, y_test = (
    tf.keras.utils.to_categorical(labels) for labels in (y_train, y_test)
)

In [None]:
# Step 4: Build the neural network
n_x = X_train.shape[1]  # number of input columns in the training matrix
num_classes = 2
batch_size = 16
epochs = 10

# Two hidden ReLU layers of 200 units, each followed by dropout with rate 0.1,
# then a softmax output with one unit per class.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(200, activation='relu', input_shape=(n_x,)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(200, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 200)               8600      
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 dense_4 (Dense)             (None, 200)               40200     
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense_5 (Dense)             (None, 2)                 402       
                                                                 
Total params: 49,202
Trainable params: 49,202
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Step 5: Compile the model using the Adam optimizer
# Compilation fixes the optimizer, the loss function and the reported metrics,
# i.e. the settings used by backpropagation during training.
# The targets are one-hot encoded, hence the categorical loss and accuracy.

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['categorical_accuracy'],
)

In [None]:
# Step 6: Train the model
# In this step, we train the model with the backpropagation algorithm by feeding the inputs through the model.
# NOTE(review): validation_data is the test split, so the per-epoch validation
# metrics are computed on the same data that Step 7 evaluates.
history = model.fit(X_train_std, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test_std, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Step 7: Evaluate the model
# Check the performance of the model on the test data set.
# evaluate() returns the loss followed by the compiled metric.

score = model.evaluate(X_test_std, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 0.20584268867969513
Test accuracy: 0.9066647291183472


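## Test accuracy on the Bank Marketing data set (predicting whether a client will subscribe to a term deposit) is about 90%

Note that about 88% of the clients did not subscribe, so always predicting "no" would already reach roughly 88% accuracy.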

## Second Case Study: Adult Data Set

 Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset.

URL: https://archive.ics.uci.edu/ml/datasets/Adult


The data set is distributed as a file, so I downloaded it and uploaded it to the Colab environment.

In [None]:
# Step 2: Import the Adult (Census Income) Data Set
# Column names are passed explicitly via names=cols.
cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education.num',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
    'native.country',
    'income',
]
# skipinitialspace=True strips the blank that follows each comma delimiter.
df2 = pd.read_csv("/content/adult.data", names=cols, skipinitialspace=True)
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
#EDA
# Check for Null Data
# Counts NaN/None entries per column.
# NOTE(review): a later cell replaces '?' values in workclass/occupation, so
# missing entries appear to be encoded as the string '?' and would not be
# counted by isnull() — confirm against the raw file.
df2.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [None]:
# Missing entries in this data set are written as the literal string '?', which
# pandas does not treat as NA. Convert them to NaN so that the isnull()/fillna()
# steps in the following cells actually see and handle them.
# (The previous df2.fillna(np.nan) replaced NaN with NaN and had no effect.)
df2 = df2.replace("?", np.nan)
df2

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
# Show the dtype of every column (int64 for numeric columns, object for text columns).
df2.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object

In [None]:
# Encode the target as an integer: 0 for '<=50K', 1 for '>50K'.
# The keys with a trailing '.' cover an alternative spelling of the labels —
# presumably the one used in the data set's separate test file; TODO confirm.
df2['income']=df2['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})
df2

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


In [None]:
# Fill NaN entries column by column: "X" marks an unknown workclass or
# occupation, and an unknown native country defaults to "United-States".
# These fills only take effect where the columns actually contain NaN.
df2.fillna(
    value={"workclass": "X", "occupation": "X", "native.country": "United-States"},
    inplace=True,
)

# Confirm All Missing Data is Handled
df2.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [None]:
# Encode sex as a binary flag: Male -> 0, Female -> 1.
df2["sex"] = df2["sex"].map({"Male": 0, "Female": 1})

# Collapse marital.status into a binary "married" flag in a single mapping:
# the single categories (and a literal 'Single') -> 0,
# the married categories (and a literal 'Married') -> 1.
df2["marital.status"] = (
    df2["marital.status"]
    .map({
        **dict.fromkeys(['Never-married', 'Divorced', 'Separated', 'Widowed', 'Single'], 0),
        **dict.fromkeys(['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse', 'Married'], 1),
    })
    .astype(int)
)

In [None]:
# Columns fed to the model as plain numeric inputs. marital.status and sex are
# included here because they were encoded as 0/1 integers above.
numeric_features = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
    'marital.status',
    'sex',
]

# Columns that stay categorical and are one-hot encoded later.
cat_features = [
    'education',
    'relationship',
    'race',
    'native.country',
]

In [None]:
# Step 3: Split the Dataset into training, validation and test datasets
# First hold out 20% as the test set, then hold out 20% of the remainder as the
# validation set. random_state is pinned so that the split — and therefore the
# reported metrics — are reproducible, consistent with the first case study.
train, test = model_selection.train_test_split(df2, test_size=0.2, random_state=1)
train, val = model_selection.train_test_split(train, test_size=0.2, random_state=1)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

20838 train examples
5210 validation examples
6513 test examples


In [None]:
# Start the feature-column list with one numeric column per numeric feature.
feature_columns = [
    tf.feature_column.numeric_column(name) for name in numeric_features
]

feature_columns

[NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='fnlwgt', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='education.num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='capital.gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='capital.loss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='hours.per.week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='marital.status', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='sex', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [None]:
# Replace the '?' placeholder with 'X' in both workclass and occupation at once.
df2[["workclass", "occupation"]] = df2[["workclass", "occupation"]].replace('?', 'X')
df2.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,0,Adm-clerical,Not-in-family,White,0,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,1,Exec-managerial,Husband,White,0,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,0,Handlers-cleaners,Not-in-family,White,0,0,0,40,United-States,0
3,53,Private,234721,11th,7,1,Handlers-cleaners,Husband,Black,0,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,1,Prof-specialty,Wife,Black,1,0,0,40,Cuba,0


In [None]:
# workclass and occupation are not part of the numeric or categorical feature
# lists, so remove them from df2 in place.
df2.drop(columns=["workclass", "occupation"], inplace=True)
print('Dataset with Dropped Labels')
print(df2.head())

Dataset with Dropped Labels
   age  fnlwgt  education  education.num  marital.status   relationship  \
0   39   77516  Bachelors             13               0  Not-in-family   
1   50   83311  Bachelors             13               1        Husband   
2   38  215646    HS-grad              9               0  Not-in-family   
3   53  234721       11th              7               1        Husband   
4   28  338409  Bachelors             13               1           Wife   

    race  sex  capital.gain  capital.loss  hours.per.week native.country  \
0  White    0          2174             0              40  United-States   
1  White    0             0             0              13  United-States   
2  White    0             0             0              40  United-States   
3  Black    0             0             0              40  United-States   
4  Black    1             0             0              40           Cuba   

   income  
0       0  
1       0  
2       0  
3       0  
4   

In [None]:
# For every categorical feature, build a vocabulary column from the distinct
# values found in df2 and wrap it in an indicator (one-hot) column.
feature_columns.extend(
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            name, df2[name].unique()
        )
    )
    for name in cat_features
)

print(feature_columns)

[NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='fnlwgt', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='education.num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='capital.gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='capital.loss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='hours.per.week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='marital.status', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='sex', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='education', vocabulary_list=('Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Build a batched tf.data.Dataset of (features, label) pairs from a DataFrame.

  The 'income' column is used as the label; all remaining columns are passed
  as a dict keyed by column name. The input DataFrame is not modified.

  Args:
    dataframe: DataFrame that contains an 'income' column.
    shuffle: If true, shuffle with a buffer as large as the DataFrame.
    batch_size: Number of rows per batch.

  Returns:
    The batched dataset.
  """
  features = dataframe.copy()  # work on a copy so pop() leaves the caller's frame intact
  target = features.pop('income')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
  if shuffle:
    dataset = dataset.shuffle(buffer_size=len(features))
  return dataset.batch(batch_size)

In [None]:
# Wrap each split in a batched dataset. Only the training data is shuffled
# (df_to_dataset shuffles by default; validation and test pass shuffle=False).
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# Input layer built from feature_columns (the numeric and indicator columns defined above);
# it is used as the first layer of the model.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
# Step 4: Build the neural network
# Feature layer, two hidden ReLU layers of 128 units, and a single output unit
# without activation (the loss below is configured with from_logits=True).
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [None]:
# Step 5: Compile the model using adam optimizer
# The compilation process defines the optimizer, the loss function and the metrics
# Here, we set the parameters of the backpropagation algorithm

# The loss uses from_logits=True, i.e. the model output is a raw logit rather
# than a probability. The string shortcut 'accuracy' thresholds the output at
# 0.5 as if it were a probability; for logits the decision boundary is 0, so an
# explicit BinaryAccuracy(threshold=0.0) is used instead. name='accuracy' keeps
# the metric key in the training logs and history unchanged.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])


In [None]:
# Step 6: Train the model
# In this step, we train the model with the backpropagation algorithm by feeding the inputs through the model.
# Trains for 25 epochs on train_ds, using val_ds as the validation data.
model.fit(train_ds,
          validation_data=val_ds,
          epochs=25)

Epoch 1/25








Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f2c7c479750>

In [None]:
# Evaluate on the test batches; the result is unpacked into the loss and the
# single metric configured in compile().
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.7899585366249084


## Test accuracy on the Adult (census income) data set is about 79%