# Import data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('bank-full.csv', sep = ';')

# Examine data

In [3]:
df.shape

(45211, 17)

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
df.describe(include='all')

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
unique,,12,3,4,2,,2,2,3,,12,,,,,4,2
top,,blue-collar,married,secondary,no,,yes,no,cellular,,may,,,,,unknown,no
freq,,9732,27214,23202,44396,,25130,37967,29285,,13766,,,,,36959,39922
mean,40.93621,,,,,1362.272058,,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
std,10.618762,,,,,3044.765829,,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
min,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
25%,33.0,,,,,72.0,,,,8.0,,103.0,1.0,-1.0,0.0,,
50%,39.0,,,,,448.0,,,,16.0,,180.0,2.0,-1.0,0.0,,
75%,48.0,,,,,1428.0,,,,21.0,,319.0,3.0,-1.0,0.0,,


In [6]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


**Observations**

 - There are no nulls.
 - The data is as expected from bank-names.txt

# Feature Engineering

In [7]:
# Get dummies dataframe on categorical data
job_dummies = pd.get_dummies(df.job)
print(job_dummies.columns)
job_dummies = job_dummies.add_prefix('job_')
job_dummies.drop('job_unknown', axis=1, inplace=True)
job_dummies.head()

Index(['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
       'retired', 'self-employed', 'services', 'student', 'technician',
       'unemployed', 'unknown'],
      dtype='object')


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed
0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0


In [8]:
marital_dummies = pd.get_dummies(df.marital)
print(marital_dummies.columns)
marital_dummies = marital_dummies.add_prefix('marital_')
marital_dummies.drop('marital_divorced', axis=1, inplace=True)
marital_dummies.head()

Index(['divorced', 'married', 'single'], dtype='object')


Unnamed: 0,marital_married,marital_single
0,1,0
1,0,1
2,1,0
3,1,0
4,0,1


In [9]:
education_dummies = pd.get_dummies(df.education)
print(education_dummies.columns)
education_dummies = education_dummies.add_prefix('education_')
education_dummies.drop('education_unknown', axis=1, inplace=True)
education_dummies.head()

Index(['primary', 'secondary', 'tertiary', 'unknown'], dtype='object')


Unnamed: 0,education_primary,education_secondary,education_tertiary
0,0,0,1
1,0,1,0
2,0,1,0
3,0,0,0
4,0,0,0


In [10]:
default_dummies = pd.get_dummies(df.default)
print(default_dummies.columns)
default_dummies = default_dummies.add_prefix('default_')
default_dummies.drop('default_no', axis=1, inplace=True)
default_dummies.head()

Index(['no', 'yes'], dtype='object')


Unnamed: 0,default_yes
0,0
1,0
2,0
3,0
4,0


In [11]:
loan_dummies = pd.get_dummies(df.loan)
print(loan_dummies.columns)
loan_dummies = loan_dummies.add_prefix('loan_')
loan_dummies.drop('loan_no', axis=1, inplace=True)
loan_dummies.head()

Index(['no', 'yes'], dtype='object')


Unnamed: 0,loan_yes
0,0
1,0
2,1
3,0
4,0


In [12]:
contact_dummies = pd.get_dummies(df.contact)
print(contact_dummies.columns)
contact_dummies = contact_dummies.add_prefix('contact_')
contact_dummies.drop('contact_unknown', axis=1, inplace=True)
contact_dummies.head()

Index(['cellular', 'telephone', 'unknown'], dtype='object')


Unnamed: 0,contact_cellular,contact_telephone
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [13]:
month_dummies = pd.get_dummies(df.month)
print(month_dummies.columns)
month_dummies = month_dummies.add_prefix('month_')
month_dummies.drop('month_dec', axis=1, inplace=True)
month_dummies.head()

Index(['apr', 'aug', 'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may', 'nov',
       'oct', 'sep'],
      dtype='object')


Unnamed: 0,month_apr,month_aug,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0


In [14]:
contact_dummies = pd.get_dummies(df.contact)
print(contact_dummies.columns)
contact_dummies = contact_dummies.add_prefix('contact_')
contact_dummies.drop('contact_unknown', axis=1, inplace=True)
contact_dummies.head()

Index(['cellular', 'telephone', 'unknown'], dtype='object')


Unnamed: 0,contact_cellular,contact_telephone
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [15]:
poutcome_dummies = pd.get_dummies(df.poutcome)
print(poutcome_dummies.columns)
poutcome_dummies = poutcome_dummies.add_prefix('poutcome_')
poutcome_dummies.drop('poutcome_unknown', axis=1, inplace=True)
poutcome_dummies.head()

Index(['failure', 'other', 'success', 'unknown'], dtype='object')


Unnamed: 0,poutcome_failure,poutcome_other,poutcome_success
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [16]:
# Combined DF of all categorical cols
categorical_cols = pd.concat([job_dummies, marital_dummies, education_dummies,
                              default_dummies, loan_dummies, contact_dummies, 
                             poutcome_dummies, month_dummies],
                            axis=1)
print(categorical_cols.shape)
categorical_cols.head()

(45211, 34)


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_aug,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [17]:
df = pd.concat([df, categorical_cols], axis=1)
print(df.shape)
df.head()

(45211, 51)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,month_aug,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,0,0,0,0,0,0,1,0,0,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0


In [18]:
print(df.dtypes)
print(df.columns)

age                     int64
job                    object
marital                object
education              object
default                object
balance                 int64
housing                object
loan                   object
contact                object
day                     int64
month                  object
duration                int64
campaign                int64
pdays                   int64
previous                int64
poutcome               object
y                      object
job_admin.              uint8
job_blue-collar         uint8
job_entrepreneur        uint8
job_housemaid           uint8
job_management          uint8
job_retired             uint8
job_self-employed       uint8
job_services            uint8
job_student             uint8
job_technician          uint8
job_unemployed          uint8
marital_married         uint8
marital_single          uint8
education_primary       uint8
education_secondary     uint8
education_tertiary      uint8
default_ye

In [19]:
X = df[['age', 'balance', 'day', 'duration', 'campaign', 'pdays',
       'previous', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'default_yes', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'poutcome_failure', 'poutcome_other', 'poutcome_success', 'month_apr',
       'month_aug', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep']]
X.dtypes

age                    int64
balance                int64
day                    int64
duration               int64
campaign               int64
pdays                  int64
previous               int64
job_admin.             uint8
job_blue-collar        uint8
job_entrepreneur       uint8
job_housemaid          uint8
job_management         uint8
job_retired            uint8
job_self-employed      uint8
job_services           uint8
job_student            uint8
job_technician         uint8
job_unemployed         uint8
marital_married        uint8
marital_single         uint8
education_primary      uint8
education_secondary    uint8
education_tertiary     uint8
default_yes            uint8
loan_yes               uint8
contact_cellular       uint8
contact_telephone      uint8
poutcome_failure       uint8
poutcome_other         uint8
poutcome_success       uint8
month_apr              uint8
month_aug              uint8
month_feb              uint8
month_jan              uint8
month_jul     

In [20]:
y = df['y']

# Model creation

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.20)

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

# Evaluation

In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [24]:
# Train data predictions
train_y_pred = rf.predict(X_train)

In [25]:
# Testing data predictions
test_y_pred = rf.predict(X_test)

In [26]:
print(accuracy_score(y_train, train_y_pred))
#print( precision_score(y_train, train_y_pred))
#print( recall_score(y_train, train_y_pred))
print(confusion_matrix(y_train, train_y_pred))

0.992507188675
[[31935    13]
 [  258  3962]]


In [27]:
print(accuracy_score(y_test, test_y_pred))
#print( precision_score(y_test, test_y_pred))
#print( recall_score(y_test, test_y_pred))
print(confusion_matrix(y_test, test_y_pred))

0.898595598806
[[7765  209]
 [ 708  361]]


In [29]:
# Feature importances
sorted(list ( zip(rf.feature_importances_.round(5), X_train.columns.tolist() ) ), reverse=True)

[(0.26425999999999999, 'duration'),
 (0.10401000000000001, 'balance'),
 (0.097670000000000007, 'age'),
 (0.088969999999999994, 'day'),
 (0.055140000000000002, 'pdays'),
 (0.04616, 'poutcome_success'),
 (0.03805, 'campaign'),
 (0.018880000000000001, 'previous'),
 (0.01465, 'month_mar'),
 (0.014409999999999999, 'poutcome_failure'),
 (0.0137, 'contact_cellular'),
 (0.013299999999999999, 'education_secondary'),
 (0.01184, 'month_jun'),
 (0.01183, 'education_tertiary'),
 (0.011650000000000001, 'marital_married'),
 (0.01094, 'job_technician'),
 (0.010869999999999999, 'poutcome_other'),
 (0.010829999999999999, 'month_may'),
 (0.010670000000000001, 'month_aug'),
 (0.01047, 'job_management'),
 (0.010330000000000001, 'month_apr'),
 (0.010240000000000001, 'marital_single'),
 (0.0099100000000000004, 'month_oct'),
 (0.0094199999999999996, 'loan_yes'),
 (0.0088299999999999993, 'month_jul'),
 (0.0085400000000000007, 'month_nov'),
 (0.0083199999999999993, 'job_admin.'),
 (0.0082100000000000003, 'month

# Save model

In [31]:
import pickle
with open('../bank-marketing-sklearn-rf.pkl', 'wb') as fid:
    pickle.dump(rf, fid, 2)

# Outcome data

In [32]:
(df.head(1))

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,month_aug,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0


In [33]:
input_df = pd.DataFrame()

In [90]:
col_names = ['age', 'job', 'marital',
             'education', 'default', 'balance', 
             'housing', 'loan', 'contact',
             'day', 'month', 'duration',
             'campaign', 'pdays', 'previous', 
             'poutcome']
input_vec = [58, 'management', 'married', 'tertiary', 'no', 2143, 
            'yes', 'no', 'unknown', 5, 'may', 261, 1, -1, 0, 'unknown']

In [91]:
input_df = pd.DataFrame(input_vec, col_names)

In [92]:
input_df = input_df.transpose()

In [93]:
job_list = ['job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 
          'job_management', 'job_retired',
       'job_self-employed', 'job_services', 
          'job_student', 'job_technician',
       'job_unemployed'] 
for job in job_list:
    input_df[job] = 0

In [94]:
marital_list = ['marital_divorced', 'marital_married', 'marital_single']
for marital in marital_list:
    input_df[marital] = 0

In [95]:
edu_list = ['education_primary', 'education_secondary', 'education_tertiary']
for edu in edu_list:
    input_df[edu] = 0

In [96]:
def_list = ['default_yes']
for default in def_list:
    input_df[default] = 0

In [97]:
loan_list = ['loan_yes']
for loan in loan_list:
    input_df[loan] = 0

In [98]:
cont_list = [ 'contact_cellular', 'contact_telephone']
for cont in cont_list:
    input_df[cont] = 0

In [99]:
pout_list = ['poutcome_failure', 'poutcome_other', 'poutcome_success']
for pout in pout_list:
    input_df[pout] = 0

In [100]:
month = ['month_apr', 'month_aug', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep']
for month in month_list:
    input_df[month] = 0

In [101]:
def determine_marital(input_df):
    if input_df['marital'][0]=="divorced":
        input_df['marital_divorced'] = 1 
    elif input_df['marital'][0]=="married":
        input_df['marital_married'] = 1
    elif input_df['marital'][0]=="single":
        input_df['marital_single']= 1
    else:
        pass

In [102]:
def determine_job(input_df):
    if input_df['job'][0]=="admin.":
        input_df['job_admin.'] = 1 
    elif input_df['job'][0]=="unemployed":
        input_df['job_unemployed'] = 1
    elif input_df['job'][0]=="management":
        input_df['job_management']= 1
    elif input_df['job'][0]=="housemaid":
        input_df['job_housemaid']=1
    elif input_df['job'][0]=="entrepreneur":
        input_df['job_entrepreneur']=1 
    elif input_df['job'][0]=="student":
        input_df['job_student']=1 
    elif input_df['job'][0]=="blue-collar":
        input_df['job_blue-collar']=1 
    elif input_df['job'][0]=="self-employed":
        input_df['job_self-employed']=1 
    elif input_df['job'][0]=="retired":
        input_df['job_retired']=1 
    elif input_df['job'][0]=="technician":
        input_df['job_technician']=1 
    elif input_df['job'][0]=="services":
        input_df[ 'job_services']=1 
    else:
        pass

In [103]:
#'education_primary', 'education_secondary', 'education_tertiary'
def determine_edu(input_df):
    if input_df['job'][0]=="primary":
        input_df['education_primary'] = 1 
    elif input_df['job'][0]=="secondary":
        input_df['education_secondary'] = 1
    elif input_df['job'][0]=="tertiary":
        input_df['education_tertiary']= 1
    else:
        pass

In [104]:

def determine_default(input_df):
    if input_df['default'][0]=="yes":
        input_df['def_yes'] = 1 
    else:
        pass

In [105]:

def determine_loan(input_df):
    if input_df['loan'][0]=="yes":
        input_df['loan_yes'] = 1 
    else:
        pass

In [106]:
# 'contact_cellular', 'contact_telephone'
def determine_contact(input_df):
    if input_df['contact'][0]=="cellular":
        input_df['contact_cellular'] = 1 
    elif input_df['contact'][0]=="telephone":
        input_df['contact_telephone'] = 1 
    else:
        pass

In [107]:
# 'poutcome_failure', 'poutcome_other', 'poutcome_success'
def determine_poutcome(input_df):
    if input_df['poutcome'][0]=="failure":
        input_df['poutcome_failure'] = 1 
    elif input_df['poutcome'][0]=="other":
        input_df['poutcome_other'] = 1 
    elif input_df['poutcome'][0]=="success":
        input_df['poutcome_success'] = 1 
    else:
        pass

In [108]:

def determine_month(input_df):
    if input_df['month'][0]=="aug":
        input_df['month_aug'] = 1 
    elif input_df['month'][0]=="feb":
        input_df['month_feb'] = 1 
    elif input_df['month'][0]=="jan":
        input_df['month_jan'] = 1 
        
    elif input_df['month'][0]=="jul":
        input_df['month_jul'] = 1 
    elif input_df['month'][0]=="jun":
        input_df['month_jun'] = 1 
    elif input_df['month'][0]=="mar":
        input_df['month_mar'] = 1 
        
    elif input_df['month'][0]=="may":
        input_df['month_may'] = 1 
    elif input_df['month'][0]=="nov":
        input_df['month_nov'] = 1 
    elif input_df['month'][0]=="oct":
        input_df['month_oct'] = 1 
        
    elif input_df['month'][0]=="sep":
        input_df['month_sep'] = 1 
    elif input_df['month'][0]=="apr":
        input_df['month_apr'] = 1 
    else:
        pass

In [110]:
input_df = input_df[['age', 'balance', 'day', 'duration', 'campaign', 'pdays',
       'previous', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'default_yes', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'poutcome_failure', 'poutcome_other', 'poutcome_success', 'month_apr',
       'month_aug', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep']]

In [114]:
pkl_file = open('rfmodel.pkl', 'rb')
rf = pickle.load(pkl_file)
prediction = rf.predict(input_df.values)

In [116]:
prediction[0]

'no'