# First Data Science Project




### Bank Direct Marketing:
- Business Objective: improve direct-marketing efficiency
- Situation: human agents contacting customers via telephone or internet online banking to offer long-term deposits
- Data Mining Goal: idenfity customer charateristics that affect the success of direct marketing


## Project Pipeline:

A project pipeline allows developer to build a roadmap from the development stage to the production stage. In this project, we are building the pipeline with the following steps:

1. Create the application and model codes on the Dev machine
2. Create repository on GitHub to store the codes
3. Launch the codes on Heroku server



In [1]:
# Import dependencies
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Load data
df = pd.read_csv("./bankData/bank.csv", header = None,
                names=['age', 'job', 'marital', 'education', 'default', 'balance', 
                      'housing', 'loan', 'contact', 'day', 'month', 'duration',
                      'campaign', 'pdays', 'previous', 'poutcome', 'y'])
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [3]:
# Drop campaign related columns
df.drop(df.iloc[:, 8:16], inplace = True, axis = 1)
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,y
0,30,unemployed,married,primary,no,1787,no,no,no
1,33,services,married,secondary,no,4789,yes,yes,no
2,35,management,single,tertiary,no,1350,yes,no,no
3,30,management,married,tertiary,no,1476,yes,yes,no
4,59,blue-collar,married,secondary,no,0,yes,no,no


In [4]:
# Extract numeric features 
numeric_data = df.iloc[:, [0, 5]].values
numeric_df = pd.DataFrame(numeric_data, dtype = object)
numeric_df.columns = ['age', 'balance']

In [5]:
# Standard scaling age
age_std_scale = StandardScaler()
numeric_df['age'] = age_std_scale.fit_transform(numeric_df[['age']])
#standard scaling balance
balance_std_scale = StandardScaler()
numeric_df['balance'] = balance_std_scale.fit_transform(numeric_df[['balance']])

numeric_df.head()

Unnamed: 0,age,balance
0,-1.05627,0.121072
1,-0.772583,1.118644
2,-0.583458,-0.024144
3,-1.05627,0.017726
4,1.686036,-0.472753


In [6]:
# Extract categoric features
X_categoric = df.iloc[:, [1,2,3,4,6,7]].values

In [7]:
# One hot encoding for categorical data
ohe = OneHotEncoder()
categoric_data = ohe.fit_transform(X_categoric).toarray()
categoric_df = pd.DataFrame(categoric_data)
categoric_df.columns = ohe.get_feature_names()
categoric_df.head()

Unnamed: 0,x0_admin.,x0_blue-collar,x0_entrepreneur,x0_housemaid,x0_management,x0_retired,x0_self-employed,x0_services,x0_student,x0_technician,...,x2_primary,x2_secondary,x2_tertiary,x2_unknown,x3_no,x3_yes,x4_no,x4_yes,x5_no,x5_yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [8]:
# Combine numeric and categorix
X_final = pd.concat([numeric_df, categoric_df], axis = 1)
X_final.head()

Unnamed: 0,age,balance,x0_admin.,x0_blue-collar,x0_entrepreneur,x0_housemaid,x0_management,x0_retired,x0_self-employed,x0_services,...,x2_primary,x2_secondary,x2_tertiary,x2_unknown,x3_no,x3_yes,x4_no,x4_yes,x5_no,x5_yes
0,-1.05627,0.121072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,-0.772583,1.118644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,-0.583458,-0.024144,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,-1.05627,0.017726,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,1.686036,-0.472753,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [9]:
# Test-Train Split
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size = 0.2, random_state = 42)

In [10]:
# Train model
rfc = RandomForestClassifier(n_estimators = 100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [11]:
# Predicting the test set results
y_hat = rfc.predict(X_test)

# Confusion Matrix
confusion_matrix(y_test, y_hat)

array([[786,  21],
       [ 94,   4]])

In [12]:
# Accuracy Score
accuracy_score(y_test, y_hat)

0.8729281767955801

In [13]:
# Saving model to disk
pickle.dump(rfc, open('classifier_model.pkl', "wb"))

In [14]:
# Loading model to compare the results
model = pickle.load(open('classifier_model.pkl', 'rb'))

# Avg predictors
avg = []

for i in X_test.mean():
    avg.append(i)

print(avg)

# Predict result
print(model.predict([avg]))

[-0.0225628812291069, 0.008867035736056747, 0.1292817679558011, 0.20883977900552486, 0.045303867403314914, 0.023204419889502764, 0.2165745856353591, 0.03756906077348066, 0.036464088397790057, 0.09171270718232044, 0.015469613259668509, 0.1580110497237569, 0.030939226519337018, 0.0066298342541436465, 0.11602209944751381, 0.6154696132596685, 0.26850828729281767, 0.14917127071823205, 0.5038674033149171, 0.3060773480662983, 0.04088397790055249, 0.9767955801104973, 0.023204419889502764, 0.4088397790055249, 0.5911602209944752, 0.8486187845303867, 0.15138121546961325]
['no']
