In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
source_data='source_data/'
data_save_path='data_with_id/'
split_path='Train_Test_Split/'
model_and_scaler='model_and_scaler/'
ML_results='ML_results/'
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
data=pd.read_csv(source_data+'bank-full.csv', sep = ';')
data.shape

(45211, 17)

In [3]:
# #DATA DICT (http://archive.ics.uci.edu/ml/datasets/Bank+Marketing#)
# Attribute Information:

# Input variables:
# # bank client data:
# 1 - age (numeric)
# 2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
# 3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
# 4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
# 5 - default: has credit in default? (categorical: 'no','yes','unknown')
# 6 - housing: has housing loan? (categorical: 'no','yes','unknown')
# 7 - loan: has personal loan? (categorical: 'no','yes','unknown')
# # related with the last contact of the current campaign:
# 8 - contact: contact communication type (categorical: 'cellular','telephone')
# 9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
# 10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
# 11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# # other attributes:
# 12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
# 13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
# 14 - previous: number of contacts performed before this campaign and for this client (numeric)
# 15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
# # social and economic context attributes
# 16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
# 17 - cons.price.idx: consumer price index - monthly indicator (numeric)
# 18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)
# 19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
# 20 - nr.employed: number of employees - quarterly indicator (numeric)

# Output variable (desired target):
# 21 - y - has the client subscribed a term deposit? (binary: 'yes','no')



In [4]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
data.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [6]:
data['id'] = data.index
data['id'].nunique()
data_copy=data.copy()


data_to_save=data_copy.loc[:, data_copy.columns != 'y']
data_to_save.to_csv(data_save_path+'data_with_id.csv')
with open(data_save_path+'target.pickle', 'wb') as handle:
    pickle.dump(data_copy['y'], handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
y=data_copy['y']
X=data_copy.loc[:, data_copy.columns != "y"]
Id=data_copy['id']

In [8]:
obj_cols_list=[]
for col in X.columns:
    if (X[col].dtypes == object):
        obj_cols_list.append(col)

In [9]:
obj_cols_list

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [10]:
X = pd.get_dummies(X, columns = obj_cols_list)
X

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,id,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,contact_unknown,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0,45206,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
45207,71,1729,17,456,2,-1,0,45207,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
45208,72,5715,17,1127,5,184,3,45208,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
45209,57,668,17,508,4,-1,0,45209,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [11]:
X.shape

(45211, 52)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.30, random_state=5)

X_train.to_csv(split_path+'X_train.csv')
y_train.to_csv(split_path+'y_train.csv')
X_test.to_csv(split_path+'X_test.csv')
y_test.to_csv(split_path+'y_test.csv')

X_train_before_scaler=X_train.copy()
X_test_before_scaler=X_test.copy()
y_train_before_scaler=y_train.copy()
y_test_before_scaler=y_test.copy()

In [13]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(31647, 52) (13564, 52) (31647,) (13564,)


In [14]:
y_train=np.asarray(y_train)
y_train=y_train.reshape(-1,1)  # had to reshape as it is 1D and it can't be transformed like that

y_test=np.asarray(y_test)
y_test=y_test.reshape(-1,1)  # had to reshape as it is 1D and it can't be transformed like that

scaler_train = StandardScaler().fit(X_train)
with open(model_and_scaler+'scaler.pickle', 'wb') as handle:
    pickle.dump(scaler_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
# scaler_y = StandardScaler().fit(y_train)

X_train = scaler_train.transform(X_train)
# y_train = scaler_y.transform(y_train)

X_test = scaler_train.transform(X_test)
# y_test = scaler_y.transform(y_test)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(X_train, y_train)

pred_y_test=neigh.predict(X_test)
pred_y_train=neigh.predict(X_train)

print(accuracy_score(y_train, pred_y_train)) 
print(accuracy_score(y_test, pred_y_test)) 
print(confusion_matrix(y_test, pred_y_test))

  return self._fit(X, y)


0.9106392391063924
0.8950899439693306
[[11651   314]
 [ 1109   490]]


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
RF = RandomForestClassifier(n_estimators=2000)
RF.fit(X_train, y_train)

with open(model_and_scaler+'RF_model.pickle', 'wb') as handle:
    pickle.dump(RF, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
pred_y_test=RF.predict(X_test)
pred_y_train=RF.predict(X_train)

print(accuracy_score(y_train, pred_y_train)) 
print(accuracy_score(y_test, pred_y_test)) 
print(confusion_matrix(y_test, pred_y_test))

  RF.fit(X_train, y_train)


1.0
0.9143320554408729
[[11634   331]
 [  831   768]]


In [17]:
train_pred=X_train_before_scaler.copy()
train_pred['predicted']=pred_y_train
test_pred=X_test_before_scaler.copy()
test_pred['predicted']=pred_y_test

train_pred.to_csv(ML_results+'train_pred.csv')
test_pred.to_csv(ML_results+'test_pred.csv')

In [25]:
# compress model file for GIT upload
model_open=pd.read_pickle(model_and_scaler+'RF_model.pickle')
import bz2file as bz2
import pickle
with bz2.BZ2File(model_and_scaler+ 'RF_model_compressed.pbz2', 'w') as f:
    pickle.dump(model_open, f)
        

In [26]:
#decompress the model file
ml_mod = bz2.BZ2File(model_and_scaler+ 'RF_model_compressed.pbz2', 'rb')
ml_mod = pickle.load(ml_mod)
print(ml_mod)

RandomForestClassifier(n_estimators=2000)
