<a href="https://colab.research.google.com/github/ShawnLiu119/CapstoneProject_BankingMarketing/blob/main/MobileApp/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd 
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import imblearn

In [2]:
# Load the raw bank-marketing CSV (semicolon-delimited) straight from the project repository.
url = 'https://raw.githubusercontent.com/ShawnLiu119/CapstoneProject_BankingMarketing/main/rawdata/bank-additional-full.csv'
data = pd.read_csv(filepath_or_buffer=url, sep=';')

### Data Transformation - Feature Engineering

In [3]:
# Drop the 'illiterate' education rows and take an explicit copy of the result.
# Copying *after* the filter leaves the raw frame `data` untouched and avoids
# pandas' SettingWithCopyWarning when the edu_new column is added below.
# (A mid-cell `data['education'].value_counts()` used to sit here; its result
# was never displayed, so it has been removed.)
data_1 = data[data['education'] != 'illiterate'].copy()

# Consolidate the remaining education levels into four coarser groups.
education_groups = {
    'basic.4y': 'pre_high',
    'basic.9y': 'pre_high',
    'basic.6y': 'pre_high',
    'university.degree': 'post_high',
    'professional.course': 'post_high',
    'high.school': 'high',
    'unknown': 'unknown',
}
data_1['edu_new'] = data_1['education'].map(education_groups)

In [4]:
def map_job(row):
    """Collapse the detailed ``job`` value of a row into a broad employment group.

    Parameters
    ----------
    row : mapping-like
        Anything indexable with ``'job'`` (e.g. a DataFrame row).

    Returns
    -------
    str
        ``'white_collar'``, ``'blue_collar'``, ``'not_working'``, or
        ``'unknown'`` when the job matches none of the listed groups.
    """
    job_groups = (
        ('white_collar', ('admin.', 'management', 'entrepreneur', 'self-employed')),
        ('blue_collar', ('housemaid', 'blue-collar', 'technician', 'services')),
        ('not_working', ('student', 'retired', 'unemployed')),
    )
    job = row['job']
    for group, members in job_groups:
        if job in members:
            return group
    return 'unknown'

# Derive the broad job group row by row, then check the group sizes (NaN counted too).
data_1['job_new'] = data_1.apply(map_job, axis='columns')
data_1['job_new'].value_counts(dropna=False)

blue_collar     21017
white_collar    16217
not_working      3606
unknown           330
Name: job_new, dtype: int64

In [5]:
def map_month(row):
    """Translate the ``month`` value of a row into a season label.

    Parameters
    ----------
    row : mapping-like
        Anything indexable with ``'month'`` (e.g. a DataFrame row).

    Returns
    -------
    str
        ``'spring'``, ``'summer'`` or ``'fall'`` for the listed month
        abbreviations; every other value falls through to ``'winter'``.
    """
    seasons = (
        ('spring', ('mar', 'apr', 'may')),
        ('summer', ('jun', 'jul', 'aug')),
        ('fall', ('sep', 'oct', 'nov')),
    )
    month = row['month']
    for season, months in seasons:
        if month in months:
            return season
    return 'winter'

# Derive the season row by row, then check the group sizes (NaN counted too).
data_1['season'] = data_1.apply(map_month, axis='columns')
data_1['season'].value_counts(dropna=False)

summer    18660
spring    16942
fall       5386
winter      182
Name: season, dtype: int64

In [6]:
# List the columns to confirm the engineered ones (edu_new, job_new, season) were added.
data_1.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'edu_new', 'job_new',
       'season'],
      dtype='object')

In [7]:
# Transformed frame agreed in the group discussion on 7/23:
# - keep all numerical features as they are
# - consolidate categorical features (education, job, season) and drop the original ones
kept_columns = [
    'age', 'marital', 'default', 'housing', 'loan',
    'contact', 'day_of_week', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
    'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
    'edu_new', 'job_new', 'season',
]
df_tran_1 = data_1[kept_columns]

In [8]:
df_tran_1.shape
# The row count is lower than in the raw file because the rows with
# education == 'illiterate' were removed earlier.

(41170, 21)

In [9]:
# Remove four features from the modelling frame and show what remains.
# NOTE(review): re-running this cell without re-creating df_tran_1 raises a KeyError,
# because the columns are already gone.
dropped_features = ['duration', 'contact', 'day_of_week', 'season']
df_tran_1 = df_tran_1.drop(columns=dropped_features)
df_tran_1.columns

Index(['age', 'marital', 'default', 'housing', 'loan', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'edu_new', 'job_new'],
      dtype='object')

In [10]:
# Inspect how the values of the 'campaign' column are distributed.
df_tran_1['campaign'].value_counts()

1     17634
2     10567
3      5337
4      2649
5      1599
6       979
7       628
8       400
9       283
10      225
11      177
12      125
13       92
14       69
17       58
16       51
15       51
18       33
20       30
19       26
21       24
22       17
23       16
24       15
27       11
29       10
28        8
26        8
25        8
31        7
30        7
35        5
32        4
33        4
34        3
42        2
40        2
43        2
56        1
39        1
41        1
37        1
Name: campaign, dtype: int64

### Ordinal Encoding For Categorical Features

In [11]:
# Preview the first rows before the categorical columns are encoded.
df_tran_1.head()

Unnamed: 0,age,marital,default,housing,loan,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,edu_new,job_new
0,56,married,no,no,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,pre_high,blue_collar
1,57,married,unknown,no,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,high,blue_collar
2,37,married,no,yes,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,high,blue_collar
3,40,married,no,no,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,pre_high,white_collar
4,56,married,no,no,yes,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,high,blue_collar


In [12]:
def _encode_target(label):
    """Map the raw target label to an integer: 'no' -> 0, 'yes' -> 1, anything else -> -1."""
    if label == 'no':
        return 0
    return 1 if label == 'yes' else -1


# Binary target vector, followed by its class balance.
y = df_tran_1['y'].apply(_encode_target)
y.value_counts()

0    36534
1     4636
Name: y, dtype: int64

In [13]:
# Feature frame: everything except the target column.
x = df_tran_1.drop(columns=['y'])

In [14]:
# Display the feature frame (target column already removed).
x

Unnamed: 0,age,marital,default,housing,loan,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,edu_new,job_new
0,56,married,no,no,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,pre_high,blue_collar
1,57,married,unknown,no,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,high,blue_collar
2,37,married,no,yes,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,high,blue_collar
3,40,married,no,no,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,pre_high,white_collar
4,56,married,no,no,yes,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,high,blue_collar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,married,no,yes,no,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,post_high,not_working
41184,46,married,no,no,no,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,post_high,blue_collar
41185,56,married,no,yes,no,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,post_high,not_working
41186,44,married,no,no,no,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,post_high,blue_collar


In [15]:
# Check which categories 'poutcome' contains before it is encoded.
x['poutcome'].value_counts()

nonexistent    35547
failure         4251
success         1372
Name: poutcome, dtype: int64

In [16]:
# NOTE(review): the original note here was ambiguous about whether one-hot encoding
# (which yields a sparse matrix) suits a neural network; this cell tries ordinal
# encoding instead.
# NOTE(review): the cell overwrites columns of x in place, so running it twice
# re-encodes the already-encoded integers; re-create x first.

def _encode_yes_no(value):
    """Return 0 for 'no', 1 for 'yes' and -1 for any other value."""
    if value == 'no':
        return 0
    return 1 if value == 'yes' else -1


for flag_column in ('housing', 'default', 'loan'):
    x[flag_column] = x[flag_column].apply(_encode_yes_no)

# A successful previous campaign gets the highest code; any value that is neither
# 'failure' nor 'success' sits in between.
poutcome_codes = {'failure': 0, 'success': 2}
x['poutcome'] = x['poutcome'].apply(lambda t: poutcome_codes.get(t, 1))

# Married -> 1, every other marital status -> 0.
x['marital'] = x['marital'].apply(lambda t: int(t == 'married'))

# Unlisted categories (e.g. 'unknown') fall back to 0.
edu_codes = {'post_high': 3, 'high': 2, 'pre_high': 1}
x['edu_new'] = x['edu_new'].apply(lambda t: edu_codes.get(t, 0))

job_codes = {'blue_collar': 3, 'white_collar': 2, 'not_working': 1}
x['job_new'] = x['job_new'].apply(lambda t: job_codes.get(t, 0))

x.head()

Unnamed: 0,age,marital,default,housing,loan,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,edu_new,job_new
0,56,1,0,0,0,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,1,3
1,57,1,-1,0,0,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,2,3
2,37,1,0,1,0,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,2,3
3,40,1,0,0,0,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,1,2
4,56,1,0,0,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,2,3


### Train-Test Split & Data Normalization

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the encoded frame, then rescale it with them.
# NOTE(review): the scaler sees every row of x here, including rows that later
# end up in the test split.
scaler = MinMaxScaler().fit(x)
x_mm = scaler.transform(x)

In [18]:
# Stratified sampling keeps the yes/no ratio equal in both splits, since the raw
# data is imbalanced.
# The *unscaled* features are split first so the scaler can be fitted on the
# training rows only; fitting it on the full dataset would leak the test set's
# min/max into training. The same random_state and stratify vector are used, so
# the rows assigned to each split do not change.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# Fit on train, then apply the identical transformation to both splits.
# Test values may fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

print(x_train.shape,
      x_test.shape,
      y_train.shape,
      y_test.shape)

(32936, 16) (8234, 16) (32936,) (8234,)


### Balancing the data using SMOTE

https://towardsdatascience.com/all-about-imbalanced-machine-learning-classifiers-60563014d2b3


In [19]:
import imblearn

# Oversample the minority class on the training split only; the test split keeps
# its original class ratio. A fixed random_state makes the synthetic samples (and
# therefore the fitted model) reproducible across kernel restarts.
smote = SMOTE(random_state=42)

x_s, y_s = smote.fit_resample(x_train, y_train)

print('Original dataset shape', len(x_train))
print('Resampled dataset shape', len(x_s))

Original dataset shape 32936
Resampled dataset shape 58454


In [20]:
# Fit a logistic-regression classifier on the SMOTE-balanced training data.
lg = LogisticRegression(solver='lbfgs', random_state=100)
lg.fit(X=x_s, y=y_s)

LogisticRegression(random_state=100)

In [27]:
# Class probabilities for every test row: one row per customer, one column per class.
preds_lg = lg.predict_proba(x_test)
message = 'the possibility of this customer is likely to subscribe term deposit is {}'
print(message.format(preds_lg))

the possibility of this customer is likely to subscribe term deposit is [[0.69712554 0.30287446]
 [0.66892029 0.33107971]
 [0.26777266 0.73222734]
 ...
 [0.6111805  0.3888195 ]
 [0.61539964 0.38460036]
 [0.719772   0.280228  ]]


In [26]:
# Predicted class labels for the test rows (as opposed to the probabilities from predict_proba).
preds_lg1 = lg.predict(x_test)
preds_lg1

array([0, 0, 1, ..., 0, 0, 0])

In [22]:
#save the model in pickle format
import pickle

# The context manager closes (and flushes) the file as soon as the dump finishes,
# so model.pkl is complete on disk before anything else reads it.
# NOTE(review): only the classifier is saved; whoever loads model.pkl presumably
# has to reproduce the same encoding and min-max scaling on its inputs - confirm.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lg, model_file)

In [23]:
# Colab-specific helper: hand the saved model file to the browser as a download.
from google.colab import files
files.download('model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>