In [76]:
import pandas as pd
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split , KFold
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from joblib import dump, load
import argparse


#drive.mount('/content/drive')

In [9]:
# Load the raw bank-marketing CSV (semicolon-separated) and cache it as a pickle
# so that later cells can reload it without parsing the CSV again.
# NOTE(review): the path lives under /content/drive, so Google Drive presumably has to be
# mounted first, yet the drive.mount(...) call in the imports cell is commented out — confirm.
#df =  pd.read_csv('bank-additional/bank-additional/bank-additional-full.csv' , sep=";" , engine='python')
df = pd.read_csv('/content/drive/My Drive/bank-additional-full.csv' , sep=";")
df.to_pickle("./bankcsv.pkl")
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,334,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,383,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,189,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,442,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


The `duration` column skews the analysis because its value is not known before a call is made; it should therefore be dropped before we feed this data to our model.

In [None]:
# Drop `duration`: it is only known after a call has been made, so it must not be a feature.
# Re-assigning (instead of inplace=True) with errors='ignore' keeps this cell re-runnable;
# the in-place version raised KeyError the second time it was executed.
df = df.drop(columns=['duration'], errors='ignore')
# Print the schema summary, then render the descriptive statistics as the cell output
# (the former `df.info() , df.describe()` displayed a (None, frame) tuple as plain text).
df.info()
df.describe()


From the information above, we can see that there are no null entries in the dataframe.
Hence, we can go forward and process our data by:
1. Encoding categorical data
2. Scaling
3. Balancing the classes of the target variable
4. Aggregating and transforming
5. Reducing the dimensions





In [1]:
%%writefile data.py

def convert_categorical(dataframe , columns):
  """
  One-hot encode the selected columns with pandas.get_dummies.

  Returns a frame made of the selected original columns followed by the
  output of get_dummies for those same columns.
  """
  selected = dataframe[columns]
  dummies = pd.get_dummies(selected)
  return pd.concat([selected, dummies], axis=1)

def get_column_counts(df ):
  """
  Print, for every column of the dataframe, the column name followed by
  the frequency of each distinct value found in that column.
  """
  for name in df.columns:
    print(name)
    counts = df[name].value_counts()
    print(counts)

def scale_columns(df , column):
  """
  Scale columns with a MinMaxScaler (default settings).

  Args:
    df: DataFrame holding the columns to scale.
    column: list of column names, or a single column name.

  Returns:
    DataFrame with the scaled values. It keeps the column names and the
    index of the input, so assigning it back to a frame built from ``df``
    aligns row by row even when ``df`` does not have a default RangeIndex.
  """
  # A bare column name would select a 1-D Series, which the scaler rejects
  if isinstance(column, str):
    column = [column]
  col = df[column]
  scaled_values = MinMaxScaler().fit_transform(col)
  return pd.DataFrame(scaled_values, columns=col.columns, index=col.index)

def upsample(df , majority_col, minority_col, n_samples=36548, random_state=123):
  """
  Up-sample the minority selection by resampling it with replacement.

  Args:
    df: DataFrame both selections are taken from.
    majority_col: label used to select the majority data (``df[majority_col]``).
    minority_col: label used to select the minority data (``df[minority_col]``).
    n_samples: number of minority samples to draw. Defaults to 36548, the value
      that used to be hard-coded; pass None to draw as many samples as the
      majority selection has entries.
    random_state: seed handed to ``resample`` for reproducible results.

  Returns:
    The majority selection followed by the resampled minority selection,
    concatenated with ``pd.concat``.
  """
  # NOTE(review): both selections are made by column label, not by filtering rows
  # on a class value — confirm callers really intend to pass column names here.
  majority_class = df[majority_col]
  minority_class = df[minority_col]
  if n_samples is None:
    n_samples = len(majority_class)
  # Upsample the minority selection (sampling with replacement)
  df_minority_upsampled = resample(minority_class,
                                   replace=True,
                                   n_samples=n_samples,
                                   random_state=random_state)
  # Combine the majority selection with the upsampled minority selection
  return pd.concat([majority_class, df_minority_upsampled])

def convert_dates(df , column , format):
  """
  Convert month or weekday abbreviations to numbers.

  Args:
    df: frame (or mapping) whose ``df[column]`` yields the text values.
    column: name of the column to convert.
    format: "%b" for month abbreviations (result 1-12) or
      "%a" for weekday abbreviations (result 0-6, Monday is 0).

  Returns:
    list of int, one entry per value of the column, in column order.

  Raises:
    ValueError: if ``format`` is neither "%b" nor "%a", or a value cannot be parsed.
  """
  # Local import keeps the helper self-contained inside the generated data.py
  import time

  # time.strptime keeps the parsed weekday in tm_wday. The previous
  # datetime.strptime(value, "%a").weekday() threw the parsed weekday away and
  # reported the weekday of the default date 1900-01-01, i.e. always 0.
  if format == "%b":
    return [time.strptime(value, "%b").tm_mon for _, value in df[column].items()]
  if format == "%a":
    return [time.strptime(value, "%a").tm_wday for _, value in df[column].items()]
  # Fail loudly instead of silently returning None for an unsupported format
  raise ValueError('Unsupported format {0!r}; expected "%b" or "%a"'.format(format))


def get_target_sample(df , column , sample_number):
  """
  Draw a fixed-seed random sample from one column.

  ``sample_number`` is converted with int() and used as the sample size;
  the draw always uses random_state=1.
  """
  size = int(sample_number)
  source = df[column]
  return source.sample(n=size, random_state=1)

def pca_reduction(df , components, random_state=None):
  """
  Reduce the feature set with Principal Component Analysis.

  Args:
    df: feature data to fit the PCA on and to transform.
    components: value passed to PCA as ``n_components``.
    random_state: optional seed passed to PCA; the default None keeps the
      previous behaviour, an int makes repeated runs reproducible.

  Returns:
    The data projected onto the fitted components (output of ``PCA.transform``).
  """
  pca_model = PCA(n_components=components, random_state=random_state).fit(df)
  # apply the dimensionality reduction to the same data the model was fitted on
  return pca_model.transform(df)


Writing data.py


In [4]:
%%writefile model.py

def logistic_reg(X,y):
  """
  Fit a cross-validated logistic regression on a 90/10 train/test split.

  The fitted model is written to 'logistic_reg.joblib'.

  Returns:
    (prediction, y_train, score): predictions for the test split, the
    training labels, and the macro-averaged F1 score on the test split.
  """
  split = train_test_split(X, y, test_size=.1, random_state=1)
  X_train, X_test, y_train, y_test = split

  model = LogisticRegressionCV(random_state=0)
  model.fit(X_train, y_train)
  prediction = model.predict(X_test)

  # evaluate on the held-out split, then persist the fitted model
  score = f1_score(y_test, prediction, average='macro')
  dump(model, 'logistic_reg.joblib')
  return (prediction, y_train, score)


def xg_boost(X , y):
  """
  Train an XGBoost classifier on a 90/10 split and cross-validate on the full data.

  The fitted classifier is written to 'xg_boost.joblib'.

  Returns:
    (preds, cv_results, score): predictions for the test split, the output of
    xgb.cv (3 folds, AUC metric) and the macro-averaged F1 score on the test split.
  """
  # DMatrix over the complete data set; only the cross-validation below uses it
  full_matrix = xgb.DMatrix(data=X, label=y)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)

  classifier = xgb.XGBClassifier(
      objective='binary:logistic',
      colsample_bytree=0.3,
      learning_rate=0.1,
      max_depth=5,
      alpha=10,
      n_estimators=10,
  )
  classifier.fit(X_train, y_train)
  preds = classifier.predict(X_test)

  cv_params = {
      "objective": "binary:logistic",
      'colsample_bytree': 0.3,
      'learning_rate': 0.1,
      'max_depth': 5,
      'alpha': 10,
  }
  cv_results = xgb.cv(
      dtrain=full_matrix,
      params=cv_params,
      nfold=3,
      num_boost_round=50,
      early_stopping_rounds=10,
      metrics="auc",
      as_pandas=True,
      seed=123,
  )

  # evaluate on the held-out split, then persist the fitted classifier
  score = f1_score(y_test, preds, average='macro')
  dump(classifier, 'xg_boost.joblib')
  return (preds, cv_results, score)

def multi_layer_percep(X , y):
  """
  Train a multi-layer perceptron classifier on a stratified train/test split.

  The fitted model is written to 'multi_layer_percep.joblib'.

  Returns:
    (preds, y_train, score): predictions for the test split, the training
    labels and the macro-averaged F1 score on the test split.
  """
  X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)
  clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)
  preds = clf.predict(X_test)
  #score model
  score = f1_score(y_test, preds, average='macro')
  # Save under its own file name: this used to write to 'xg_boost.joblib',
  # which overwrote the model saved by xg_boost().
  dump(clf, 'multi_layer_percep.joblib')
  return (preds, y_train, score)

Overwriting model.py


#### Encode categorical data 
We use the get_dummies() function provided by pandas.

0 indicates non existent while 1 indicates existent.


In [13]:
# Column groups that drive the preprocessing steps below
categorical = [
    'job', 'marital', 'education', 'contact',
    'housing', 'default', 'loan', 'poutcome',
]                                   # one-hot encoded
target = ['y']                      # outcome column
binned = ['pdays']                  # min-max scaled
dates = ['month', 'day_of_week']    # converted to numbers

In [20]:
# Reload the cached raw frame and one-hot encode the categorical columns.
# NOTE(review): the pickle was written before `duration` was dropped, so `df`
# contains that column again from here on — confirm this is intended.
df = pd.read_pickle("./bankcsv.pkl")
converted_df = convert_categorical(df , categorical)
# Print the value counts of every column as a sanity check on the encoding
get_column_counts(converted_df)

job
admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: job, dtype: int64
marital
married     24928
single      11568
divorced     4612
unknown        80
Name: marital, dtype: int64
education
university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
unknown                 1731
illiterate                18
Name: education, dtype: int64
contact
cellular     26144
telephone    15044
Name: contact, dtype: int64
housing
yes        21576
no         18622
unknown      990
Name: housing, dtype: int64
default
no         32588
unknown     8597
yes            3
Name: default, dtype: int64
loan
no         33950
yes         6248
unknown      990
Name: loan, 

In [25]:
# Turn the month abbreviations ('%b') and the weekday abbreviations ('%a') into numbers
month = convert_dates(df , dates[0]  , '%b')
day = convert_dates(df , dates[1] , '%a')


#### Scale
We scale the pdays, employee-number and age columns so that they have a restricted range.

In [26]:
# Min-max scale pdays, nr.employed and age; each result is a one-column DataFrame
scaled = scale_columns(df , binned)
scaled_emp_no = scale_columns(df , ['nr.employed'])
scaled_age = scale_columns(df , ['age'])

#### Class Balancing
Imbalanced classes put “accuracy” out of business. This is a surprisingly common problem in machine learning (specifically in classification), occurring in datasets with a disproportionate ratio of observations in each class as is the case in our dataset with the target variable


In [27]:
# Check for class imbalance by printing how often each value of the target column occurs
get_column_counts(df[target])

y
no     36548
yes     4640
Name: y, dtype: int64


With a population of 41,188 , the yes population makes up only 11% of the population
We handle this by upsampling the minority class(the yes)

1. First, we'll separate observations from each class into different DataFrames.
2. Next, we'll resample the minority class with replacement, setting the number of samples to match that of the majority class.
3. Finally, we'll combine the up-sampled minority class DataFrame with the original majority class DataFrame.



In [28]:
#   we start by one-hot encoding the target column (this adds the y_no / y_yes dummy columns)
target_df = convert_categorical(df , target)

#   use upsample function
# NOTE(review): upsample() selects by column label, so this resamples the values of
# the y_yes column and appends them to the y_no column rather than resampling the
# rows of the minority class — confirm this is what is wanted.
upsampled = upsample(target_df ,'y_no' , 'y_yes' )

# Build the `deposit` frame that is used later as the source of the labels
target_column = pd.DataFrame(upsampled , columns=["deposit"])

#### Transform and Aggregate
*First , let us bring it all together into one dataframe*


In [30]:
# Combine the encoded, scaled and date-derived columns into one feature frame.
# Drop the raw categorical columns (their dummy columns stay). Re-assigning with
# errors='ignore' instead of inplace=True keeps this cell safe to re-run.
converted_df = converted_df.drop(columns=categorical, errors='ignore')

# bring in the campaign columns and the scaled age column
converted_df['age'] = scaled_age
converted_df['campaign'] = df['campaign']
converted_df['previous'] = df['previous']

# bring in the derived month column
converted_df['month'] = month

# bring in the scaled employee numbers and the scaled number of days that
# pass before a customer is contacted again
converted_df['pdays'] = scaled
converted_df['employee_no'] = scaled_emp_no

converted_df

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,marital_unknown,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,contact_cellular,contact_telephone,housing_no,housing_unknown,housing_yes,default_no,default_unknown,default_yes,loan_no,loan_unknown,loan_yes,poutcome_failure,poutcome_nonexistent,poutcome_success,age,campaign,previous,month,pdays,employee_no
0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,0.481481,1,0,5,1.0,0.859735
1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,1,0,0.493827,1,0,5,1.0,0.859735
2,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,0.246914,1,0,5,1.0,0.859735
3,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,0.283951,1,0,5,1.0,0.859735
4,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0.481481,1,0,5,1.0,0.859735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,1,0,0.691358,1,0,11,1.0,0.000000
41184,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0.358025,1,0,11,1.0,0.000000
41185,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,1,0,0.481481,2,0,11,1.0,0.000000
41186,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0.333333,1,0,11,1.0,0.000000


In [31]:
# Total contacts per client = contacts before this campaign + contacts during it
sum_column = converted_df['previous'].add(converted_df['campaign'])
converted_df['total_contacts'] = sum_column

#### Reduce the dimensions
45 columns is a large number ; additionally some of them report the same result
Dimensionality reduction can be done in two different ways:

  - By only keeping the most relevant variables from the original dataset (this technique is called feature selection)
  - By finding a smaller set of new variables, each being a combination of the input variables, containing basically the same information as the input variables (this technique is called dimensionality reduction)


###### Low Variance filtering


In [32]:
# Per-column variance of the feature frame, used to spot columns that barely vary
variance = converted_df.var()
variance


job_admin.                       0.189013
job_blue-collar                  0.174202
job_entrepreneur                 0.034101
job_housemaid                    0.025074
job_management                   0.065953
job_retired                      0.040017
job_self-employed                0.033311
job_services                     0.087079
job_student                      0.020793
job_technician                   0.136914
job_unemployed                   0.024013
job_unknown                      0.007948
marital_divorced                 0.099439
marital_married                  0.238934
marital_single                   0.201982
marital_unknown                  0.001939
education_basic.4y               0.091111
education_basic.6y               0.052552
education_basic.9y               0.125229
education_high.school            0.177651
education_illiterate             0.000437
education_professional.course    0.111093
education_university.degree      0.208154
education_unknown                0

We can safely drop the day column since it never varies.


In [34]:
# 'day' is never added to converted_df in the cells above, so a plain drop raises
# KeyError on a fresh run; errors='ignore' makes the cell safe whether or not the
# column exists, and re-assigning avoids the in-place mutation.
converted_df = converted_df.drop(columns=['day'], errors='ignore')
# Show what the dtypes look like after a float64 cast (the cast itself is not stored)
converted_df.astype('float64').dtypes

job_admin.                       float64
job_blue-collar                  float64
job_entrepreneur                 float64
job_housemaid                    float64
job_management                   float64
job_retired                      float64
job_self-employed                float64
job_services                     float64
job_student                      float64
job_technician                   float64
job_unemployed                   float64
job_unknown                      float64
marital_divorced                 float64
marital_married                  float64
marital_single                   float64
marital_unknown                  float64
education_basic.4y               float64
education_basic.6y               float64
education_basic.9y               float64
education_high.school            float64
education_illiterate             float64
education_professional.course    float64
education_university.degree      float64
education_unknown                float64
contact_cellular

#### Principal Component Analysis




In [73]:
# Project the feature frame onto 10 principal components
transformed_features = pca_reduction(converted_df , 10)
transformed_features

array([[-2.2982189 , -1.7582178 , -0.76625817, ..., -0.33380024,
        -0.16699325, -0.11932755],
       [-2.29289578, -1.80970102, -1.03521419, ...,  0.23420428,
        -0.04701397,  1.12426402],
       [-2.30077016, -1.75303536, -0.43894364, ..., -0.73696697,
        -0.07035683,  0.89450843],
       ...,
       [-1.01942651,  4.42515749, -0.06180675, ...,  0.08633688,
        -0.34456488, -0.16863271],
       [-2.43062395,  4.35906106, -0.40192842, ..., -0.77883432,
        -0.1663836 , -0.41547152],
       [ 1.1016459 ,  4.48507768,  1.15722404, ..., -0.49462268,
        -0.23872907, -0.17834526]])

### Modeling
We use three main model approaches :
  - Logistic regression
  - XGBoost
  - Multi Layer Perceptron
  

In [37]:
# Features: the PCA components computed above
X = transformed_features  
# Labels: 41188 values drawn from the `deposit` column of the up-sampled target.
# NOTE(review): get_target_sample() draws a random sample (random_state=1), so the
# order of y presumably no longer corresponds to the rows of X — verify before
# trusting the scores reported below.
y = get_target_sample(target_column , 'deposit' , 41188  )

#### Logistic Regression

In [45]:
# logistic_reg returns (test predictions, training labels, macro F1 score); the second
# value is y_train, not the true test labels, so it is named accordingly.
pred, y_train, score = logistic_reg(X , y)
# macro-averaged F1 of the model on the held-out split
score

0.3467089611419508

#### XGBoost

In [56]:
# xg_boost returns (test predictions, cross-validation results, macro F1 score); the
# second value was previously bound to the misleading name `true`.
# The fitted model is saved inside xg_boost() itself.
pred , cv_results , score = xg_boost(X, y)
#get evaluation of model
score

0.3519975460816509

#### Multi Layer Perceptron

In [64]:
# multi_layer_percep returns (test predictions, training labels, macro F1 score); the
# second value is y_train, not cross-validation results.
pred , y_train , score = multi_layer_percep(X , y)
score

0.4965516820634603

In [None]:
# Snapshot the installed package versions of this environment into requirements.txt
!pip freeze > requirements.txt

In [None]:
#make package

In [5]:
%%writefile main.py

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split , KFold
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from joblib import dump, load
import argparse
#custom imports
from data import *
from model import *
def main():
    """Run the preprocessing and modelling pipeline on a bank-marketing CSV.

    Command line:
        python main.py [filename]

    ``filename`` is optional and defaults to ``bank-additional.csv``; the file
    is read as a semicolon-separated CSV.

    Returns:
        tuple: ``(log_r, xg, percep)`` — the result tuples returned by
        ``logistic_reg``, ``xg_boost`` and ``multi_layer_percep``.
    """
    parser = argparse.ArgumentParser()
    # nargs='?' makes the positional optional so that the default really applies;
    # action='store_true' is a flag action and cannot carry a file name.
    parser.add_argument('filename', nargs='?', default='bank-additional.csv',
                        help='Name of the file you want to load')
    args = parser.parse_args()

    # Any file name given on the command line is processed (the pipeline used to
    # run only when the name equalled the default).
    df = pd.read_csv(args.filename, sep=";")

    categorical = ['job', 'marital', 'education', 'contact',
                   'housing', 'default', 'loan', 'poutcome']
    target = ['y']
    binned = ['pdays']
    dates = ['month', 'day_of_week']

    # encode, convert and scale the individual column groups
    converted_df = convert_categorical(df, categorical)
    month = convert_dates(df, dates[0], '%b')
    scaled = scale_columns(df, binned)
    scaled_emp_no = scale_columns(df, ['nr.employed'])
    scaled_age = scale_columns(df, ['age'])

    # encode the target column and up-sample it
    target_df = convert_categorical(df, target)
    upsampled = upsample(target_df, 'y_no', 'y_yes')
    target_column = pd.DataFrame(upsampled, columns=["deposit"])

    # keep only the dummy columns, then add the numeric features
    converted_df = converted_df.drop(columns=categorical)
    converted_df['age'] = scaled_age
    converted_df['campaign'] = df['campaign']
    converted_df['previous'] = df['previous']
    converted_df['month'] = month
    converted_df['pdays'] = scaled
    converted_df['employee_no'] = scaled_emp_no

    # total number of contacts made with the client
    converted_df['total_contacts'] = converted_df['previous'] + converted_df['campaign']

    # The weekday is not part of the feature set: no 'day' column is ever added, so
    # the former drop('day') (a guaranteed KeyError) is gone.
    converted_df = converted_df.astype('float64')

    # transform features
    transformed_features = pca_reduction(converted_df, 10)

    # define target and training features; the sample size follows the number of
    # feature rows instead of a hard-coded 41188 so that X and y have equal length
    X = transformed_features
    # NOTE(review): get_target_sample() draws a random sample, so the order of y
    # presumably does not correspond to the rows of X — verify.
    y = get_target_sample(target_column, 'deposit', len(converted_df))

    log_r = logistic_reg(X, y)          # logistic regression
    xg = xg_boost(X, y)                 # gradient boosting
    percep = multi_layer_percep(X, y)   # multilayer perceptron

    return log_r, xg, percep




     

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Report the failure on stderr and exit with a non-zero status so that
        # shells and schedulers can detect it; print() alone exited with status 0.
        raise SystemExit('Something went wrong {0}'.format(e))


Writing main.py


# References
http://deeplearning.net/tutorial/mlp.html