In [72]:
## Import dataset
import warnings
# NOTE(review): this filter silences every warning for the rest of the
# notebook, including any pandas / scikit-learn warnings raised by later cells.
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

# Allow up to 130 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 130)

# The file is parsed with ';' as the field separator.
df_unclean = pd.read_csv("bank-full.csv", sep = ";")
# Preview the first rows of the raw data.
df_unclean.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [35]:
##Clean Dataset

In [36]:
# Count NaN values per column in the raw data.
# Initially there are none: at this point "unknown" entries are still plain
# strings and are therefore not counted as missing.

df_unclean.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [37]:
# Recode the placeholder string "unknown" as a real missing value so that
# isna() / notnull() can detect it.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and the
# result is assigned back rather than mutating the frame in place.

df_unclean = df_unclean.replace('unknown', np.nan)

In [38]:
# Re-count missing values per column now that "unknown" has been recoded.
# Based on these counts we decided to drop contact and poutcome from our analysis.

df_unclean.isna().sum()

age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
y                0
dtype: int64

In [39]:
# Drop every row whose job is missing; all other columns of those rows go
# with them.
# .copy() makes df_clean an independent frame, so the later column updates and
# drops act on df_clean itself rather than on a slice of df_unclean.
df_clean = df_unclean[df_unclean['job'].notnull()].copy()

In [40]:
# Find the most frequent education category (the mode); it is used as the
# fill value for the missing education entries.
df_clean['education'].mode()

0    secondary
dtype: object

In [41]:
# Impute the missing education values with the column's mode (its most
# frequent category), computed from the data instead of being hard-coded.
# The filled column is assigned back to df_clean: an in-place replace on a
# selected column can end up modifying a temporary copy and leave the frame
# unchanged.
education_mode = df_clean['education'].mode()[0]
df_clean['education'] = df_clean['education'].fillna(education_mode)

In [42]:
# Category counts for education after imputation.
# value_counts() leaves NaN out by default, so only the filled categories
# are listed here.
df_clean['education'].value_counts()

secondary    24861
tertiary     13262
primary       6800
Name: education, dtype: int64

In [100]:
# pdays uses the value -1, which we also treat as a missing-value marker.
# Since the number of such rows is large, we decided to drop pdays
# from our model as well.
# Show the three most frequent pdays values.
df_clean['pdays'].value_counts().head(3)

-1      36699
 182      165
 92       146
Name: pdays, dtype: int64

In [44]:
# Check that the columns of interest have no missing values left
# (contact and poutcome still do; they are dropped in the next step).
# The stray trailing '.' after sum() was a syntax error and has been removed.
df_clean.isna().sum()

age              0
job              0
marital          0
education        0
default          0
balance          0
housing          0
loan             0
contact      12909
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36704
y                0
dtype: int64

In [45]:
# Drop the contact and poutcome variables.
# The columns are named explicitly (instead of passing a DataFrame as the
# label list) and the result is assigned back rather than mutated in place.
df_clean = df_clean.drop(columns=['contact', 'poutcome'])

In [46]:
# Share of the target column y that is "yes" vs. "no"
# (normalize=True returns proportions instead of raw counts).
# This is used as a baseline when judging model accuracy.
df_clean['y'].value_counts(normalize=True)

no     0.883022
yes    0.116978
Name: y, dtype: float64

In [99]:
# Preview the cleaned data before selecting the dependent and independent variables.
df_clean.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,y
0,58,management,married,tertiary,no,2143,yes,no,5,may,261,1,-1,0,no
1,44,technician,single,secondary,no,29,yes,no,5,may,151,1,-1,0,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,5,may,76,1,-1,0,no


In [48]:
# Build the modelling table:
#   1. one-hot encode the categorical columns, dropping the first level of each
#      (the target 'y' is listed last, so its dummy y_yes is the final column);
#   2. pick out the numeric predictors;
#   3. attach the dummies to the numeric predictors by row index.
categorical_columns = ['marital', 'job', 'education', 'default', 'housing', 'loan', 'y']
numeric_columns = ['age', 'balance', 'duration', 'campaign', 'previous']

X_enc = pd.get_dummies(df_clean[categorical_columns], drop_first=True)

numeric_data = df_clean[numeric_columns]

# Index-on-index left join: every row of numeric_data is kept.
cleaned_data_merge = numeric_data.join(X_enc, how='left')

In [49]:
#### Omitted (reference) category for each one-hot encoded variable
# marital   - divorced
# job       - admin
# education - primary
# default   - no
# housing   - no
# loan      - no

In [50]:
# Look at the merged data.
# Only the first rows are shown; displaying the whole frame bloats the
# notebook without adding information.
cleaned_data_merge.head(10)

Unnamed: 0,age,balance,duration,campaign,previous,marital_married,marital_single,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,education_secondary,education_tertiary,default_yes,housing_yes,loan_yes,y_yes
0,58,2143,261,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0
1,44,29,151,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0
2,33,2,76,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0
3,47,1506,92,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
5,35,231,139,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0
6,28,447,217,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0
7,42,2,380,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0
8,58,121,50,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
9,43,593,55,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0
10,41,270,222,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0


In [51]:
# Split the merged table into independent variables (X) and the dependent
# variable (y).
# The target is selected by name, so the split no longer relies on y_yes
# happening to be the last column. y stays a one-column DataFrame, as before.
cleaned_data_x = cleaned_data_merge.drop(columns=['y_yes'])
cleaned_data_y = cleaned_data_merge[['y_yes']]
cleaned_data_x.head(2)

Unnamed: 0,age,balance,duration,campaign,previous,marital_married,marital_single,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,education_secondary,education_tertiary,default_yes,housing_yes,loan_yes
0,58,2143,261,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
1,44,29,151,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0


In [52]:
# Split the merged table into independent variables (X) and the dependent
# variable (y).
# The target is selected by name, so the split no longer relies on y_yes
# happening to be the last column. y stays a one-column DataFrame, as before.

cleaned_data_x = cleaned_data_merge.drop(columns=['y_yes'])
cleaned_data_y = cleaned_data_merge[['y_yes']]

In [53]:
# Preview the independent variables (first two rows).
cleaned_data_x.head(2)

Unnamed: 0,age,balance,duration,campaign,previous,marital_married,marital_single,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,education_secondary,education_tertiary,default_yes,housing_yes,loan_yes
0,58,2143,261,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
1,44,29,151,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0


In [54]:
# Preview the dependent variable (first two rows).
cleaned_data_y.head(2)

Unnamed: 0,y_yes
0,0
1,0


In [55]:
# Count missing values in each independent variable before modelling.
cleaned_data_x.isna().sum()

age                    0
balance                0
duration               0
campaign               0
previous               0
marital_married        0
marital_single         0
job_blue-collar        0
job_entrepreneur       0
job_housemaid          0
job_management         0
job_retired            0
job_self-employed      0
job_services           0
job_student            0
job_technician         0
job_unemployed         0
education_secondary    0
education_tertiary     0
default_yes            0
housing_yes            0
loan_yes               0
dtype: int64

In [56]:
# Split the data into training and testing groups: 10% of the rows are held
# out for testing, and the fixed seed makes the split repeatable.
# The numeric independent variables still have to be standardized afterwards.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    cleaned_data_x,
    cleaned_data_y,
    test_size=0.1,
    random_state=1254,
)

In [57]:
# Isolate the numeric independent variables of the training set: they occupy
# the first five columns of the design matrix.
x_train_numeric = x_train.loc[:, x_train.columns[:5]]

In [58]:
# Isolate the numeric independent variables of the testing set: they occupy
# the first five columns of the design matrix.
x_test_numeric = x_test.loc[:, x_test.columns[:5]]

In [59]:
# Confirm the slice holds exactly the numeric independent variables.
x_test_numeric.head(2)

Unnamed: 0,age,balance,duration,campaign,previous
32462,32,932,944,1,1
34147,33,700,169,1,2


In [60]:
# Standardize the numeric independent variables.
# The scaler is fitted on the training rows only and then applied, unchanged,
# to the test rows.
import sklearn
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()

train_scaled_values = scalar.fit_transform(x_train_numeric)
test_scaled_values = scalar.transform(x_test_numeric)

# Wrap the scaled arrays back into DataFrames that carry the original row
# index and column names, so they can be re-attached to the dummy columns.
scalar_x_train_numeric = pd.DataFrame(
    train_scaled_values,
    index=x_train_numeric.index,
    columns=x_train_numeric.columns[0:5],
)
scalar_x_test_numeric = pd.DataFrame(
    test_scaled_values,
    index=x_test_numeric.index,
    columns=x_test_numeric.columns[0:5],
)

In [61]:
# Re-attach the categorical (dummy) columns to the standardized numeric
# columns. Rows are matched on the index, keeping every scaled row.
train_dummies = x_train.iloc[:, 5:]
test_dummies = x_test.iloc[:, 5:]

x_train_complete = scalar_x_train_numeric.join(train_dummies, how='left')
x_test_complete = scalar_x_test_numeric.join(test_dummies, how='left')

In [98]:
# Preview the final training design matrix (scaled numeric + dummy columns).
x_train_complete.head(3)

Unnamed: 0,age,balance,duration,campaign,previous,marital_married,marital_single,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,education_secondary,education_tertiary,default_yes,housing_yes,loan_yes
29108,0.481107,-0.255399,-0.281311,-0.566541,-0.248662,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
12733,-0.27239,0.279103,-0.664779,-0.245122,-0.248662,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
44833,-0.743326,-0.156551,-0.951412,-0.566541,2.31293,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0


In [63]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 5.
clf = KNeighborsClassifier(n_neighbors=5)
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target, so
# it is flattened to avoid the column-vector DataConversionWarning.
clf.fit(x_train_complete, np.ravel(y_train))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [64]:
# PCA object configured to keep 16 components.
# NOTE(review): despite the name pca_3 this requests 16 components, and the
# object is not fitted or applied in any of the cells shown here.
pca_3 = PCA(n_components=16)

In [73]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# 10-fold cross-validated accuracy of the KNN classifier on the training set.
# The single-column target is flattened to the 1-D shape scikit-learn expects.
scores = cross_val_score(clf, x_train_complete, np.ravel(y_train), cv=10, scoring='accuracy')
print('Cross Validaiton Scores', scores)
# Average over however many folds were run instead of dividing by a literal 10.
print('Cross Validation Mean:', scores.mean())

Cross Validaiton Scores [0.88031652 0.88773492 0.88476756 0.88572842 0.89067524 0.88251299
 0.88127628 0.88718456 0.88644236 0.88297872]
Cross Validation Mean: 0.8849617557914835


In [74]:
# Predict the test-set labels with the fitted KNN classifier.
y_pred = clf.predict(x_test_complete)

In [75]:
# Sanity check: one prediction per test row.
y_pred.shape

(4493,)

In [76]:
# Confusion matrix for the KNN predictions on the test set
# (scikit-learn convention: rows are true classes, columns are predicted classes).
confusion_matrix(y_test, y_pred)

array([[3829,  166],
       [ 355,  143]])

In [70]:
# Data has been prepped.
# Begin fitting the logistic regression model.
import statsmodels.api as sm
import sklearn.linear_model
from sklearn.linear_model import LogisticRegression
# Create the estimator and fit it on the training data.
# y_train is a single-column DataFrame; flattening it to 1-D avoids the
# column-vector DataConversionWarning raised by scikit-learn.
logmodel = LogisticRegression()
sk_res = logmodel.fit(x_train_complete, np.ravel(y_train))
# Predicted class labels for the held-out test rows.
sk_predictions = logmodel.predict(x_test_complete)

  y = column_or_1d(y, warn=True)


In [77]:
# 10-fold cross-validated accuracy of the logistic regression on the training set.
# The single-column target is flattened to the 1-D shape scikit-learn expects.
scores = cross_val_score(sk_res, x_train_complete, np.ravel(y_train), cv=10, scoring='accuracy')
print('Cross Validaiton Scores', scores)
# Average over however many folds were run instead of dividing by a literal 10.
print('Cross Validation Mean:', scores.mean())

Cross Validaiton Scores [0.88847676 0.89144411 0.89292779 0.89116992 0.88844917 0.88968588
 0.88647044 0.8913904  0.88644236 0.88990599]
Cross Validation Mean: 0.889636282218118


In [78]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the logistic regression on the test set.
# print() is needed here so the report's line breaks are rendered.
print(classification_report(y_test, sk_predictions))

             precision    recall  f1-score   support

          0       0.91      0.98      0.94      3995
          1       0.54      0.19      0.28       498

avg / total       0.87      0.89      0.87      4493



In [79]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the logistic regression predictions on the test set.
print(confusion_matrix(y_test, sk_predictions))

[[3913   82]
 [ 402   96]]


In [81]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Share of test rows for which the logistic regression prediction is correct.
logistic_accuracy = accuracy_score(y_test, sk_predictions)
print('Accuracy score for Logistic is: ' + str(logistic_accuracy))

Accuracy score for Logistic is: 0.8922768751391053


In [82]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data.
# np.ravel flattens the single-column target to the 1-D shape scikit-learn
# expects (an (n, 1) array triggers a DataConversionWarning).
# NOTE(review): this is a regressor fitted on a 0/1 class label, and rf is not
# used by any later cell shown here - confirm whether this cell is still needed.
rf.fit(np.asarray(x_train_complete), np.ravel(y_train));

In [83]:
from sklearn.ensemble import RandomForestClassifier
# Random forest model creation.
# A fixed random_state makes the fitted forest, and therefore the reported
# scores, reproducible from run to run.
rfc = RandomForestClassifier(random_state=42)
# The single-column target is flattened to the 1-D shape scikit-learn expects.
rfc.fit(x_train_complete, np.ravel(y_train))
# Predictions for the held-out test rows
rfc_predict = rfc.predict(x_test_complete)

In [84]:
# 10-fold cross-validated accuracy of the random forest classifier on the training set.
# The single-column target is flattened to the 1-D shape scikit-learn expects.
scores = cross_val_score(rfc, x_train_complete, np.ravel(y_train), cv=10, scoring='accuracy')
print('Cross Validaiton Scores', scores)
# Average over however many folds were run instead of dividing by a literal 10.
print('Cross Validation Mean:', scores.mean())

Cross Validaiton Scores [0.88724036 0.89070227 0.8892186  0.89116992 0.89339599 0.88943854
 0.88399703 0.88668976 0.88718456 0.88124691]
Cross Validation Mean: 0.8880283937670198


In [None]:
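#xgboost must be installed before the cells below will run.
#If it is missing, install it into the kernel's environment with:
#%pip install xgboost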

In [86]:
import xgboost as xgb

In [87]:
# Wrap the existing train and test sets in XGBoost DMatrix objects,
# attaching the corresponding targets as labels.
D_train = xgb.DMatrix(x_train_complete, label=y_train)
D_test = xgb.DMatrix(x_test_complete, label=y_test)

In [88]:
# Parameters for XGBoost; training uses 200 boosting iterations.
# The objective is the multi-class soft-probability one with two classes, so
# the booster is asked for one probability per class.
param = dict(
    eta=0.3,
    max_depth=5,
    objective='multi:softprob',
    num_class=2,
)
steps = 200

In [96]:
# Train the booster on D_train with the settings in param for `steps` rounds.
model = xgb.train(param, D_train, steps)

In [97]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score

# One row of per-class scores for each test row.
preds = model.predict(D_test)
# The predicted class is the column with the highest score in each row;
# a single vectorised argmax replaces the Python-level loop over rows.
best_preds = np.asarray(preds).argmax(axis=1)

# Macro averaging weights both classes equally regardless of their frequency.
print("Precision = {}".format(precision_score(y_test, best_preds, average='macro')))
print("Recall = {}".format(recall_score(y_test, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(y_test, best_preds)))

Precision = 0.7288084104347927
Recall = 0.6584367507577242
Accuracy = 0.8942799910972624
