In [3]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

data = pd.read_csv("bank-additional-full.csv")

# Replacing missing values
data["marital"].replace("unknown", "married", inplace=True)
data["default"].replace("unknown", "no", inplace=True)
data["loan"].replace("unknown", "no", inplace=True)

# Dropping remaining missing values & outliers
data.drop(data[data.housing == "unknown"].index, inplace=True)
data.drop(data[data.education == "unknown"].index, inplace=True)
data.drop(data[data.job == "unknown"].index, inplace=True)
data.drop(data[data.age >= 70].index, inplace=True)

X = data.drop(columns=['y'])
y =data.y

encoder = LabelEncoder()

X.describe(include="all")


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,37899.0,37899,37899,37899,37899,37899,37899,37899,37899,37899,37899.0,37899.0,37899.0,37899,37899.0,37899.0,37899.0,37899.0,37899.0
unique,,11,3,7,2,2,2,2,10,5,,,,3,,,,,
top,,admin.,married,university.degree,no,yes,no,cellular,may,thu,,,,nonexistent,,,,,
freq,,9943,22974,11787,37896,20376,31991,24120,12793,7943,,,,32879,,,,,
mean,39.458825,,,,,,,,,,2.574158,965.357661,0.164859,,0.108509,93.574986,-40.58519,3.651885,5168.87972
std,9.574816,,,,,,,,,,2.778507,179.656227,0.477472,,1.551501,0.571991,4.566371,1.716182,70.583983
min,17.0,,,,,,,,,,1.0,0.0,0.0,,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,,,,,,,,,,1.0,999.0,0.0,,-1.8,93.075,-42.7,1.354,5099.1
50%,38.0,,,,,,,,,,2.0,999.0,0.0,,1.1,93.444,-41.8,4.857,5191.0
75%,46.0,,,,,,,,,,3.0,999.0,0.0,,1.4,93.994,-36.4,4.961,5228.1


In [5]:
one_enc_X = pd.get_dummies(data.drop(columns=["y"]))
one_enc_X

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
3,40,1,999,0,1.1,93.994,-36.4,4.857,5191.0,1,...,0,0,0,1,0,0,0,0,1,0
4,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41181,37,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,1,...,0,0,1,0,0,0,0,0,1,0
41182,29,1,9,1,-1.1,94.767,-50.8,1.028,4963.6,0,...,0,0,1,0,0,0,0,0,0,1
41184,46,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,0,...,0,0,1,0,0,0,0,0,1,0
41185,56,2,999,0,-1.1,94.767,-50.8,1.028,4963.6,0,...,0,0,1,0,0,0,0,0,1,0


In [7]:
# Encoding categorical attributes
label_enc_X = X
label_enc_X.job = encoder.fit_transform(label_enc_X.job)
label_enc_X.marital = encoder.fit_transform(label_enc_X.marital)
label_enc_X.education = encoder.fit_transform(label_enc_X.education)
label_enc_X.default = encoder.fit_transform(label_enc_X.default)
label_enc_X.housing = encoder.fit_transform(label_enc_X.housing)
label_enc_X.loan = encoder.fit_transform(label_enc_X.loan)
label_enc_X.contact = encoder.fit_transform(label_enc_X.contact)
label_enc_X.month = encoder.fit_transform(label_enc_X.month)
label_enc_X.day_of_week = encoder.fit_transform(label_enc_X.day_of_week)
label_enc_X.poutcome = encoder.fit_transform(label_enc_X.poutcome)

label_enc_X

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,3,1,0,0,0,0,1,6,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
1,57,7,1,3,0,0,0,1,6,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
2,37,7,1,3,0,1,0,1,6,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
3,40,0,1,1,0,0,0,1,6,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
4,56,7,1,3,0,0,1,1,6,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41181,37,0,1,6,0,1,0,0,7,0,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6
41182,29,10,2,0,0,1,0,0,7,0,1,9,1,2,-1.1,94.767,-50.8,1.028,4963.6
41184,46,1,1,5,0,0,0,0,7,0,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6
41185,56,5,1,6,0,1,0,0,7,0,2,999,0,1,-1.1,94.767,-50.8,1.028,4963.6


In [8]:
# Using accuracy
model = KNeighborsClassifier(n_neighbors=5,n_jobs=-1)
scores = cross_val_score(model, label_enc_X, y, cv=10, n_jobs=-1, error_score='raise')

scores.mean()

0.7325691924477953

In [9]:
# Using accuracy 
one_enc_model = KNeighborsClassifier(n_neighbors=5,n_jobs=-1)
scores = cross_val_score(one_enc_model, one_enc_X, y, cv=10, n_jobs=-1, error_score='raise')

scores.mean()

0.7600890649296568

In [9]:
# Binning age according to Erikson's stages of psychosocial development
# 12-19 -> Adolescent
# 20-39 -> Early Adult
# 40-59 -> Middle Adult
# >= 60 -> Late Adult

ages = []
for age in X.age:
    if age <= 19:
        ages.append("adolescent")
    elif 20 <= age <= 39:
        ages.append("early adult")
    elif 40 <= age <= 59:
        ages.append("middle adult")
    elif age >= 60:
        ages.append("late adult")

data.age = ages

data.to_csv("bank-additional-binned.csv")

In [5]:
# Using accuracy
age_binned_model = KNeighborsClassifier(n_neighbors=5,n_jobs=-1)
scores = cross_val_score(age_binned_model, X, y, cv=10, n_jobs=-1, error_score='raise')

scores.mean()

0.7007776658022661

In [10]:
# Using RMSE
model_rms = KNeighborsClassifier(n_jobs=-1)

scores_rms = cross_val_score(model_rms, label_enc_X, y, scoring='neg_brier_score') # In Classification, RMSE is known as Brier's score

scores_rms.mean()

-0.368422390851544

In [11]:
# Using RMSE for One Hot Encoded
one_model_rms = KNeighborsClassifier(n_jobs=-1)

scores_rms = cross_val_score(one_model_rms, one_enc_X, y, n_jobs=-1, scoring='neg_brier_score') # In Classification, RMSE is known as Brier's score

scores_rms.mean()

-0.3629285245545513

In [12]:
# Using ROC
model_roc = KNeighborsClassifier(n_jobs=-1)

scores_roc = cross_val_score(model_roc, label_enc_X,y, scoring='roc_auc')

scores_roc.mean()

0.36360609252875664

In [13]:
# Using ROC one hot encoded
one_model_roc = KNeighborsClassifier(n_jobs=-1)

scores_roc = cross_val_score(one_model_roc, one_enc_X,y, n_jobs=-1, scoring='roc_auc')

scores_roc.mean()

0.35946382420957673

In [4]:
one_enc_X

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
3,40,1,999,0,1.1,93.994,-36.4,4.857,5191.0,1,...,0,0,0,1,0,0,0,0,1,0
4,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,0,...,0,0,1,0,0,0,0,0,1,0
41184,46,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,0,...,0,0,1,0,0,0,0,0,1,0
41185,56,2,999,0,-1.1,94.767,-50.8,1.028,4963.6,0,...,0,0,1,0,0,0,0,0,1,0
41186,44,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,0,...,0,0,1,0,0,0,0,0,1,0
