In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.mixture import BayesianGaussianMixture

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

In [2]:
# Load the dataset
file_path = '/Users/russelwilson/Desktop/bank-full.csv'
df = pd.read_csv(file_path, delimiter=';')

df_original = pd.read_csv(file_path, delimiter=';')

In [3]:
categorical_columns = df.select_dtypes(include=['object']).columns

In [4]:
unique_values_info = {}
for col in categorical_columns:
    unique_counts = df[col].value_counts()
    unique_values_info[col] = unique_counts

In [5]:
unique_values_info

{'job': blue-collar      9732
 management       9458
 technician       7597
 admin.           5171
 services         4154
 retired          2264
 self-employed    1579
 entrepreneur     1487
 unemployed       1303
 housemaid        1240
 student           938
 unknown           288
 Name: job, dtype: int64,
 'marital': married     27214
 single      12790
 divorced     5207
 Name: marital, dtype: int64,
 'education': secondary    23202
 tertiary     13301
 primary       6851
 unknown       1857
 Name: education, dtype: int64,
 'default': no     44396
 yes      815
 Name: default, dtype: int64,
 'housing': yes    25130
 no     20081
 Name: housing, dtype: int64,
 'loan': no     37967
 yes     7244
 Name: loan, dtype: int64,
 'contact': cellular     29285
 unknown      13020
 telephone     2906
 Name: contact, dtype: int64,
 'month': may    13766
 jul     6895
 aug     6247
 jun     5341
 nov     3970
 apr     2932
 feb     2649
 jan     1403
 oct      738
 sep      579
 mar      477
 de

In [6]:
# Label encoding for 'education'
education_mapping = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
df['education_encoded'] = df['education'].map(education_mapping)

In [7]:
df[['education', 'education_encoded']].head()

Unnamed: 0,education,education_encoded
0,tertiary,3
1,secondary,2
2,secondary,2
3,unknown,0
4,unknown,0


In [8]:
# Label encoding for 'default', 'housing', 'loan', 'y' 
binary_mapping = {'no': 0, 'yes': 1}
columns_to_encode = ['default', 'housing', 'loan', 'y']

for col in columns_to_encode:
    df[col + '_encoded'] = df[col].map(binary_mapping)

In [9]:
df[['default', 'default_encoded', 'housing', 'housing_encoded', 'loan', 'loan_encoded', 'y', 'y_encoded']].head()

Unnamed: 0,default,default_encoded,housing,housing_encoded,loan,loan_encoded,y,y_encoded
0,no,0,yes,1,no,0,no,0
1,no,0,yes,1,no,0,no,0
2,no,0,yes,1,yes,1,no,0
3,no,0,yes,1,no,0,no,0
4,no,0,no,0,no,0,no,0


In [10]:
df['poutcome'] = df['poutcome'].replace(['unknown', 'other'], 'others')

# Label encoding for 'poutcome' with the specified mapping
poutcome_mapping = {'failure': 0, 'success': 1, 'others': -1}
df['poutcome_encoded'] = df['poutcome'].map(poutcome_mapping)

In [11]:
df[['poutcome', 'poutcome_encoded']].head()

Unnamed: 0,poutcome,poutcome_encoded
0,others,-1
1,others,-1
2,others,-1
3,others,-1
4,others,-1


In [12]:
# Frequency encoding for 'job' and 'month'
job_freq = df['job'].value_counts(normalize=True)
month_freq = df['month'].value_counts(normalize=True)
contact_freq = df['contact'].value_counts(normalize=True)
marital_freq = df['marital'].value_counts(normalize=True)

df['job_encoded'] = df['job'].map(job_freq)
df['month_encoded'] = df['month'].map(month_freq)
df['contact_encoded'] = df['contact'].map(contact_freq)
df['marital_encoded'] = df['marital'].map(marital_freq)

In [13]:
df[['job', 'job_encoded', 'month', 'month_encoded']].head()

Unnamed: 0,job,job_encoded,month,month_encoded
0,management,0.209197,may,0.304483
1,technician,0.168034,may,0.304483
2,entrepreneur,0.03289,may,0.304483
3,blue-collar,0.215257,may,0.304483
4,unknown,0.00637,may,0.304483


In [14]:
df[['contact', 'contact_encoded', 'marital', 'marital_encoded']].head()

Unnamed: 0,contact,contact_encoded,marital,marital_encoded
0,unknown,0.287983,married,0.601933
1,unknown,0.287983,single,0.282896
2,unknown,0.287983,married,0.601933
3,unknown,0.287983,married,0.601933
4,unknown,0.287983,single,0.282896


In [15]:
encoded_columns = [
    'education_encoded', 'default_encoded', 'housing_encoded', 'loan_encoded', 
    'poutcome_encoded', 'job_encoded', 'month_encoded', 
    'contact_encoded', 'marital_encoded', 'y_encoded'
]

numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

df_encoded = df[numeric_columns + encoded_columns]

df_encoded.columns.tolist()

['age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'education_encoded',
 'default_encoded',
 'housing_encoded',
 'loan_encoded',
 'poutcome_encoded',
 'job_encoded',
 'month_encoded',
 'contact_encoded',
 'marital_encoded',
 'y_encoded']

In [16]:
df_encoded.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,education_encoded,default_encoded,housing_encoded,loan_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded,y_encoded
0,58,2143,5,261,1,-1,0,3,0,1,0,-1,0.209197,0.304483,0.287983,0.601933,0
1,44,29,5,151,1,-1,0,2,0,1,0,-1,0.168034,0.304483,0.287983,0.282896,0
2,33,2,5,76,1,-1,0,2,0,1,1,-1,0.03289,0.304483,0.287983,0.601933,0
3,47,1506,5,92,1,-1,0,0,0,1,0,-1,0.215257,0.304483,0.287983,0.601933,0
4,33,1,5,198,1,-1,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896,0


In [17]:
df_encoded.shape

(45211, 17)

In [18]:
df_original.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [19]:
df_original.shape

(45211, 17)

In [20]:
# df_temp = df[encoded_columns]
# df_temp.head()

Unnamed: 0,education_encoded,default_encoded,housing_encoded,loan_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded,y_encoded
0,3,0,1,0,-1,0.209197,0.304483,0.287983,0.601933,0
1,2,0,1,0,-1,0.168034,0.304483,0.287983,0.282896,0
2,2,0,1,1,-1,0.03289,0.304483,0.287983,0.601933,0
3,0,0,1,0,-1,0.215257,0.304483,0.287983,0.601933,0
4,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896,0


In [21]:
freq_encoded_columns = ['job_encoded', 'month_encoded', 'contact_encoded', 'marital_encoded']

x_out = df_encoded[numeric_columns]

x_out.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0


In [22]:
# Applying IQR
Q1 = x_out.quantile(0.25)
Q3 = x_out.quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = ((x_out < (Q1 - 1.5 * IQR)) | (x_out > (Q3 + 1.5 * IQR))).any(axis=1)

# Applying Isolation Forest
iso_forest = IsolationForest(random_state=527)
outliers_iso_forest = iso_forest.fit_predict(x_out) == -1

# Applying Local Outlier Factor
lof = LocalOutlierFactor()
outliers_lof = lof.fit_predict(x_out) == -1

# Counting the number of outliers
outliers_count = {
    "IQR": np.sum(outliers_iqr),
    "Isolation Forest": np.sum(outliers_iso_forest),
    "Local Outlier Factor": np.sum(outliers_lof)
}

In [23]:
outliers_count

{'IQR': 17018, 'Isolation Forest': 5387, 'Local Outlier Factor': 807}

In [24]:
# Original encoded dataset
df_encoded_original = df_encoded.copy()

# Encoded dataset without IQR detected outliers
df_encoded_without_iqr_outliers = df_encoded[~outliers_iqr]

# Encoded dataset without Isolation Forest detected outliers
df_encoded_without_iso_forest_outliers = df_encoded[~outliers_iso_forest]

# Encoded dataset without LOF detected outliers
df_encoded_without_lof_outliers = df_encoded[~outliers_lof]

datasets_shapes = {
    "Original Encoded Dataset": df_encoded_original.shape,
    "Without IQR Outliers": df_encoded_without_iqr_outliers.shape,
    "Without Isolation Forest Outliers": df_encoded_without_iso_forest_outliers.shape,
    "Without LOF Outliers": df_encoded_without_lof_outliers.shape
}

In [25]:
datasets_shapes

{'Original Encoded Dataset': (45211, 17),
 'Without IQR Outliers': (28193, 17),
 'Without Isolation Forest Outliers': (39824, 17),
 'Without LOF Outliers': (44404, 17)}

In [35]:
# orginial encoded data
X = df_encoded_original.drop('y_encoded', axis=1)
y = df_encoded_original['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [27]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb_accuracy = gnb.score(X_test, y_test)

gnb_accuracy

0.8406812149808316

In [28]:
hgb = HistGradientBoostingClassifier(random_state=527)

hgb.fit(X_train, y_train)

hgb_accuracy = hgb.score(X_test, y_test)

hgb_accuracy

0.9076968445886169

In [29]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

sgd_accuracy = sgd.score(X_test, y_test)

sgd_accuracy

0.8627248599233265

In [30]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

sgd_rbf_accuracy

0.8836626363904453

In [91]:
# data without isolation forest
X = df_encoded_without_iso_forest_outliers.drop('y_encoded', axis=1)
y = df_encoded_without_iso_forest_outliers['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [92]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb_accuracy = gnb.score(X_test, y_test)

gnb_accuracy

0.8654168061600268

In [93]:
hgb = HistGradientBoostingClassifier(random_state=527)

hgb.fit(X_train, y_train)

hgb_accuracy = hgb.score(X_test, y_test)

hgb_accuracy

0.9208235687981252

In [94]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

sgd_accuracy = sgd.score(X_test, y_test)

sgd_accuracy

0.6257951121526615

In [95]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

sgd_rbf_accuracy

0.9066789420823569

In [96]:
# data without lof
X = df_encoded_without_lof_outliers.drop('y_encoded', axis=1)
y = df_encoded_without_lof_outliers['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [97]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb_accuracy = gnb.score(X_test, y_test)

gnb_accuracy

0.8467197117549917

In [98]:
hgb = HistGradientBoostingClassifier(random_state=527)

hgb.fit(X_train, y_train)

hgb_accuracy = hgb.score(X_test, y_test)

hgb_accuracy

0.9092478606815794

In [99]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

sgd_accuracy = sgd.score(X_test, y_test)

sgd_accuracy

0.8280288245008257

In [100]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

sgd_rbf_accuracy

0.8859780813691638