In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.mixture import BayesianGaussianMixture

ModuleNotFoundError: No module named 'pycaret'

In [2]:
# Load the dataset
filename = "bank-full.csv"
df = pd.read_csv(filename, delimiter=';')
df_original = pd.read_csv(filename, delimiter=';')
print(df.head())
df.describe(include='all')

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
unique,,12,3,4,2,,2,2,3,,12,,,,,4,2
top,,blue-collar,married,secondary,no,,yes,no,cellular,,may,,,,,unknown,no
freq,,9732,27214,23202,44396,,25130,37967,29285,,13766,,,,,36959,39922
mean,40.93621,,,,,1362.272058,,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
std,10.618762,,,,,3044.765829,,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
min,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
25%,33.0,,,,,72.0,,,,8.0,,103.0,1.0,-1.0,0.0,,
50%,39.0,,,,,448.0,,,,16.0,,180.0,2.0,-1.0,0.0,,
75%,48.0,,,,,1428.0,,,,21.0,,319.0,3.0,-1.0,0.0,,


In [3]:
# Data Cleaning
missing_values = df.isnull().sum()

unknown_values = (df == 'unknown').sum()

# nan_values = (df == 'NaN').sum()

duplicates = df.duplicated().sum()

# print(missing_values)
# print(unknown_values)
# print(nan_values)
# print(duplicates)

In [4]:
categorical_columns = df.select_dtypes(include=['object']).columns

In [5]:
unique_values_info = {}
for col in categorical_columns:
    unique_counts = df[col].value_counts()
    unique_values_info[col] = unique_counts

In [6]:
# unique_values_info

In [7]:
df_uni_encoding = pd.DataFrame()

In [8]:
# Label encoding for 'education'
education_mapping = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
df_uni_encoding['education_encoded'] = df['education'].map(education_mapping)

In [9]:
# Label encoding for 'default', 'housing', 'loan', 'y' 
binary_mapping = {'no': 0, 'yes': 1}
columns_to_encode = ['default', 'housing', 'loan', 'y']

for col in columns_to_encode:
    df_uni_encoding[col + '_encoded'] = df[col].map(binary_mapping)

In [10]:
df['poutcome'] = df['poutcome'].replace(['unknown', 'other'], 'others')

# Label encoding for 'poutcome' with the specified mapping
poutcome_mapping = {'failure': 0, 'success': 1, 'others': -1}
df_uni_encoding['poutcome_encoded'] = df['poutcome'].map(poutcome_mapping)

In [11]:
# Frequency encoding for 'job' and 'month'
job_freq = df['job'].value_counts(normalize=True)
month_freq = df['month'].value_counts(normalize=True)
contact_freq = df['contact'].value_counts(normalize=True)
marital_freq = df['marital'].value_counts(normalize=True)

df_uni_encoding['job_encoded'] = df['job'].map(job_freq)
df_uni_encoding['month_encoded'] = df['month'].map(month_freq)
df_uni_encoding['contact_encoded'] = df['contact'].map(contact_freq)
df_uni_encoding['marital_encoded'] = df['marital'].map(marital_freq)

In [12]:
df_uni_encoding.head()

Unnamed: 0,education_encoded,default_encoded,housing_encoded,loan_encoded,y_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded
0,3,0,1,0,0,-1,0.209197,0.304483,0.287983,0.601933
1,2,0,1,0,0,-1,0.168034,0.304483,0.287983,0.282896
2,2,0,1,1,0,-1,0.03289,0.304483,0.287983,0.601933
3,0,0,1,0,0,-1,0.215257,0.304483,0.287983,0.601933
4,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896


In [13]:
encoded_columns = [
    'education_encoded', 'default_encoded', 'housing_encoded', 'loan_encoded', 
    'poutcome_encoded', 'job_encoded', 'month_encoded', 
    'contact_encoded', 'marital_encoded', 'y_encoded'
]

numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

df_numer = df[numeric_columns]

df_numer.columns.tolist()

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [14]:
df_uni = pd.concat([df_numer, df_uni_encoding], axis = 1)

In [15]:
df_uni.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,education_encoded,default_encoded,housing_encoded,loan_encoded,y_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded
0,58,2143,5,261,1,-1,0,3,0,1,0,0,-1,0.209197,0.304483,0.287983,0.601933
1,44,29,5,151,1,-1,0,2,0,1,0,0,-1,0.168034,0.304483,0.287983,0.282896
2,33,2,5,76,1,-1,0,2,0,1,1,0,-1,0.03289,0.304483,0.287983,0.601933
3,47,1506,5,92,1,-1,0,0,0,1,0,0,-1,0.215257,0.304483,0.287983,0.601933
4,33,1,5,198,1,-1,0,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896


In [16]:
df_uni.shape

(45211, 17)

In [17]:
x_out = df_uni[numeric_columns]

x_out.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0


In [18]:
# Applying IQR
Q1 = x_out.quantile(0.25)
Q3 = x_out.quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = ((x_out < (Q1 - 1.5 * IQR)) | (x_out > (Q3 + 1.5 * IQR))).any(axis=1)

# Applying Isolation Forest
iso_forest = IsolationForest(random_state=527)
outliers_iso_forest = iso_forest.fit_predict(x_out) == -1

# Applying Local Outlier Factor
lof = LocalOutlierFactor()
outliers_lof = lof.fit_predict(x_out) == -1

# Counting the number of outliers
outliers_count = {
    "IQR": np.sum(outliers_iqr),
    "Isolation Forest": np.sum(outliers_iso_forest),
    "Local Outlier Factor": np.sum(outliers_lof)
}

In [19]:
outliers_count

{'IQR': 17018, 'Isolation Forest': 5387, 'Local Outlier Factor': 807}

In [20]:
# Original encoded dataset
df_uni_ori = df_uni.copy()

# Encoded dataset without IQR detected outliers
df_uni_without_iqr_outliers = df_uni[~outliers_iqr]

# Encoded dataset without Isolation Forest detected outliers
df_uni_without_iso_forest_outliers = df_uni[~outliers_iso_forest]

# Encoded dataset without LOF detected outliers
df_uni_without_lof_outliers = df_uni[~outliers_lof]

datasets_shapes = {
    "Original Encoded Dataset": df_uni_ori.shape,
    "Without IQR Outliers": df_uni_without_iqr_outliers.shape,
    "Without Isolation Forest Outliers": df_uni_without_iso_forest_outliers.shape,
    "Without LOF Outliers": df_uni_without_lof_outliers.shape
}

In [21]:
datasets_shapes

{'Original Encoded Dataset': (45211, 17),
 'Without IQR Outliers': (28193, 17),
 'Without Isolation Forest Outliers': (39824, 17),
 'Without LOF Outliers': (44404, 17)}

In [28]:
def score_calculation(pd, te):

    if len(pd) != len(te):
        
        accuracy_score = "Lengths Error"
        
    else:

        matches = sum([1 for pd,te in zip(pd, te) if pd == te])

    accuracy_score = matches / len(pd)
    
    return accuracy_score

In [29]:
# uni encoded data
X = df_uni_ori.drop('y_encoded', axis=1)
y = df_uni_ori['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [None]:
svm = SVC(kernel='linear', probability=True) 

svm.fit(X_train, y_train)

svm_accuracy = svm.score(X_test, y_test)

svm_accuracy

In [32]:
lr = LogisticRegression(max_iter=1000)  

lr.fit(X_train, y_train)

lr_accuracy = lr.score(X_test, y_test)

lr_accuracy

0.8973016809200826

In [33]:
rf = RandomForestClassifier(random_state=42)
    
rf.fit(X_train, y_train)

rf_accuracy = rf.score(X_test, y_test)

rf_accuracy 

0.9070333235033913

In [34]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb_accuracy = gnb.score(X_test, y_test)

gnb_accuracy

0.8406812149808316

In [35]:
hgb = HistGradientBoostingClassifier(random_state=527)

hgb.fit(X_train, y_train)

hgb_accuracy = hgb.score(X_test, y_test)

hgb_accuracy

0.9076968445886169

In [36]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

sgd_accuracy = sgd.score(X_test, y_test)

sgd_accuracy

0.8627248599233265

In [37]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

sgd_rbf_accuracy

0.8836626363904453

In [38]:
# uni data without isolation forest
X = df_uni_without_iso_forest_outliers.drop('y_encoded', axis=1)
y = df_uni_without_iso_forest_outliers['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [None]:
svm = SVC(kernel='linear', probability=True) 

svm.fit(X_train, y_train)

svm_accuracy = svm.score(X_test, y_test)

svm_accuracy

In [39]:
lr = LogisticRegression(max_iter=1000)  

lr.fit(X_train, y_train)

lr_accuracy = lr.score(X_test, y_test)

lr_accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9146300636089723

In [40]:
rf = RandomForestClassifier(random_state=42)
    
rf.fit(X_train, y_train)

rf_accuracy = rf.score(X_test, y_test)

rf_accuracy 

0.9207398727820556

In [41]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb_accuracy = gnb.score(X_test, y_test)

gnb_accuracy

0.8654168061600268

In [42]:
hgb = HistGradientBoostingClassifier(random_state=527)

hgb.fit(X_train, y_train)

hgb_accuracy = hgb.score(X_test, y_test)

hgb_accuracy

0.9208235687981252

In [43]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

sgd_accuracy = sgd.score(X_test, y_test)

sgd_accuracy

0.6257951121526615

In [44]:
# uni data without lof
X = df_uni_without_lof_outliers.drop('y_encoded', axis=1)
y = df_uni_without_lof_outliers['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [None]:
svm = SVC(kernel='linear', probability=True) 

svm.fit(X_train, y_train)

svm_accuracy = svm.score(X_test, y_test)

svm_accuracy

In [45]:
lr = LogisticRegression(max_iter=1000)  

lr.fit(X_train, y_train)

lr_accuracy = lr.score(X_test, y_test)

lr_accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9002402041735476

In [46]:
rf = RandomForestClassifier(random_state=42)
    
rf.fit(X_train, y_train)

rf_accuracy = rf.score(X_test, y_test)

rf_accuracy 

0.9065455637291698

In [47]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb_accuracy = gnb.score(X_test, y_test)

gnb_accuracy

0.8467197117549917

In [48]:
hgb = HistGradientBoostingClassifier(random_state=527)

hgb.fit(X_train, y_train)

hgb_accuracy = hgb.score(X_test, y_test)

hgb_accuracy

0.9092478606815794

In [49]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

sgd_accuracy = sgd.score(X_test, y_test)

sgd_accuracy

0.8280288245008257

In [53]:
df_label_encoding = pd.DataFrame()

In [55]:
le = LabelEncoder()
    
for col in df.select_dtypes(include=['object']).columns:
    if col not in ['age', 'balance', 'duration']:  
        df_label_encoding[col] = le.fit_transform(df[col].astype(str))  

In [58]:
df_label = pd.concat([df_numer, df_label_encoding], axis = 1)

In [59]:
df_label.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,58,2143,5,261,1,-1,0,4,1,2,0,1,0,2,8,1,0
1,44,29,5,151,1,-1,0,9,2,1,0,1,0,2,8,1,0
2,33,2,5,76,1,-1,0,2,1,1,0,1,1,2,8,1,0
3,47,1506,5,92,1,-1,0,1,1,3,0,1,0,2,8,1,0
4,33,1,5,198,1,-1,0,11,2,3,0,0,0,2,8,1,0


In [60]:
df_label.shape

(45211, 17)

In [68]:
df_without_y = df.drop("y", axis=1)
df_without_y.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,others
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,others
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,others
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,others
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,others


In [81]:
categorical_columns_without_y = categorical_columns.drop("y")

In [83]:
categorical_columns_without_y

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

In [89]:
df_oh = pd.get_dummies(df_without_y, columns=categorical_columns_without_y, drop_first=True)

df_oh.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_others,poutcome_success
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,33,2,5,76,1,-1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
3,47,1506,5,92,1,-1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [90]:
df_oh.shape

(45211, 41)