In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pycaret.classification import *

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.mixture import BayesianGaussianMixture
import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

from imblearn.combine import SMOTETomek
from collections import Counter

In [4]:
# Load the dataset
filename = 'bank-full.csv'
df = pd.read_csv(filename, delimiter=';')
df_original = pd.read_csv(filename, delimiter=';')
print(df.head())
df.describe(include='all')

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
unique,,12,3,4,2,,2,2,3,,12,,,,,4,2
top,,blue-collar,married,secondary,no,,yes,no,cellular,,may,,,,,unknown,no
freq,,9732,27214,23202,44396,,25130,37967,29285,,13766,,,,,36959,39922
mean,40.93621,,,,,1362.272058,,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
std,10.618762,,,,,3044.765829,,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
min,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
25%,33.0,,,,,72.0,,,,8.0,,103.0,1.0,-1.0,0.0,,
50%,39.0,,,,,448.0,,,,16.0,,180.0,2.0,-1.0,0.0,,
75%,48.0,,,,,1428.0,,,,21.0,,319.0,3.0,-1.0,0.0,,


In [5]:






# Data Preprocessing







In [6]:
# Data Cleaning
missing_values = df.isnull().sum()

unknown_values = (df == 'unknown').sum()

# nan_values = (df == 'NaN').sum()

duplicates = df.duplicated().sum()

# print(missing_values)
# print(unknown_values)
# print(nan_values)
# print(duplicates)

In [7]:
categorical_columns = df.select_dtypes(include=['object']).columns

In [8]:
unique_values_info = {}
for col in categorical_columns:
    unique_counts = df[col].value_counts()
    unique_values_info[col] = unique_counts

In [9]:
# unique_values_info

In [10]:








# Unique encoding









In [11]:
df_uni_encoding = pd.DataFrame()

In [12]:
# Label encoding for 'education'
education_mapping = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
df_uni_encoding['education_encoded'] = df['education'].map(education_mapping)

In [13]:
# Label encoding for 'default', 'housing', 'loan', 'y' 
binary_mapping = {'no': 0, 'yes': 1}
columns_to_encode = ['default', 'housing', 'loan', 'y']

for col in columns_to_encode:
    df_uni_encoding[col + '_encoded'] = df[col].map(binary_mapping)

In [14]:
df['poutcome'] = df['poutcome'].replace(['unknown', 'other'], 'others')

# Label encoding for 'poutcome' with the specified mapping
poutcome_mapping = {'failure': 0, 'success': 1, 'others': -1}
df_uni_encoding['poutcome_encoded'] = df['poutcome'].map(poutcome_mapping)

In [15]:
# Frequency encoding for 'job' and 'month'
job_freq = df['job'].value_counts(normalize=True)
month_freq = df['month'].value_counts(normalize=True)
contact_freq = df['contact'].value_counts(normalize=True)
marital_freq = df['marital'].value_counts(normalize=True)

df_uni_encoding['job_encoded'] = df['job'].map(job_freq)
df_uni_encoding['month_encoded'] = df['month'].map(month_freq)
df_uni_encoding['contact_encoded'] = df['contact'].map(contact_freq)
df_uni_encoding['marital_encoded'] = df['marital'].map(marital_freq)

In [16]:
df_uni_encoding.head()

Unnamed: 0,education_encoded,default_encoded,housing_encoded,loan_encoded,y_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded
0,3,0,1,0,0,-1,0.209197,0.304483,0.287983,0.601933
1,2,0,1,0,0,-1,0.168034,0.304483,0.287983,0.282896
2,2,0,1,1,0,-1,0.03289,0.304483,0.287983,0.601933
3,0,0,1,0,0,-1,0.215257,0.304483,0.287983,0.601933
4,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896


In [17]:
encoded_columns = [
    'education_encoded', 'default_encoded', 'housing_encoded', 'loan_encoded', 
    'poutcome_encoded', 'job_encoded', 'month_encoded', 
    'contact_encoded', 'marital_encoded', 'y_encoded'
]

numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

df_numer = df[numeric_columns]

df_numer.columns.tolist()

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [18]:
df_uni = pd.concat([df_numer, df_uni_encoding], axis = 1)

In [19]:
df_uni.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,education_encoded,default_encoded,housing_encoded,loan_encoded,y_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded
0,58,2143,5,261,1,-1,0,3,0,1,0,0,-1,0.209197,0.304483,0.287983,0.601933
1,44,29,5,151,1,-1,0,2,0,1,0,0,-1,0.168034,0.304483,0.287983,0.282896
2,33,2,5,76,1,-1,0,2,0,1,1,0,-1,0.03289,0.304483,0.287983,0.601933
3,47,1506,5,92,1,-1,0,0,0,1,0,0,-1,0.215257,0.304483,0.287983,0.601933
4,33,1,5,198,1,-1,0,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896


In [20]:
df_uni.shape

(45211, 17)

In [21]:
df_uni_ori = df_uni.copy()

In [22]:
def score_calculation(pd, te):

    if len(pd) != len(te):
        
        accuracy_score = "Lengths Error"
        
    else:

        matches = sum([1 for pd,te in zip(pd, te) if pd == te])

    accuracy_score = matches / len(pd)
    
    return accuracy_score

In [23]:
def calculate_mcc(y_true, y_pred):

    mcc = matthews_corrcoef(y_true, y_pred)
    
    return mcc

In [24]:







# one-hot encoding









In [25]:
df_without_y = df.drop("y", axis=1)
df_without_y.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,others
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,others
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,others
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,others
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,others


In [26]:
categorical_columns_without_y = categorical_columns.drop("y")

In [27]:
categorical_columns_without_y

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

In [28]:
df_oh = pd.get_dummies(df_without_y, columns=categorical_columns_without_y, drop_first=True)

df_oh.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_others,poutcome_success
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,33,2,5,76,1,-1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
3,47,1506,5,92,1,-1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [29]:
df_oh.shape

(45211, 41)

In [30]:
df_oh = pd.concat([df_oh, df_uni['y_encoded']], axis = 1)

In [31]:
df_oh.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_others,poutcome_success,y_encoded
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,33,2,5,76,1,-1,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
3,47,1506,5,92,1,-1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [32]:
df_oh_ori = df_oh.copy()

In [33]:







# Outlier Detection for Unique Encoding







In [34]:
x_out = df_uni.copy()

x_out.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,education_encoded,default_encoded,housing_encoded,loan_encoded,y_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded
0,58,2143,5,261,1,-1,0,3,0,1,0,0,-1,0.209197,0.304483,0.287983,0.601933
1,44,29,5,151,1,-1,0,2,0,1,0,0,-1,0.168034,0.304483,0.287983,0.282896
2,33,2,5,76,1,-1,0,2,0,1,1,0,-1,0.03289,0.304483,0.287983,0.601933
3,47,1506,5,92,1,-1,0,0,0,1,0,0,-1,0.215257,0.304483,0.287983,0.601933
4,33,1,5,198,1,-1,0,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896


In [35]:
df_uni.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,education_encoded,default_encoded,housing_encoded,loan_encoded,y_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded
0,58,2143,5,261,1,-1,0,3,0,1,0,0,-1,0.209197,0.304483,0.287983,0.601933
1,44,29,5,151,1,-1,0,2,0,1,0,0,-1,0.168034,0.304483,0.287983,0.282896
2,33,2,5,76,1,-1,0,2,0,1,1,0,-1,0.03289,0.304483,0.287983,0.601933
3,47,1506,5,92,1,-1,0,0,0,1,0,0,-1,0.215257,0.304483,0.287983,0.601933
4,33,1,5,198,1,-1,0,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896


In [36]:
# Applying IQR
Q1 = x_out.quantile(0.25)
Q3 = x_out.quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = ((x_out < (Q1 - 1.5 * IQR)) | (x_out > (Q3 + 1.5 * IQR))).any(axis=1)

# Applying Isolation Forest
iso_forest = IsolationForest(random_state=527)
outliers_iso_forest = iso_forest.fit_predict(x_out) == -1

# Applying Local Outlier Factor
lof = LocalOutlierFactor()
outliers_lof = lof.fit_predict(x_out) == -1

# Counting the number of outliers
outliers_count = {
    "IQR": np.sum(outliers_iqr),
    "Isolation Forest": np.sum(outliers_iso_forest),
    "Local Outlier Factor": np.sum(outliers_lof)
}

In [37]:
outliers_count

{'IQR': 24751, 'Isolation Forest': 8301, 'Local Outlier Factor': 801}

In [38]:
# Encoded dataset without IQR detected outliers
df_uni_without_iqr_outliers = df_uni[~outliers_iqr]

# Encoded dataset without Isolation Forest detected outliers
df_uni_without_iso_forest_outliers = df_uni[~outliers_iso_forest]

# Encoded dataset without LOF detected outliers
df_uni_without_lof_outliers = df_uni[~outliers_lof]

datasets_shapes = {
    "Original Encoded Dataset": df_uni_ori.shape,
    "Without IQR Outliers": df_uni_without_iqr_outliers.shape,
    "Without Isolation Forest Outliers": df_uni_without_iso_forest_outliers.shape,
    "Without LOF Outliers": df_uni_without_lof_outliers.shape
}

In [39]:
datasets_shapes

{'Original Encoded Dataset': (45211, 17),
 'Without IQR Outliers': (20460, 17),
 'Without Isolation Forest Outliers': (36910, 17),
 'Without LOF Outliers': (44410, 17)}

In [40]:






# Outlier Detection for One-Hot Encoding








In [41]:
x_out = df_oh.copy()

x_out.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_others,poutcome_success,y_encoded
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,33,2,5,76,1,-1,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
3,47,1506,5,92,1,-1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [42]:
df_oh.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_others,poutcome_success,y_encoded
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,33,2,5,76,1,-1,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
3,47,1506,5,92,1,-1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [43]:
# Applying IQR
Q1 = x_out.quantile(0.25)
Q3 = x_out.quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = ((x_out < (Q1 - 1.5 * IQR)) | (x_out > (Q3 + 1.5 * IQR))).any(axis=1)

# Applying Isolation Forest
iso_forest = IsolationForest(random_state=527)
outliers_iso_forest = iso_forest.fit_predict(x_out) == -1

# Applying Local Outlier Factor
lof = LocalOutlierFactor()
outliers_lof = lof.fit_predict(x_out) == -1

# Counting the number of outliers
outliers_count = {
    "IQR": np.sum(outliers_iqr),
    "Isolation Forest": np.sum(outliers_iso_forest),
    "Local Outlier Factor": np.sum(outliers_lof)
}

In [44]:
outliers_count

{'IQR': 44163, 'Isolation Forest': 2488, 'Local Outlier Factor': 795}

In [45]:
# Encoded dataset without IQR detected outliers
df_oh_without_iqr_outliers = df_oh[~outliers_iqr]

# Encoded dataset without Isolation Forest detected outliers
df_oh_without_iso_forest_outliers = df_oh[~outliers_iso_forest]

# Encoded dataset without LOF detected outliers
df_oh_without_lof_outliers = df_oh[~outliers_lof]

datasets_shapes = {
    "Original Encoded Dataset": df_oh_ori.shape,
    "Without IQR Outliers": df_oh_without_iqr_outliers.shape,
    "Without Isolation Forest Outliers": df_oh_without_iso_forest_outliers.shape,
    "Without LOF Outliers": df_oh_without_lof_outliers.shape
}

In [46]:
datasets_shapes

{'Original Encoded Dataset': (45211, 42),
 'Without IQR Outliers': (1048, 42),
 'Without Isolation Forest Outliers': (42723, 42),
 'Without LOF Outliers': (44416, 42)}

In [47]:








# Unique Encoding Original Dataset








In [183]:
clf = setup(df_uni_ori, target = "y_encoded")

Unnamed: 0,Description,Value
0,Session id,4284
1,Target,y_encoded
2,Target type,Binary
3,Original data shape,"(45211, 17)"
4,Transformed data shape,"(45211, 17)"
5,Transformed train set shape,"(31647, 17)"
6,Transformed test set shape,"(13564, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


In [184]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9094,0.9343,0.497,0.6478,0.5621,0.5126,0.5184,0.053
rf,Random Forest Classifier,0.9068,0.9272,0.433,0.6539,0.5208,0.4716,0.484,0.375
gbc,Gradient Boosting Classifier,0.9065,0.9216,0.4252,0.6556,0.5155,0.4663,0.4799,0.544
et,Extra Trees Classifier,0.9048,0.9231,0.3704,0.6676,0.476,0.4284,0.4511,0.251
ada,Ada Boost Classifier,0.9009,0.9076,0.3755,0.6278,0.4697,0.4188,0.4358,0.133
lr,Logistic Regression,0.8985,0.8713,0.3061,0.6379,0.4134,0.3653,0.3951,0.326
lda,Linear Discriminant Analysis,0.8983,0.8793,0.3984,0.5985,0.4782,0.4244,0.4353,0.013
ridge,Ridge Classifier,0.8953,0.0,0.1985,0.6817,0.3072,0.2686,0.3297,0.01
dummy,Dummy Classifier,0.883,0.5,0.0,0.0,0.0,0.0,0.0,0.007
knn,K Neighbors Classifier,0.8807,0.765,0.2661,0.4823,0.3426,0.283,0.2984,0.176


In [105]:
best_model

In [48]:
# uni encoded data
X = df_uni_ori.drop('y_encoded', axis=1)
y = df_uni_ori['y_encoded']

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [50]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_accuracy = sgd.score(X_test, y_test)

print(f"Accuracy: {sgd_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.8627248599233265
Matthews Correlation Coefficient: -0.0380907036237233


In [51]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

y_pred = sgd_rbf.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

print(f"Accuracy: {sgd_rbf_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.8836626363904453
Matthews Correlation Coefficient: 0.0


In [185]:
model = LGBMClassifier(verbose = -100)
gridParams = {
    'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.1],
    'n_estimators': [40, 80, 100, 200, 400],
    'num_leaves': [20, 30, 40, 50, 70, 100],
}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

Mean Best Score: 0.9088 ± 0.0009
Best parameters per fold: [{'learning_rate': 0.02, 'n_estimators': 400, 'num_leaves': 30}, {'learning_rate': 0.02, 'n_estimators': 400, 'num_leaves': 40}, {'learning_rate': 0.1, 'n_estimators': 80, 'num_leaves': 40}, {'learning_rate': 0.1, 'n_estimators': 80, 'num_leaves': 40}, {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 30}]


In [186]:
model = LGBMClassifier(verbose = -100)
gridParams = {
    'learning_rate': [0.02, 0.1],
    'n_estimators': [80, 100, 400],
    'num_leaves': [30, 40],
}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

Mean Best Score: 0.9088 ± 0.0009
Best parameters per fold: [{'learning_rate': 0.02, 'n_estimators': 400, 'num_leaves': 30}, {'learning_rate': 0.02, 'n_estimators': 400, 'num_leaves': 40}, {'learning_rate': 0.1, 'n_estimators': 80, 'num_leaves': 40}, {'learning_rate': 0.1, 'n_estimators': 80, 'num_leaves': 40}, {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 30}]


In [187]:
model = LGBMClassifier(verbose = -100)
gridParams = {
    'learning_rate': [0.02, 0.1],
    'n_estimators': [80, 400],
    'num_leaves': [30, 40],
}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

Mean Best Score: 0.9088 ± 0.0010
Best parameters per fold: [{'learning_rate': 0.02, 'n_estimators': 400, 'num_leaves': 30}, {'learning_rate': 0.02, 'n_estimators': 400, 'num_leaves': 40}, {'learning_rate': 0.1, 'n_estimators': 80, 'num_leaves': 40}, {'learning_rate': 0.1, 'n_estimators': 80, 'num_leaves': 40}, {'learning_rate': 0.1, 'n_estimators': 80, 'num_leaves': 30}]


In [188]:
optim_lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['l2', 'auc'],
    'num_leaves': 40,  
    'learning_rate': 0.1,  
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -100,
    'n_estimators': 80
}

In [189]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [190]:
lgbm = lgb.train(
    optim_lgbm_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[79]	training's l2: 0.049948	training's auc: 0.961218	valid_1's l2: 0.0577393	valid_1's auc: 0.941663


In [191]:
y_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

mcc = calculate_mcc(y_test, y_pred_binary)

lgbm_accuracy = accuracy_score(y_test, y_pred_binary)

print(f"Accuracy: {lgbm_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.938066799380668
Matthews Correlation Coefficient: 0.6724446584781423


In [52]:
model = RandomForestClassifier()

gridParams = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [10, 20, 30, None],
    'criterion':['gini', 'entropy']
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

(0.9066707923372995,
 0.0016161209659458487,
 [{'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 150},
  {'criterion': 'gini', 'max_depth': 30, 'n_estimators': 200},
  {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 150}])

In [53]:
model = HistGradientBoostingClassifier()

gridParams = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [50, 100, 200],
    'max_leaf_nodes': [30, 31, 32] 
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)  
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527) 

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params


(0.9087498727166734,
 0.0011123545142593372,
 [{'learning_rate': 0.1, 'max_iter': 100, 'max_leaf_nodes': 32},
  {'learning_rate': 0.05, 'max_iter': 200, 'max_leaf_nodes': 31},
  {'learning_rate': 0.1, 'max_iter': 200, 'max_leaf_nodes': 31}])

In [110]:







# Unique Encoding without IF Outlier Dataset








In [111]:
clf2 = setup(df_uni_without_iso_forest_outliers, target = "y_encoded")

Unnamed: 0,Description,Value
0,Session id,7358
1,Target,y_encoded
2,Target type,Binary
3,Original data shape,"(36910, 17)"
4,Transformed data shape,"(36910, 17)"
5,Transformed train set shape,"(25837, 17)"
6,Transformed test set shape,"(11073, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


In [112]:
best_model2 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9522,0.9236,0.1558,0.6022,0.2465,0.2306,0.2889,0.163
rf,Random Forest Classifier,0.9521,0.9266,0.1527,0.5961,0.2422,0.2265,0.2845,0.248
lightgbm,Light Gradient Boosting Machine,0.9521,0.9379,0.2625,0.5545,0.3557,0.3342,0.3599,0.038
gbc,Gradient Boosting Classifier,0.951,0.9243,0.1474,0.5584,0.232,0.2154,0.2684,0.433
dummy,Dummy Classifier,0.9496,0.5,0.0,0.0,0.0,0.0,0.0,0.006
ridge,Ridge Classifier,0.9492,0.0,0.0015,0.05,0.003,0.0019,0.0046,0.007
lr,Logistic Regression,0.9482,0.8763,0.0645,0.4174,0.1114,0.0991,0.148,0.144
ada,Ada Boost Classifier,0.9475,0.9098,0.1712,0.445,0.2466,0.2251,0.2533,0.106
knn,K Neighbors Classifier,0.945,0.6491,0.0499,0.2653,0.0834,0.0685,0.0952,0.118
lda,Linear Discriminant Analysis,0.9421,0.8891,0.1604,0.3462,0.2186,0.1926,0.2085,0.012


In [113]:
best_model2

In [54]:
# uni data without isolation forest
X = df_uni_without_iso_forest_outliers.drop('y_encoded', axis=1)
y = df_uni_without_iso_forest_outliers['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [115]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_accuracy = sgd.score(X_test, y_test)

print(f"Accuracy: {sgd_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.9064390860652036
Matthews Correlation Coefficient: 0.057699563355799746


In [116]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

y_pred = sgd_rbf.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

print(f"Accuracy: {sgd_rbf_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.9499683915831302
Matthews Correlation Coefficient: 0.0


In [55]:
model = LGBMClassifier()

gridParams = {
    'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.1],
    'n_estimators': [40, 80, 100, 200, 400],
    'num_leaves': [20, 30, 40, 50, 70, 100],
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

[LightGBM] [Info] Number of positive: 1248, number of negative: 23358
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001441 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 941
[LightGBM] [Info] Number of data points in the train set: 24606, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.050719 -> initscore=-2.929397
[LightGBM] [Info] Start training from score -2.929397
[LightGBM] [Info] Number of positive: 1213, number of negative: 23394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 937
[LightGBM] [Info] Number of data points in the train set: 24607, number of used features: 16
[LightGBM] [Info] [bin

In [None]:
optim_lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['l2', 'auc'],
    'num_leaves': 30,  
    'learning_rate': 0.1,  
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'n_estimators': 80 
}

In [None]:
lgbm_model = lgb.train(
    optim_lgbm_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

In [None]:
y_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
lgbm_accuracy

In [56]:
model = RandomForestClassifier()

gridParams = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [10, 20, 30, None],
    'criterion':['gini', 'entropy']
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

(0.952153898879691,
 0.0001376266123272232,
 [{'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 200},
  {'criterion': 'gini', 'max_depth': 30, 'n_estimators': 150},
  {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 150}])

In [57]:
model = HistGradientBoostingClassifier()

gridParams = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [50, 100, 200],
    'max_leaf_nodes': [30, 31, 32] 
}

# Inner and outer cross-validation settings
inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing

best_scores = []
best_params = []

# Nested cross-validation
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

(0.953481490585387,
 0.00016285887081530901,
 [{'learning_rate': 0.1, 'max_iter': 100, 'max_leaf_nodes': 32},
  {'learning_rate': 0.1, 'max_iter': 200, 'max_leaf_nodes': 31},
  {'learning_rate': 0.05, 'max_iter': 100, 'max_leaf_nodes': 30}])

In [117]:








# Unique Encoding without LOF Outlier Dataset








In [118]:
clf3 = setup(df_uni_without_lof_outliers, target = 'y_encoded')

Unnamed: 0,Description,Value
0,Session id,3929
1,Target,y_encoded
2,Target type,Binary
3,Original data shape,"(44410, 17)"
4,Transformed data shape,"(44410, 17)"
5,Transformed train set shape,"(31086, 17)"
6,Transformed test set shape,"(13324, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


In [119]:
best_model3 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9111,0.9338,0.4775,0.6443,0.5477,0.4997,0.507,0.05
gbc,Gradient Boosting Classifier,0.9084,0.9204,0.4202,0.6451,0.5083,0.4603,0.4735,0.552
rf,Random Forest Classifier,0.908,0.9284,0.4208,0.6419,0.5075,0.4593,0.4722,0.35
et,Extra Trees Classifier,0.905,0.9228,0.3544,0.645,0.4568,0.4097,0.4321,0.233
ada,Ada Boost Classifier,0.9011,0.9072,0.361,0.6042,0.4514,0.4009,0.4173,0.132
lda,Linear Discriminant Analysis,0.901,0.8796,0.3963,0.5927,0.4744,0.4222,0.4329,0.012
lr,Logistic Regression,0.9002,0.8716,0.2949,0.6222,0.3996,0.3528,0.3824,0.118
ridge,Ridge Classifier,0.8967,0.0,0.1755,0.6609,0.2768,0.2409,0.304,0.008
dummy,Dummy Classifier,0.8871,0.5,0.0,0.0,0.0,0.0,0.0,0.007
knn,K Neighbors Classifier,0.8839,0.7565,0.2516,0.4752,0.3284,0.2714,0.2882,0.165


In [120]:
best_model3

In [58]:
# uni data without lof
X = df_uni_without_lof_outliers.drop('y_encoded', axis=1)
y = df_uni_without_lof_outliers['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [122]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_accuracy = sgd.score(X_test, y_test)

print(f"Accuracy: {sgd_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.856413720633491
Matthews Correlation Coefficient: -0.011948821739090839


In [123]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

y_pred = sgd_rbf.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

print(f"Accuracy: {sgd_rbf_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.8877880357276889
Matthews Correlation Coefficient: 0.0


In [59]:
model = LGBMClassifier()
gridParams = {
    'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.1],
    'n_estimators': [40, 80, 100, 200, 400],
    'num_leaves': [20, 30, 40, 50, 70, 100],
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

[LightGBM] [Info] Number of positive: 3355, number of negative: 26251
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 982
[LightGBM] [Info] Number of data points in the train set: 29606, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113322 -> initscore=-2.057252
[LightGBM] [Info] Start training from score -2.057252
[LightGBM] [Info] Number of positive: 3304, number of negative: 26303
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002316 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 982
[LightGBM] [Info] Number of data points in the train set: 29607, number of used features: 16
[LightGBM] [Info] [bin

In [117]:
optim_lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['l2', 'auc'],
    'num_leaves': 30,  
    'learning_rate': 0.1,  
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'n_estimators': 80 
}

In [118]:
lgbm_model = lgb.train(
    optim_lgbm_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[75]	training's l2: 0.0503118	training's auc: 0.95715	valid_1's l2: 0.0608292	valid_1's auc: 0.936614


In [119]:
y_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
lgbm_accuracy

0.9291264103776772

In [60]:
model = RandomForestClassifier()

gridParams = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [10, 20, 30, None],
    'criterion':['gini', 'entropy']
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

(0.9103918803293972,
 0.0006680249099080734,
 [{'criterion': 'gini', 'max_depth': 20, 'n_estimators': 150},
  {'criterion': 'gini', 'max_depth': 30, 'n_estimators': 150},
  {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 150}])

In [61]:
model = HistGradientBoostingClassifier()

gridParams = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [50, 100, 200],
    'max_leaf_nodes': [30, 31, 32] 
}

# Inner and outer cross-validation settings
inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing

best_scores = []
best_params = []

# Nested cross-validation
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

(0.9114501163552097,
 0.0003501057794862471,
 [{'learning_rate': 0.1, 'max_iter': 200, 'max_leaf_nodes': 31},
  {'learning_rate': 0.05, 'max_iter': 200, 'max_leaf_nodes': 31},
  {'learning_rate': 0.1, 'max_iter': 200, 'max_leaf_nodes': 30}])

In [125]:








# One-Hot Encoding Original Dataset








In [126]:
clf4 = setup(df_oh_ori, target = 'y_encoded')

Unnamed: 0,Description,Value
0,Session id,8585
1,Target,y_encoded
2,Target type,Binary
3,Original data shape,"(45211, 42)"
4,Transformed data shape,"(45211, 42)"
5,Transformed train set shape,"(31647, 42)"
6,Transformed test set shape,"(13564, 42)"
7,Numeric features,41
8,Preprocess,True
9,Imputation type,simple


In [127]:
best_model4 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9074,0.9356,0.4908,0.6354,0.5536,0.503,0.5083,0.071
gbc,Gradient Boosting Classifier,0.9067,0.9251,0.4168,0.661,0.511,0.4623,0.4775,0.658
rf,Random Forest Classifier,0.9043,0.9278,0.3928,0.651,0.4894,0.4402,0.4576,0.349
et,Extra Trees Classifier,0.9008,0.915,0.3506,0.6397,0.4527,0.4032,0.4255,0.342
lda,Linear Discriminant Analysis,0.9005,0.907,0.4408,0.6025,0.5088,0.455,0.462,0.043
ada,Ada Boost Classifier,0.8998,0.9107,0.3809,0.6165,0.4707,0.4188,0.4337,0.183
lr,Logistic Regression,0.8997,0.8985,0.3268,0.6405,0.4324,0.3837,0.4101,0.449
ridge,Ridge Classifier,0.8994,0.0,0.2788,0.6684,0.3933,0.3484,0.3885,0.023
dummy,Dummy Classifier,0.883,0.5,0.0,0.0,0.0,0.0,0.0,0.021
knn,K Neighbors Classifier,0.8804,0.76,0.262,0.4791,0.3385,0.279,0.2945,0.17


In [128]:
best_model4

In [129]:
# uni data without lof
X = df_oh_ori.drop('y_encoded', axis=1)
y = df_oh_ori['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [130]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_accuracy = sgd.score(X_test, y_test)

print(f"Accuracy: {sgd_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.8850634031259216
Matthews Correlation Coefficient: 0.13370803121813407


In [131]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

y_pred = sgd_rbf.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

print(f"Accuracy: {sgd_rbf_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.8836626363904453
Matthews Correlation Coefficient: 0.0


In [None]:
model = LGBMClassifier()

gridParams = {
    'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.1],
    'n_estimators': [40, 80, 100, 200, 400],
    'num_leaves': [20, 30, 40, 50, 70, 100],
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

In [None]:
model = RandomForestClassifier()

gridParams = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [10, 20, 30, None],
    'criterion':['gini', 'entropy']
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

In [None]:
model = HistGradientBoostingClassifier()

gridParams = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [50, 100, 200],
    'max_leaf_nodes': [30, 31, 32] 
}

# Inner and outer cross-validation settings
inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing

best_scores = []
best_params = []

# Nested cross-validation
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

In [132]:










# One-Hot Encoding without LOF Outlier Dataset










In [133]:
clf5 = setup(df_oh_without_lof_outliers, target = 'y_encoded')

Unnamed: 0,Description,Value
0,Session id,8892
1,Target,y_encoded
2,Target type,Binary
3,Original data shape,"(44416, 42)"
4,Transformed data shape,"(44416, 42)"
5,Transformed train set shape,"(31091, 42)"
6,Transformed test set shape,"(13325, 42)"
7,Numeric features,41
8,Preprocess,True
9,Imputation type,simple


In [134]:
best_model5 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9101,0.9341,0.4903,0.6316,0.5519,0.5028,0.5079,0.068
gbc,Gradient Boosting Classifier,0.9093,0.9239,0.4088,0.6592,0.5045,0.4576,0.4736,0.66
rf,Random Forest Classifier,0.9069,0.9263,0.3789,0.6524,0.4786,0.4314,0.4511,0.351
lr,Logistic Regression,0.9028,0.9018,0.3254,0.6377,0.4296,0.3827,0.4093,0.434
lda,Linear Discriminant Analysis,0.9027,0.9064,0.4453,0.5932,0.5083,0.4556,0.4616,0.043
ridge,Ridge Classifier,0.9026,0.0,0.2689,0.672,0.3835,0.3411,0.3839,0.024
ada,Ada Boost Classifier,0.9026,0.9092,0.3735,0.6135,0.4641,0.414,0.4296,0.177
et,Extra Trees Classifier,0.9022,0.9129,0.3333,0.6271,0.4348,0.3867,0.4102,0.33
dummy,Dummy Classifier,0.8871,0.5,0.0,0.0,0.0,0.0,0.0,0.021
knn,K Neighbors Classifier,0.884,0.7626,0.2613,0.4753,0.3369,0.2792,0.2944,0.163


In [135]:
best_model5

In [139]:
# uni data without lof
X = df_oh_without_lof_outliers.drop('y_encoded', axis=1)
y = df_oh_without_lof_outliers['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [140]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_accuracy = sgd.score(X_test, y_test)

print(f"Accuracy: {sgd_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.7758348968105065
Matthews Correlation Coefficient: 0.23376431057501018


In [141]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

y_pred = sgd_rbf.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

print(f"Accuracy: {sgd_rbf_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.8894559099437148
Matthews Correlation Coefficient: 0.0


In [None]:
model = LGBMClassifier()

gridParams = {
    'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.1],
    'n_estimators': [40, 80, 100, 200, 400],
    'num_leaves': [20, 30, 40, 50, 70, 100],
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

In [None]:
model = RandomForestClassifier()

gridParams = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [10, 20, 30, None],
    'criterion':['gini', 'entropy']
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

In [None]:
model = HistGradientBoostingClassifier()

gridParams = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [50, 100, 200],
    'max_leaf_nodes': [30, 31, 32] 
}

# Inner and outer cross-validation settings
inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing

best_scores = []
best_params = []

# Nested cross-validation
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

In [None]:









# One-Hot Encoding without IF Outlier Dataset









In [77]:
clf6 = setup(df_oh_without_iso_forest_outliers, target = 'y_encoded')

Unnamed: 0,Description,Value
0,Session id,5376
1,Target,y_encoded
2,Target type,Binary
3,Original data shape,"(42723, 42)"
4,Transformed data shape,"(42723, 42)"
5,Transformed train set shape,"(29906, 42)"
6,Transformed test set shape,"(12817, 42)"
7,Numeric features,41
8,Preprocess,True
9,Imputation type,simple


In [78]:
best_model6 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9219,0.9279,0.3394,0.5787,0.4277,0.3888,0.4049,0.056
rf,Random Forest Classifier,0.92,0.9168,0.2233,0.593,0.324,0.2908,0.3306,0.349
gbc,Gradient Boosting Classifier,0.9196,0.915,0.2533,0.5742,0.351,0.3151,0.3456,0.614
lr,Logistic Regression,0.9181,0.8858,0.1981,0.5703,0.2936,0.2609,0.3028,0.414
et,Extra Trees Classifier,0.9178,0.9056,0.2129,0.5597,0.3076,0.2733,0.3102,0.313
ridge,Ridge Classifier,0.9172,0.0,0.1084,0.6095,0.1836,0.1619,0.2322,0.013
lda,Linear Discriminant Analysis,0.9158,0.8955,0.3631,0.5152,0.4256,0.3817,0.3887,0.034
ada,Ada Boost Classifier,0.9156,0.8998,0.2754,0.5173,0.359,0.3184,0.3368,0.159
dummy,Dummy Classifier,0.9139,0.5,0.0,0.0,0.0,0.0,0.0,0.012
knn,K Neighbors Classifier,0.9096,0.7223,0.1903,0.4402,0.2654,0.2254,0.2482,0.143


In [79]:
best_model6

In [145]:
# uni data without if
X = df_oh_without_iso_forest_outliers.drop('y_encoded', axis=1)
y = df_oh_without_iso_forest_outliers['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [146]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_accuracy = sgd.score(X_test, y_test)

print(f"Accuracy: {sgd_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.8264024342669891
Matthews Correlation Coefficient: 0.33616495820540965


In [147]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

y_pred = sgd_rbf.predict(X_test)
mcc = calculate_mcc(y_test, y_pred)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

print(f"Accuracy: {sgd_rbf_accuracy}")
print(f"Matthews Correlation Coefficient: {mcc}")

Accuracy: 0.9130841850667083
Matthews Correlation Coefficient: 0.0


In [None]:
model = LGBMClassifier()

gridParams = {
    'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.1],
    'n_estimators': [40, 80, 100, 200, 400],
    'num_leaves': [20, 30, 40, 50, 70, 100],
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

In [None]:
model = RandomForestClassifier()

gridParams = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [10, 20, 30, None],
    'criterion':['gini', 'entropy']
}

inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

In [None]:
model = HistGradientBoostingClassifier()

gridParams = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [50, 100, 200],
    'max_leaf_nodes': [30, 31, 32] 
}

# Inner and outer cross-validation settings
inner_cv = KFold(n_splits=8, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing

best_scores = []
best_params = []

# Nested cross-validation
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params