In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pycaret.classification import *

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.mixture import BayesianGaussianMixture

import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the dataset
filename = '/Users/russelwilson/Desktop/bank-full.csv'
df = pd.read_csv(filename, delimiter=';')
df_original = pd.read_csv(filename, delimiter=';')
print(df.head())
df.describe(include='all')

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
unique,,12,3,4,2,,2,2,3,,12,,,,,4,2
top,,blue-collar,married,secondary,no,,yes,no,cellular,,may,,,,,unknown,no
freq,,9732,27214,23202,44396,,25130,37967,29285,,13766,,,,,36959,39922
mean,40.93621,,,,,1362.272058,,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
std,10.618762,,,,,3044.765829,,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
min,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
25%,33.0,,,,,72.0,,,,8.0,,103.0,1.0,-1.0,0.0,,
50%,39.0,,,,,448.0,,,,16.0,,180.0,2.0,-1.0,0.0,,
75%,48.0,,,,,1428.0,,,,21.0,,319.0,3.0,-1.0,0.0,,


In [3]:
# Data Cleaning
missing_values = df.isnull().sum()

unknown_values = (df == 'unknown').sum()

# nan_values = (df == 'NaN').sum()

duplicates = df.duplicated().sum()

# print(missing_values)
# print(unknown_values)
# print(nan_values)
# print(duplicates)

In [4]:
categorical_columns = df.select_dtypes(include=['object']).columns

In [5]:
unique_values_info = {}
for col in categorical_columns:
    unique_counts = df[col].value_counts()
    unique_values_info[col] = unique_counts

In [6]:
# unique_values_info

In [7]:
df_uni_encoding = pd.DataFrame()

In [8]:
# Label encoding for 'education'
education_mapping = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
df_uni_encoding['education_encoded'] = df['education'].map(education_mapping)

In [9]:
# Label encoding for 'default', 'housing', 'loan', 'y' 
binary_mapping = {'no': 0, 'yes': 1}
columns_to_encode = ['default', 'housing', 'loan', 'y']

for col in columns_to_encode:
    df_uni_encoding[col + '_encoded'] = df[col].map(binary_mapping)

In [10]:
df['poutcome'] = df['poutcome'].replace(['unknown', 'other'], 'others')

# Label encoding for 'poutcome' with the specified mapping
poutcome_mapping = {'failure': 0, 'success': 1, 'others': -1}
df_uni_encoding['poutcome_encoded'] = df['poutcome'].map(poutcome_mapping)

In [11]:
# Frequency encoding for 'job' and 'month'
job_freq = df['job'].value_counts(normalize=True)
month_freq = df['month'].value_counts(normalize=True)
contact_freq = df['contact'].value_counts(normalize=True)
marital_freq = df['marital'].value_counts(normalize=True)

df_uni_encoding['job_encoded'] = df['job'].map(job_freq)
df_uni_encoding['month_encoded'] = df['month'].map(month_freq)
df_uni_encoding['contact_encoded'] = df['contact'].map(contact_freq)
df_uni_encoding['marital_encoded'] = df['marital'].map(marital_freq)

In [12]:
df_uni_encoding.head()

Unnamed: 0,education_encoded,default_encoded,housing_encoded,loan_encoded,y_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded
0,3,0,1,0,0,-1,0.209197,0.304483,0.287983,0.601933
1,2,0,1,0,0,-1,0.168034,0.304483,0.287983,0.282896
2,2,0,1,1,0,-1,0.03289,0.304483,0.287983,0.601933
3,0,0,1,0,0,-1,0.215257,0.304483,0.287983,0.601933
4,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896


In [13]:
encoded_columns = [
    'education_encoded', 'default_encoded', 'housing_encoded', 'loan_encoded', 
    'poutcome_encoded', 'job_encoded', 'month_encoded', 
    'contact_encoded', 'marital_encoded', 'y_encoded'
]

numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

df_numer = df[numeric_columns]

df_numer.columns.tolist()

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [14]:
df_uni = pd.concat([df_numer, df_uni_encoding], axis = 1)

In [15]:
df_uni.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,education_encoded,default_encoded,housing_encoded,loan_encoded,y_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded
0,58,2143,5,261,1,-1,0,3,0,1,0,0,-1,0.209197,0.304483,0.287983,0.601933
1,44,29,5,151,1,-1,0,2,0,1,0,0,-1,0.168034,0.304483,0.287983,0.282896
2,33,2,5,76,1,-1,0,2,0,1,1,0,-1,0.03289,0.304483,0.287983,0.601933
3,47,1506,5,92,1,-1,0,0,0,1,0,0,-1,0.215257,0.304483,0.287983,0.601933
4,33,1,5,198,1,-1,0,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896


In [16]:
df_uni.shape

(45211, 17)

In [22]:
def score_calculation(pd, te):

    if len(pd) != len(te):
        
        accuracy_score = "Lengths Error"
        
    else:

        matches = sum([1 for pd,te in zip(pd, te) if pd == te])

    accuracy_score = matches / len(pd)
    
    return accuracy_score

In [33]:
clf = setup(df_uni_ori, target = "y_encoded")

Unnamed: 0,Description,Value
0,Session id,8905
1,Target,y_encoded
2,Target type,Binary
3,Original data shape,"(45211, 17)"
4,Transformed data shape,"(45211, 17)"
5,Transformed train set shape,"(31647, 17)"
6,Transformed test set shape,"(13564, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


In [34]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9093,0.9334,0.4886,0.6496,0.5574,0.508,0.5146,0.051
rf,Random Forest Classifier,0.9063,0.9283,0.4273,0.6521,0.516,0.4666,0.4796,0.36
gbc,Gradient Boosting Classifier,0.9053,0.9201,0.4233,0.6457,0.511,0.4611,0.4739,0.553
et,Extra Trees Classifier,0.9036,0.9224,0.3679,0.6579,0.4714,0.423,0.445,0.24
ada,Ada Boost Classifier,0.8991,0.9066,0.366,0.6165,0.4589,0.4073,0.4244,0.134
lr,Logistic Regression,0.8983,0.8727,0.2993,0.64,0.4074,0.3597,0.3912,0.344
lda,Linear Discriminant Analysis,0.8979,0.8798,0.3865,0.5995,0.4695,0.4159,0.4285,0.013
ridge,Ridge Classifier,0.894,0.0,0.1802,0.6777,0.2844,0.2474,0.3123,0.008
dummy,Dummy Classifier,0.883,0.5,0.0,0.0,0.0,0.0,0.0,0.007
knn,K Neighbors Classifier,0.8813,0.7634,0.2696,0.4874,0.3468,0.2874,0.3029,0.173


In [35]:
best_model

In [45]:
df_label_encoding = pd.DataFrame()

In [46]:
le = LabelEncoder()
    
for col in df.select_dtypes(include=['object']).columns:
    if col not in ['age', 'balance', 'duration']:  
        df_label_encoding[col] = le.fit_transform(df[col].astype(str))  

In [47]:
df_label = pd.concat([df_numer, df_label_encoding], axis = 1)

In [48]:
df_label.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,58,2143,5,261,1,-1,0,4,1,2,0,1,0,2,8,1,0
1,44,29,5,151,1,-1,0,9,2,1,0,1,0,2,8,1,0
2,33,2,5,76,1,-1,0,2,1,1,0,1,1,2,8,1,0
3,47,1506,5,92,1,-1,0,1,1,3,0,1,0,2,8,1,0
4,33,1,5,198,1,-1,0,11,2,3,0,0,0,2,8,1,0


In [49]:
df_label.shape

(45211, 17)

In [50]:
clf4 = setup(df_label, target = 'y')

Unnamed: 0,Description,Value
0,Session id,2572
1,Target,y
2,Target type,Binary
3,Original data shape,"(45211, 17)"
4,Transformed data shape,"(45211, 17)"
5,Transformed train set shape,"(31647, 17)"
6,Transformed test set shape,"(13564, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


In [51]:
best_model4 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.907,0.9326,0.4706,0.6396,0.5417,0.4913,0.4987,0.051
rf,Random Forest Classifier,0.9059,0.9255,0.4244,0.6497,0.5132,0.4636,0.4766,0.383
gbc,Gradient Boosting Classifier,0.9053,0.9212,0.4068,0.6531,0.5009,0.4517,0.4674,0.554
et,Extra Trees Classifier,0.9045,0.9211,0.3593,0.6721,0.4678,0.4206,0.4458,0.257
ada,Ada Boost Classifier,0.9,0.9065,0.3668,0.6236,0.4613,0.4103,0.4282,0.138
lda,Linear Discriminant Analysis,0.8989,0.8724,0.3979,0.6034,0.4791,0.4258,0.4373,0.015
ridge,Ridge Classifier,0.8941,0.0,0.1818,0.6764,0.286,0.2488,0.3131,0.008
lr,Logistic Regression,0.8933,0.8547,0.2528,0.6053,0.3561,0.3087,0.3441,0.203
dummy,Dummy Classifier,0.883,0.5,0.0,0.0,0.0,0.0,0.0,0.007
knn,K Neighbors Classifier,0.8815,0.76,0.2639,0.4876,0.3424,0.2836,0.2998,0.169


In [52]:
best_model4

In [53]:
df_without_y = df.drop("y", axis=1)
df_without_y.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,others
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,others
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,others
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,others
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,others


In [54]:
categorical_columns_without_y = categorical_columns.drop("y")

In [55]:
categorical_columns_without_y

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

In [56]:
df_oh = pd.get_dummies(df_without_y, columns=categorical_columns_without_y, drop_first=True)

df_oh.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_others,poutcome_success
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,33,2,5,76,1,-1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
3,47,1506,5,92,1,-1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [90]:
df_oh.shape

(45211, 41)

In [57]:
df_oh = pd.concat([df_oh, df_uni['y_encoded']], axis = 1)

In [58]:
df_oh.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_others,poutcome_success,y_encoded
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,33,2,5,76,1,-1,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
3,47,1506,5,92,1,-1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [59]:
clf5 = setup(df_oh, target = 'y_encoded')

Unnamed: 0,Description,Value
0,Session id,838
1,Target,y_encoded
2,Target type,Binary
3,Original data shape,"(45211, 42)"
4,Transformed data shape,"(45211, 42)"
5,Transformed train set shape,"(31647, 42)"
6,Transformed test set shape,"(13564, 42)"
7,Numeric features,41
8,Preprocess,True
9,Imputation type,simple


In [60]:
best_model5 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9086,0.9348,0.4965,0.641,0.5593,0.5093,0.5146,0.069
gbc,Gradient Boosting Classifier,0.9064,0.9242,0.4111,0.6612,0.5067,0.4582,0.4741,0.651
rf,Random Forest Classifier,0.9053,0.9274,0.3901,0.6615,0.4902,0.4418,0.4609,0.359
et,Extra Trees Classifier,0.9014,0.9146,0.3568,0.6418,0.4583,0.409,0.4305,0.335
lda,Linear Discriminant Analysis,0.9001,0.905,0.4346,0.6011,0.5043,0.4504,0.4578,0.042
lr,Logistic Regression,0.9,0.8972,0.3336,0.6395,0.4381,0.3892,0.4143,0.445
ada,Ada Boost Classifier,0.8995,0.9082,0.3671,0.6188,0.4606,0.4091,0.4263,0.177
ridge,Ridge Classifier,0.8989,0.0,0.2717,0.6668,0.3858,0.3412,0.3826,0.024
dummy,Dummy Classifier,0.883,0.5,0.0,0.0,0.0,0.0,0.0,0.023
knn,K Neighbors Classifier,0.8815,0.7598,0.2631,0.488,0.3414,0.2828,0.2993,0.172


In [61]:
best_model5

In [91]:
# uni encoded data
X = df_uni_ori.drop('y_encoded', axis=1)
y = df_uni_ori['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [92]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [93]:
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['l2', 'auc'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [94]:
lgbm_model = lgb.train(
    lgbm_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[90]	training's l2: 0.0552684	training's auc: 0.950621	valid_1's l2: 0.0622916	valid_1's auc: 0.932679


In [95]:
y_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
lgbm_accuracy

0.9075493954585668

In [None]:
# svm = SVC(kernel='linear', probability=True) 

svm.fit(X_train, y_train)

svm_accuracy = svm.score(X_test, y_test)

svm_accuracy

In [24]:
lr = LogisticRegression(max_iter=1000)  

lr.fit(X_train, y_train)

lr_accuracy = lr.score(X_test, y_test)

lr_accuracy

0.8973016809200826

In [25]:
rf = RandomForestClassifier(random_state=42)
    
rf.fit(X_train, y_train)

rf_accuracy = rf.score(X_test, y_test)

rf_accuracy 

0.9070333235033913

In [26]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb_accuracy = gnb.score(X_test, y_test)

gnb_accuracy

0.8406812149808316

In [27]:
hgb = HistGradientBoostingClassifier(random_state=527)

hgb.fit(X_train, y_train)

hgb_accuracy = hgb.score(X_test, y_test)

hgb_accuracy

0.9076968445886169

In [28]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

sgd_accuracy = sgd.score(X_test, y_test)

sgd_accuracy

0.8627248599233265

In [29]:
sgd_rbf = make_pipeline(RBFSampler(gamma=1, random_state=527), SGDClassifier(random_state=527))

sgd_rbf.fit(X_train, y_train)

sgd_rbf_accuracy = sgd_rbf.score(X_test, y_test)

sgd_rbf_accuracy

0.8836626363904453

In [36]:
df_uni.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,education_encoded,default_encoded,housing_encoded,loan_encoded,y_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded
0,58,2143,5,261,1,-1,0,3,0,1,0,0,-1,0.209197,0.304483,0.287983,0.601933
1,44,29,5,151,1,-1,0,2,0,1,0,0,-1,0.168034,0.304483,0.287983,0.282896
2,33,2,5,76,1,-1,0,2,0,1,1,0,-1,0.03289,0.304483,0.287983,0.601933
3,47,1506,5,92,1,-1,0,0,0,1,0,0,-1,0.215257,0.304483,0.287983,0.601933
4,33,1,5,198,1,-1,0,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896


In [37]:
df_uni_ori.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,education_encoded,default_encoded,housing_encoded,loan_encoded,y_encoded,poutcome_encoded,job_encoded,month_encoded,contact_encoded,marital_encoded
0,58,2143,5,261,1,-1,0,3,0,1,0,0,-1,0.209197,0.304483,0.287983,0.601933
1,44,29,5,151,1,-1,0,2,0,1,0,0,-1,0.168034,0.304483,0.287983,0.282896
2,33,2,5,76,1,-1,0,2,0,1,1,0,-1,0.03289,0.304483,0.287983,0.601933
3,47,1506,5,92,1,-1,0,0,0,1,0,0,-1,0.215257,0.304483,0.287983,0.601933
4,33,1,5,198,1,-1,0,0,0,0,0,0,-1,0.00637,0.304483,0.287983,0.282896


In [None]:
model = LGBMClassifier()
gridParams = {
    'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.1],
    'n_estimators': [40, 80, 100, 200, 400],
    'num_leaves': [20, 30, 40, 50, 70, 100],
}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

In [None]:
optim_lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['l2', 'auc'],
    'num_leaves': 30,  
    'learning_rate': 0.1,  
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'n_estimators': 80 
}

# lgbm_model = LGBMClassifier(learning_rate=0.1, n_estimators=80, num_leaves=30)

# lgbm_model.fit(X_train, y_train)

# y_pred = lgbm_model.predict(X_test)

# optim_accuracy = accuracy_score(y_test, y_pred)

# optim_accuracy

In [None]:
lgbm_model = lgb.train(
    optim_lgbm_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

In [None]:
y_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
lgbm_accuracy

In [None]:
model = RandomForestClassifier()

gridParams = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

In [None]:
model = HistGradientBoostingClassifier()

# Grid parameters for HistGradientBoostingClassifier
gridParams = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [50, 100, 200],
    'max_leaf_nodes': [20, 30, 40],
    'min_samples_leaf': [10, 20, 30]
}

# Inner and outer cross-validation settings
inner_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing

best_scores = []
best_params = []

# Nested cross-validation
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params


In [17]:
x_out = df_uni[numeric_columns]

x_out.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0


In [18]:
# Applying IQR
Q1 = x_out.quantile(0.25)
Q3 = x_out.quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = ((x_out < (Q1 - 1.5 * IQR)) | (x_out > (Q3 + 1.5 * IQR))).any(axis=1)

# Applying Isolation Forest
iso_forest = IsolationForest(random_state=527)
outliers_iso_forest = iso_forest.fit_predict(x_out) == -1

# Applying Local Outlier Factor
lof = LocalOutlierFactor()
outliers_lof = lof.fit_predict(x_out) == -1

# Counting the number of outliers
outliers_count = {
    "IQR": np.sum(outliers_iqr),
    "Isolation Forest": np.sum(outliers_iso_forest),
    "Local Outlier Factor": np.sum(outliers_lof)
}

In [19]:
outliers_count

{'IQR': 17018, 'Isolation Forest': 5387, 'Local Outlier Factor': 807}

In [20]:
# Original encoded dataset
df_uni_ori = df_uni.copy()

# Encoded dataset without IQR detected outliers
df_uni_without_iqr_outliers = df_uni[~outliers_iqr]

# Encoded dataset without Isolation Forest detected outliers
df_uni_without_iso_forest_outliers = df_uni[~outliers_iso_forest]

# Encoded dataset without LOF detected outliers
df_uni_without_lof_outliers = df_uni[~outliers_lof]

datasets_shapes = {
    "Original Encoded Dataset": df_uni_ori.shape,
    "Without IQR Outliers": df_uni_without_iqr_outliers.shape,
    "Without Isolation Forest Outliers": df_uni_without_iso_forest_outliers.shape,
    "Without LOF Outliers": df_uni_without_lof_outliers.shape
}

In [21]:
datasets_shapes

{'Original Encoded Dataset': (45211, 17),
 'Without IQR Outliers': (28193, 17),
 'Without Isolation Forest Outliers': (39824, 17),
 'Without LOF Outliers': (44404, 17)}

In [96]:
# uni data without isolation forest
X = df_uni_without_iso_forest_outliers.drop('y_encoded', axis=1)
y = df_uni_without_iso_forest_outliers['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [97]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [98]:
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['l2', 'auc'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [99]:
lgbm_model = lgb.train(
    lgbm_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[99]	training's l2: 0.0465179	training's auc: 0.957572	valid_1's l2: 0.053638	valid_1's auc: 0.930089


In [100]:
y_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
lgbm_accuracy

0.9235018413123536

In [None]:
# svm = SVC(kernel='linear', probability=True) 

svm.fit(X_train, y_train)

svm_accuracy = svm.score(X_test, y_test)

svm_accuracy

In [39]:
lr = LogisticRegression(max_iter=1000)  

lr.fit(X_train, y_train)

lr_accuracy = lr.score(X_test, y_test)

lr_accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9146300636089723

In [40]:
rf = RandomForestClassifier(random_state=42)
    
rf.fit(X_train, y_train)

rf_accuracy = rf.score(X_test, y_test)

rf_accuracy 

0.9207398727820556

In [41]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb_accuracy = gnb.score(X_test, y_test)

gnb_accuracy

0.8654168061600268

In [42]:
hgb = HistGradientBoostingClassifier(random_state=527)

hgb.fit(X_train, y_train)

hgb_accuracy = hgb.score(X_test, y_test)

hgb_accuracy

0.9208235687981252

In [43]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

sgd_accuracy = sgd.score(X_test, y_test)

sgd_accuracy

0.6257951121526615

In [38]:
clf2 = setup(df_uni_without_iso_forest_outliers, target = "y_encoded")

Unnamed: 0,Description,Value
0,Session id,4995
1,Target,y_encoded
2,Target type,Binary
3,Original data shape,"(39824, 17)"
4,Transformed data shape,"(39824, 17)"
5,Transformed train set shape,"(27876, 17)"
6,Transformed test set shape,"(11948, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


In [39]:
best_model2 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9203,0.9345,0.433,0.6243,0.5111,0.4692,0.4787,0.043
rf,Random Forest Classifier,0.919,0.9272,0.3692,0.6377,0.4675,0.4271,0.4459,0.305
gbc,Gradient Boosting Classifier,0.9187,0.9237,0.3663,0.6357,0.4643,0.4238,0.4429,0.475
et,Extra Trees Classifier,0.9161,0.9211,0.3025,0.6355,0.4098,0.3708,0.4006,0.202
ada,Ada Boost Classifier,0.9144,0.9091,0.3447,0.5967,0.4363,0.3937,0.4114,0.117
lr,Logistic Regression,0.9131,0.8756,0.2735,0.6091,0.3769,0.3375,0.3692,0.171
ridge,Ridge Classifier,0.9116,0.0,0.149,0.6888,0.2447,0.218,0.2926,0.008
lda,Linear Discriminant Analysis,0.9106,0.8791,0.3916,0.5503,0.4572,0.4101,0.4173,0.011
dummy,Dummy Classifier,0.9037,0.5,0.0,0.0,0.0,0.0,0.0,0.006
knn,K Neighbors Classifier,0.8997,0.7355,0.2079,0.4548,0.285,0.239,0.2605,0.134


In [40]:
best_model2

In [None]:
model = LGBMClassifier()
gridParams = {
    'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.1],
    'n_estimators': [40, 80, 100, 200, 400],
    'num_leaves': [20, 30, 40, 50, 70, 100],
}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

In [None]:
optim_lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['l2', 'auc'],
    'num_leaves': 30,  
    'learning_rate': 0.1,  
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'n_estimators': 80 
}

# lgbm_model = LGBMClassifier(learning_rate=0.1, n_estimators=80, num_leaves=30)

# lgbm_model.fit(X_train, y_train)

# y_pred = lgbm_model.predict(X_test)

# optim_accuracy = accuracy_score(y_test, y_pred)

# optim_accuracy

In [None]:
lgbm_model = lgb.train(
    optim_lgbm_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

In [None]:
y_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
lgbm_accuracy

In [None]:
model = RandomForestClassifier()

gridParams = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

In [None]:
model = HistGradientBoostingClassifier()

# Grid parameters for HistGradientBoostingClassifier
gridParams = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [50, 100, 200],
    'max_leaf_nodes': [20, 30, 40],
    'min_samples_leaf': [10, 20, 30]
}

# Inner and outer cross-validation settings
inner_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing

best_scores = []
best_params = []

# Nested cross-validation
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params


In [101]:
# uni data without lof
X = df_uni_without_lof_outliers.drop('y_encoded', axis=1)
y = df_uni_without_lof_outliers['y_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=527)

In [102]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [103]:
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['l2', 'auc'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [104]:
lgbm_model = lgb.train(
    lgbm_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l2: 0.0532571	training's auc: 0.952469	valid_1's l2: 0.0607098	valid_1's auc: 0.937033


In [105]:
y_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
lgbm_accuracy

0.9103738177450833

In [None]:
# svm = SVC(kernel='linear', probability=True) 

svm.fit(X_train, y_train)

svm_accuracy = svm.score(X_test, y_test)

svm_accuracy

In [45]:
lr = LogisticRegression(max_iter=1000)  

lr.fit(X_train, y_train)

lr_accuracy = lr.score(X_test, y_test)

lr_accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9002402041735476

In [46]:
rf = RandomForestClassifier(random_state=42)
    
rf.fit(X_train, y_train)

rf_accuracy = rf.score(X_test, y_test)

rf_accuracy 

0.9065455637291698

In [47]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb_accuracy = gnb.score(X_test, y_test)

gnb_accuracy

0.8467197117549917

In [48]:
hgb = HistGradientBoostingClassifier(random_state=527)

hgb.fit(X_train, y_train)

hgb_accuracy = hgb.score(X_test, y_test)

hgb_accuracy

0.9092478606815794

In [49]:
sgd = SGDClassifier(random_state=527)

sgd.fit(X_train, y_train)

sgd_accuracy = sgd.score(X_test, y_test)

sgd_accuracy

0.8280288245008257

In [41]:
clf3 = setup(df_uni_without_lof_outliers, target = 'y_encoded')

Unnamed: 0,Description,Value
0,Session id,1121
1,Target,y_encoded
2,Target type,Binary
3,Original data shape,"(44404, 17)"
4,Transformed data shape,"(44404, 17)"
5,Transformed train set shape,"(31082, 17)"
6,Transformed test set shape,"(13322, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


In [42]:
best_model3 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9108,0.9363,0.4855,0.6379,0.551,0.5026,0.5085,0.05
rf,Random Forest Classifier,0.909,0.9311,0.4251,0.6483,0.5132,0.4654,0.4782,0.359
gbc,Gradient Boosting Classifier,0.9078,0.9221,0.4185,0.6403,0.5058,0.4575,0.4703,0.534
et,Extra Trees Classifier,0.9067,0.9249,0.3553,0.6618,0.4622,0.4161,0.4404,0.235
ada,Ada Boost Classifier,0.9027,0.9082,0.3692,0.616,0.4611,0.4114,0.428,0.134
lda,Linear Discriminant Analysis,0.9008,0.8804,0.3906,0.5929,0.4704,0.4183,0.4296,0.012
lr,Logistic Regression,0.9003,0.8726,0.2917,0.6258,0.3975,0.351,0.3818,0.151
ridge,Ridge Classifier,0.8967,0.0,0.1701,0.668,0.2705,0.2355,0.3011,0.008
dummy,Dummy Classifier,0.8871,0.5,0.0,0.0,0.0,0.0,0.0,0.006
knn,K Neighbors Classifier,0.8845,0.7535,0.2538,0.4779,0.3312,0.2745,0.2912,0.165


In [43]:
best_model3

In [109]:
model = LGBMClassifier()
gridParams = {
    'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.1],
    'n_estimators': [40, 80, 100, 200, 400],
    'num_leaves': [20, 30, 40, 50, 70, 100],
}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

[LightGBM] [Info] Number of positive: 2669, number of negative: 21013
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002306 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 982
[LightGBM] [Info] Number of data points in the train set: 23682, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.112702 -> initscore=-2.063437
[LightGBM] [Info] Start training from score -2.063437
[LightGBM] [Info] Number of positive: 2669, number of negative: 21013
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 982
[LightGBM] [Info] Number of data points in the train set: 23682, number of used features: 16
[LightGBM] [Info] [bin

In [112]:
model = LGBMClassifier()
gridParams = {
    'learning_rate': [0.02, 0.1],
    'n_estimators': [80, 200, 400],
    'num_leaves': [20, 30, 40],
}

inner_cv = KFold(n_splits=3, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=8, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 986
[LightGBM] [Info] Number of data points in the train set: 33996, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113278 -> initscore=-2.057686
[LightGBM] [Info] Start training from score -2.057686
[LightGBM] [Info] Number of positive: 3826, number of negative: 30170
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 986
[LightGBM] [Info] Number of data points in the train set: 33996, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.112543 -> initscore=-2.065028
[LightGBM] [

In [113]:
model = LGBMClassifier()
gridParams = {
    'learning_rate': [0.02, 0.1],
    'n_estimators': [80, 200],
    'num_leaves': [20, 30],
}

inner_cv = KFold(n_splits=3, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

print(f"Mean Best Score: {mean_best_score:.4f} ± {std_best_score:.4f}")
print("Best parameters per fold:", best_params)

[LightGBM] [Info] Number of positive: 3330, number of negative: 26272
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 982
[LightGBM] [Info] Number of data points in the train set: 29602, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.112492 -> initscore=-2.065531
[LightGBM] [Info] Start training from score -2.065531
[LightGBM] [Info] Number of positive: 3330, number of negative: 26273
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001900 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 981
[LightGBM] [Info] Number of data points in the train set: 29603, number of used features: 16
[LightGBM] [Info] [bin

In [117]:
optim_lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['l2', 'auc'],
    'num_leaves': 30,  
    'learning_rate': 0.1,  
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'n_estimators': 80 
}

# lgbm_model = LGBMClassifier(learning_rate=0.1, n_estimators=80, num_leaves=30)

# lgbm_model.fit(X_train, y_train)

# y_pred = lgbm_model.predict(X_test)

# optim_accuracy = accuracy_score(y_test, y_pred)

# optim_accuracy

In [118]:
lgbm_model = lgb.train(
    optim_lgbm_params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[75]	training's l2: 0.0503118	training's auc: 0.95715	valid_1's l2: 0.0608292	valid_1's auc: 0.936614


In [119]:
y_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
lgbm_accuracy

0.9291264103776772

In [None]:
model = RandomForestClassifier()

gridParams = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=527)
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=527)

best_scores = []
best_params = []

for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params

In [None]:
model = HistGradientBoostingClassifier()

# Grid parameters for HistGradientBoostingClassifier
gridParams = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [50, 100, 200],
    'max_leaf_nodes': [20, 30, 40],
    'min_samples_leaf': [10, 20, 30]
}

# Inner and outer cross-validation settings
inner_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing
grid = GridSearchCV(model, gridParams, cv=inner_cv, n_jobs=-1)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=527)  # Reduced number of splits for quicker processing

best_scores = []
best_params = []

# Nested cross-validation
for train_idx, test_idx in outer_cv.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    grid.fit(X_train, y_train)
    best_scores.append(grid.best_score_)
    best_params.append(grid.best_params_)

# Aggregate and print the results
mean_best_score = np.mean(best_scores)
std_best_score = np.std(best_scores)

mean_best_score, std_best_score, best_params
