In [1]:
import pandas as pd
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Ensure the path to the DEModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/de_model"))
from de_handler import DEModelHandler  

# Ensure the path to the FSDModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/fsd_model"))
from fsd_handler import FSDModelHandler  

# Ensure the path to the Math3ModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/math3_model"))
from math3_handler import Math3ModelHandler  

# Ensure the path to the PythonModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/python_model"))
from python_handler import PythonModelHandler  

df = pd.read_csv("../../dataset/train_dataset.csv")

# Drop the irrelevant, data leak columns
df_clean = df.drop(
    columns=[
        "Student ID",
        "Mentor-1",
        "Mentor-2",
        "Mentor-3",
        "Roll-2",
        "Roll-3",
        "Math-3 Theory",
        "DE Theory",
        "DE Practical",
        "FSD Theory",
        "FSD Practical",
        "Python Theory",
        "Python Practical",
        "Communication Theory",
        "Law Theory",
    ]
)

# columns for Semester 1 core subjects
sem1_columns = [
    "Math-1 Theory",
    "Physics Theory",
    "Java-1 Theory",
    "Software Engineering Theory",
]

# Calculate Semester 1 Percentage as the average of core subject scores
# scores are numerical and out of 100
df_clean["Sem 1 Percentage"] = df_clean[sem1_columns].mean(axis=1).round(2)

# columns for Semester 2 core subjects
sem2_columns = [
    "Math-2 Theory",
    "Data Structures using Java Theory",
    "DBMS Theory",
    "Fundamental of Electronics and Electrical Theory",
    "Java-2 Theory",
]

# Calculate Semester 2 Percentage as the average of core subject scores
# scores are numerical and out of 100
df_clean["Sem 2 Percentage"] = df_clean[sem2_columns].mean(axis=1).round(2)

# Rename columns Div-1, Div-2, Div-3 to Section-1, Section-2, Section-3
df_clean = df_clean.rename(
    columns={"Div-1": "Section-1", "Div-2": "Section-2", "Div-3": "Section-3"}
)

# Transform values in Section-1, Section-2, Section-3 to keep only the first character
# Thus we get Only Department
for section in ["Section-1", "Section-2", "Section-3"]:
    df_clean[section] = df_clean[section].str[0]

# adding DE predicted column
preprocessor = DEModelHandler()
fe_de = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/de_model/de_model.joblib",
    return_type="df"
)

# Add the predicted DE Theory marks to df_clean
df_clean["Predicted DE Theory"] = fe_de["Predicted DE Theory"]


# adding FSD predicted column
preprocessor = FSDModelHandler()
fe_fsd = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/fsd_model/fsd_model.joblib",
    return_type="df"
)

# Add the predicted FSD Theory marks to df_clean
df_clean["Predicted FSD Theory"] = fe_fsd["Predicted FSD Theory"]


# adding Math3 predicted column
preprocessor = Math3ModelHandler()
fe_math3 = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/math3_model/math3_model.joblib",
    return_type="df"
)

# Add the predicted Math3 Theory marks to df_clean
df_clean["Predicted Math-3 Theory"] = fe_math3["Predicted Math-3 Theory"]


# adding Python predicted column
preprocessor = PythonModelHandler()
fe_python = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/python_model/python_model.joblib",
    return_type="df"
)

# Add the predicted Python Theory marks to df_clean
df_clean["Predicted Python Theory"] = fe_python["Predicted Python Theory"]

#  Calculate predicted Semester 3 percentage (mean of 4 predicted subject marks)
sem3_subjects = [
    "Predicted Math-3 Theory",
    "Predicted DE Theory",
    "Predicted FSD Theory",
    "Predicted Python Theory",
]

df_clean["Predicted Sem 3 Percentage"] = df_clean[sem3_subjects].mean(axis=1).round(2)

df_clean["Sem 1 Percentile"] = df_clean["Sem 1 Percentage"].rank(pct=True) * 100
df_clean["Sem 2 Percentile"] = df_clean["Sem 2 Percentage"].rank(pct=True) * 100
df_clean["Predicted Sem 3 Percentile"] = df_clean["Predicted Sem 3 Percentage"].rank(pct=True) * 100

# Round for consistency
df_clean[["Sem 1 Percentile", "Sem 2 Percentile", "Predicted Sem 3 Percentile"]] = df_clean[
    ["Sem 1 Percentile", "Sem 2 Percentile", "Predicted Sem 3 Percentile"]
].round(2)

df_clean["Predicted Percentile Drop"] = (
    df_clean["Sem 2 Percentile"] - df_clean["Predicted Sem 3 Percentile"]
).round(2)

df_clean["Predicted Risk Flag"] = df_clean["Predicted Percentile Drop"] > 10

# Columns for Semester 3 core theory subjects
sem3_columns = [
    "Math-3 Theory",
    "DE Theory",
    "FSD Theory",
    "Python Theory",
]

# Calculate Semester 3 Total as the sum of core subject scores
df["Sem 3 Percentage"] = (df[sem3_columns].sum(axis=1) / 4).round(2)

df_clean["Sem 3 Percentile"] = df["Sem 3 Percentage"].rank(pct=True) * 100

df_clean["Percentile Drop"] = (
    df_clean["Sem 2 Percentile"] - df_clean["Sem 3 Percentile"]
).round(2)

df_clean["Risk Flag"] = df_clean["Percentile Drop"] > 10

columns_to_drop = [
    "Sem 3 Percentile",
    "Percentile Drop"
]

df_clean.drop(columns=columns_to_drop, inplace=True)

# After all operations on df_clean are complete, drop other DataFrames
df = None
fe_de = None
fe_fsd = None
fe_math3 = None
fe_python = None

print(df_clean.head())

  Gender Religion Branch Section-1 Section-2 Section-3  Roll-1  Math-1 Theory  \
0      M    Hindu     CE         D         D         A     350             47   
1      F    Hindu    CST         B         B         D      18             84   
2      F    Hindu   AIML         A         A         C      23             74   
3      M    Hindu    CST         B         B         D     212             55   
4      M    Hindu    CST         B         B         D     208             38   

   Physics Theory  Physics Practical  ...  Predicted FSD Theory  \
0              48                 75  ...             72.266535   
1              83                 81  ...             87.523458   
2              85                 86  ...             89.409752   
3              69                 82  ...             79.807055   
4              59                 74  ...             56.474296   

   Predicted Math-3 Theory  Predicted Python Theory  \
0                56.352210                71.642156   


# Dummy

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

# Target and features
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# DummyClassifier – always predicts the most frequent class
dummy = DummyClassifier(strategy='most_frequent')

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dummy.fit(X_train, y_train)
    y_pred = dummy.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'DummyClassifier-MostFreq'
model_desc = 'Baseline-MostFrequent-5Fold'

# Print formatted summary
print("\n--- Baseline Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV (append row, create file if not exists)
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to file (header only if file doesn't exist)
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 2: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 3: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 4: Accuracy=0.7931, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 5: Accuracy=0.7986, Precision=0.0000, Recall=0.0000, F1=0.0000

--- Baseline Model Summary ---
Name                          : DummyClassifier-MostFreq
Description                   : Baseline-MostFrequent-5Fold
Accuracy                      : 0.7983
Precision                     : 0.0000
Recall                        : 0.0000
F1 Score                      : 0.0000

CSV Row Format:
DummyClassifier-MostFreq,Baseline-MostFrequent-5Fold,0.7983,0.0000,0.0000,0.0000


# Logistic Regression

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)  # convert bool to 0/1

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Full pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LogisticRegression(class_weight='balanced', max_iter=1000))
])

# Step 5: 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Step 6: Loop through CV
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'LogisticRegression-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Print: CSV-style with labels and formatting
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])

Fold 1: Accuracy=0.6552, Precision=0.2857, Recall=0.4828, F1=0.3590
Fold 2: Accuracy=0.7310, Precision=0.3611, Recall=0.4483, F1=0.4000
Fold 3: Accuracy=0.6345, Precision=0.2931, Recall=0.5862, F1=0.3908
Fold 4: Accuracy=0.6345, Precision=0.2982, Recall=0.5667, F1=0.3908
Fold 5: Accuracy=0.5486, Precision=0.2000, Recall=0.4138, F1=0.2697

--- Average Metrics Summary ---
Name                          : LogisticRegression-Balanced
Description                   : OneHot+Scaler+5Fold-Stratified
Accuracy                      : 0.6408
Precision                     : 0.2876
Recall                        : 0.4995
F1 Score                      : 0.3620

CSV Row Format:
LogisticRegression-Balanced,OneHot+Scaler+5Fold-Stratified,0.6408,0.2876,0.4995,0.3620


In [53]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

current_df = df_clean.copy()

# X, y split after feature filtering
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Pipeline with SMOTE and Logistic Regression
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=1000))
])

# CV evaluation (same as before)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc_list.append(accuracy_score(y_val, y_pred))
    prec_list.append(precision_score(y_val, y_pred, zero_division=0))
    rec_list.append(recall_score(y_val, y_pred))
    f1_list.append(f1_score(y_val, y_pred))

    print(f"Fold {fold}: Accuracy={acc_list[-1]:.4f}, Precision={prec_list[-1]:.4f}, Recall={rec_list[-1]:.4f}, F1={f1_list[-1]:.4f}")

# Averages
print("\n--- Final Metrics ---")
print(f"Accuracy: {np.mean(acc_list):.4f}")
print(f"Precision: {np.mean(prec_list):.4f}")
print(f"Recall: {np.mean(rec_list):.4f}")
print(f"F1 Score: {np.mean(f1_list):.4f}")


Fold 1: Accuracy=0.6483, Precision=0.2708, Recall=0.4483, F1=0.3377
Fold 2: Accuracy=0.7310, Precision=0.3684, Recall=0.4828, F1=0.4179
Fold 3: Accuracy=0.6414, Precision=0.2982, Recall=0.5862, F1=0.3953
Fold 4: Accuracy=0.6690, Precision=0.3548, Recall=0.7333, F1=0.4783
Fold 5: Accuracy=0.5417, Precision=0.1967, Recall=0.4138, F1=0.2667

--- Final Metrics ---
Accuracy: 0.6463
Precision: 0.2978
Recall: 0.5329
F1 Score: 0.3792


# DecisionTreeClassifier

In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'DecisionTreeClassifier-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- DecisionTreeClassifier Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7034, Precision=0.2083,  Recall=0.1724, F1=0.1887
Fold 2: Accuracy=0.7586, Precision=0.3500,  Recall=0.2414, F1=0.2857
Fold 3: Accuracy=0.7655, Precision=0.3913,  Recall=0.3103, F1=0.3462
Fold 4: Accuracy=0.6828, Precision=0.3000,  Recall=0.4000, F1=0.3429
Fold 5: Accuracy=0.6806, Precision=0.1852,  Recall=0.1724, F1=0.1786

--- DecisionTreeClassifier Summary ---
Mean Accuracy : 0.7182
Mean Precision: 0.2870
Mean Recall   : 0.2593
Mean F1 Score : 0.2684

CSV Row Format:
DecisionTreeClassifier-Balanced,OneHot+Scaler+5Fold-Stratified,0.7182,0.2870,0.2593,0.2684


In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Recall-tuned DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    ))
])

# Step 5: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 6: Custom threshold
threshold = 0.35

# Step 7: Cross-validation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)

    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 8: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 9: Model info
model_name = 'DecisionTree-RecallTuned'
model_desc = f'Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold'

# Console output
print("\n--- DecisionTree_Recall_Tuned Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.5793, Precision=0.2714,  Recall=0.6552, F1=0.3838
Fold 2: Accuracy=0.6207, Precision=0.2759,  Recall=0.5517, F1=0.3678
Fold 3: Accuracy=0.5517, Precision=0.2805,  Recall=0.7931, F1=0.4144
Fold 4: Accuracy=0.5310, Precision=0.2889,  Recall=0.8667, F1=0.4333
Fold 5: Accuracy=0.5208, Precision=0.1970,  Recall=0.4483, F1=0.2737

--- DecisionTree_Recall_Tuned Summary ---
Mean Accuracy : 0.5607
Mean Precision: 0.2627
Mean Recall   : 0.6630
Mean F1 Score : 0.3746

CSV Row Format:
DecisionTree-RecallTuned,Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold,0.5607,0.2627,0.6630,0.3746


In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Flexible (deep) DecisionTreeClassifier
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=None,             # no limit
        min_samples_split=2,        # fine splits
        min_samples_leaf=1,         # small leaves allowed
        random_state=42
    ))
])

# Step 4: Cross-validation config
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 5: Threshold
threshold = 0.25  # aggressive threshold to maximize recall

# Step 6: CV loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Aggregate results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 8: Metadata
model_name = 'DecisionTree-MaxRecall'
model_desc = 'Depth=None|Split=2|Leaf=1|Thresh=0.25|5Fold'

print("\n--- DecisionTree_MaxRecall Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 9: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7034, Precision=0.2083,  Recall=0.1724, F1=0.1887
Fold 2: Accuracy=0.7586, Precision=0.3500,  Recall=0.2414, F1=0.2857
Fold 3: Accuracy=0.7655, Precision=0.3913,  Recall=0.3103, F1=0.3462
Fold 4: Accuracy=0.6828, Precision=0.3000,  Recall=0.4000, F1=0.3429
Fold 5: Accuracy=0.6806, Precision=0.1852,  Recall=0.1724, F1=0.1786

--- DecisionTree_MaxRecall Summary ---
Mean Accuracy : 0.7182
Mean Precision: 0.2870
Mean Recall   : 0.2593
Mean F1 Score : 0.2684

CSV Row Format:
DecisionTree-MaxRecall,Depth=None|Split=2|Leaf=1|Thresh=0.25|5Fold,0.7182,0.2870,0.2593,0.2684


In [7]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SMOTE setup
smote = SMOTE(random_state=42)

# Step 5: Recall-tuned DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    ))
])

# Step 6: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 7: Custom threshold
threshold = 0.35

# Step 8: Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Apply SMOTE to training data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed, y_train)

    # Fit the model on SMOTE-resampled data
    pipeline.named_steps['model'].fit(X_train_smote, y_train_smote)

    # Transform validation data
    X_val_preprocessed = preprocessor.transform(X_val)
    y_proba = pipeline.named_steps['model'].predict_proba(X_val_preprocessed)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 10: Model info
model_name = 'DecisionTree-SMOTE'
model_desc = f'Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold|SMOTE'

# Console output
print("\n--- DecisionTree_SMOTE Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 11: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.6552, Precision=0.2667, Recall=0.4138, F1=0.3243
Fold 2: Accuracy=0.6966, Precision=0.3333, Recall=0.5172, F1=0.4054
Fold 3: Accuracy=0.6069, Precision=0.2941, Recall=0.6897, F1=0.4124
Fold 4: Accuracy=0.6690, Precision=0.3594, Recall=0.7667, F1=0.4894
Fold 5: Accuracy=0.5486, Precision=0.2568, Recall=0.6552, F1=0.3689

--- DecisionTree_SMOTE Summary ---
Mean Accuracy : 0.6352
Mean Precision: 0.3020
Mean Recall   : 0.6085
Mean F1 Score : 0.4001

CSV Row Format:
DecisionTree-SMOTE,Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold|SMOTE,0.6352,0.3020,0.6085,0.4001


In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SMOTE setup with adjusted sampling strategy
smote = SMOTE(sampling_strategy=0.8, random_state=42)

# Step 5: Recall-optimized DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=3,
        random_state=42
    ))
])

# Step 6: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 7: Lowered threshold for higher recall
threshold = 0.25

# Step 8: Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Apply SMOTE to training data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed, y_train)

    # Fit the model on SMOTE-resampled data
    pipeline.named_steps['model'].fit(X_train_smote, y_train_smote)

    # Transform validation data
    X_val_preprocessed = preprocessor.transform(X_val)
    y_proba = pipeline.named_steps['model'].predict_proba(X_val_preprocessed)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 10: Model info
model_name = 'DecisionTree-SMOTE-RecallOptimized'
model_desc = f'Thresh=0.25|Depth=6|Split=10|Leaf=3|5Fold|SMOTE_0.8'

# Console output
print("\n--- DecisionTree_SMOTE_RecallOptimized Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 11: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.6552, Precision=0.2766, Recall=0.4483, F1=0.3421
Fold 2: Accuracy=0.6828, Precision=0.3111, Recall=0.4828, F1=0.3784
Fold 3: Accuracy=0.5241, Precision=0.2561, Recall=0.7241, F1=0.3784
Fold 4: Accuracy=0.4897, Precision=0.2317, Recall=0.6333, F1=0.3393
Fold 5: Accuracy=0.5278, Precision=0.2532, Recall=0.6897, F1=0.3704

--- DecisionTree_SMOTE_RecallOptimized Summary ---
Mean Accuracy : 0.5759
Mean Precision: 0.2657
Mean Recall   : 0.5956
Mean F1 Score : 0.3617

CSV Row Format:
DecisionTree-SMOTE-RecallOptimized,Thresh=0.25|Depth=6|Split=10|Leaf=3|5Fold|SMOTE_0.8,0.5759,0.2657,0.5956,0.3617


# RandomForestClassifier 

In [9]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline with Random Forest
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'RandomForestClassifier-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- RandomForestClassifier Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.8069, Precision=0.6000,  Recall=0.1034, F1=0.1765
Fold 2: Accuracy=0.8000, Precision=0.0000,  Recall=0.0000, F1=0.0000
Fold 3: Accuracy=0.8069, Precision=1.0000,  Recall=0.0345, F1=0.0667
Fold 4: Accuracy=0.7655, Precision=0.1667,  Recall=0.0333, F1=0.0556
Fold 5: Accuracy=0.7917, Precision=0.3333,  Recall=0.0345, F1=0.0625

--- RandomForestClassifier Summary ---
Mean Accuracy : 0.7942
Mean Precision: 0.4200
Mean Recall   : 0.0411
Mean F1 Score : 0.0722

CSV Row Format:
RandomForestClassifier-Balanced,OneHot+Scaler+5Fold-Stratified,0.7942,0.4200,0.0411,0.0722


In [10]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)  # convert bool to 0/1

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Full pipeline with SMOTE + RandomForest
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight='balanced',
        random_state=42
    ))
])

# Step 5: 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

threshold = 0.3  # Custom threshold to maximize recall

# Step 6: Loop through CV
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]  # Get probability for class 1

    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'RandomForest-SMOTE-Threshold0.3'
model_desc = 'OneHot+Scaler+SMOTE+RF+Threshold=0.3'

# Print: CSV-style with labels and formatting
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])

Fold 1: Accuracy=0.6483, Precision=0.3103, Recall=0.6207, F1=0.4138
Fold 2: Accuracy=0.6414, Precision=0.2653, Recall=0.4483, F1=0.3333
Fold 3: Accuracy=0.6069, Precision=0.3158, Recall=0.8276, F1=0.4571
Fold 4: Accuracy=0.5793, Precision=0.3086, Recall=0.8333, F1=0.4505
Fold 5: Accuracy=0.5625, Precision=0.2703, Recall=0.6897, F1=0.3883

--- Average Metrics Summary ---
Name                          : RandomForest-SMOTE-Threshold0.3
Description                   : OneHot+Scaler+SMOTE+RF+Threshold=0.3
Accuracy                      : 0.6077
Precision                     : 0.2941
Recall                        : 0.6839
F1 Score                      : 0.4086

CSV Row Format:
RandomForest-SMOTE-Threshold0.3,OneHot+Scaler+SMOTE+RF+Threshold=0.3,0.6077,0.2941,0.6839,0.4086


# XGBoost

In [11]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: XGBoost Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        scale_pos_weight=np.sum(y == 0) / np.sum(y == 1),  # Handles class imbalance
        use_label_encoder=False,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'XGBoost-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7517, Precision=0.2941,  Recall=0.1724, F1=0.2174
Fold 2: Accuracy=0.7793, Precision=0.3333,  Recall=0.1034, F1=0.1579
Fold 3: Accuracy=0.7379, Precision=0.3333,  Recall=0.3103, F1=0.3214
Fold 4: Accuracy=0.7655, Precision=0.4000,  Recall=0.2667, F1=0.3200
Fold 5: Accuracy=0.7153, Precision=0.3125,  Recall=0.3448, F1=0.3279

--- XGBoost Summary ---
Mean Accuracy : 0.7500
Mean Precision: 0.3347
Mean Recall   : 0.2395
Mean F1 Score : 0.2689

CSV Row Format:
XGBoost-Balanced,OneHot+Scaler+5Fold-Stratified,0.7500,0.3347,0.2395,0.2689


In [12]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline with XGBoost + SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=4,  # 80:20 class balance
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    ))
])

# Step 5: Stratified 5-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 6: Metrics storage
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Threshold for classification
threshold = 0.25

# Step 7: CV loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 8: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'XGBoost-SMOTE-Threshold0.25'
model_desc = 'OneHot+Scaler+SMOTE+XGB+Threshold=0.25'

# Step 9: Print metrics
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])

Fold 1: Accuracy=0.6897, Precision=0.2895, Recall=0.3793, F1=0.3284
Fold 2: Accuracy=0.6897, Precision=0.3182, Recall=0.4828, F1=0.3836
Fold 3: Accuracy=0.6621, Precision=0.3077, Recall=0.5517, F1=0.3951
Fold 4: Accuracy=0.6483, Precision=0.3279, Recall=0.6667, F1=0.4396
Fold 5: Accuracy=0.5903, Precision=0.2414, Recall=0.4828, F1=0.3218

--- Average Metrics Summary ---
Name                          : XGBoost-SMOTE-Threshold0.25
Description                   : OneHot+Scaler+SMOTE+XGB+Threshold=0.25
Accuracy                      : 0.6560
Precision                     : 0.2969
Recall                        : 0.5126
F1 Score                      : 0.3737

CSV Row Format:
XGBoost-SMOTE-Threshold0.25,OneHot+Scaler+SMOTE+XGB+Threshold=0.25,0.6560,0.2969,0.5126,0.3737


In [13]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Pipeline with SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        random_state=42
    ))
])

# Step 4: Hyperparameter Optimization
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [4, 6, 8],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X, y)

# Step 5: Dynamic Threshold Tuning
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
thresholds = np.linspace(0.1, 0.9, 9)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    # Dynamic threshold tuning for recall
    recalls = [recall_score(y_val, (y_proba >= t).astype(int)) for t in thresholds]
    best_threshold = thresholds[np.argmax(recalls)]

    y_pred = (y_proba >= best_threshold).astype(int)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Best Threshold={best_threshold:.2f}")

# Step 6: Aggregate Results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Summary
model_name = 'XGBoost-SMOTE-FineTuned'
model_desc = f'OptimizedParams|DynamicThresh={best_threshold:.2f}|OneHot+Scaler'

print("\n--- Fine-Tuned XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.01, 'model__max_depth': 4, 'model__min_child_weight': 5, 'model__n_estimators': 100, 'model__subsample': 0.8}
Fold 1: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Best Threshold=0.10
Fold 2: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Best Threshold=0.10
Fold 3: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Best Threshold=0.10
Fold 4: Accuracy=0.2069, Precision=0.2069, Recall=1.0000, F1=0.3429, Best Threshold=0.10
Fold 5: Accuracy=0.2014, Precision=0.2014, Recall=1.0000, F1=0.3353, Best Threshold=0.10

--- Fine-Tuned XGBoost Summary ---
Mean Accuracy : 0.2017
Mean Precision: 0.2017
Mean Recall   : 1.0000
Mean F1 Score : 0.3356


In [14]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Pipeline with SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        random_state=42
    ))
])

# Step 4: Hyperparameter Optimization
param_grid = {
    'model__n_estimators': [200, 300, 400],
    'model__max_depth': [4, 6, 8],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
    'model__scale_pos_weight': [4, 5, 6]  # Adjusted for imbalance
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='balanced_accuracy',  # Balances true positive/negative rates
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X, y)

# Step 5: Dynamic Threshold Tuning
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
thresholds = np.linspace(0.1, 0.9, 9)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    # Dynamic threshold tuning for F1
    f1_scores = [f1_score(y_val, (y_proba >= t).astype(int)) for t in thresholds]
    best_threshold = thresholds[np.argmax(f1_scores)]

    y_pred = (y_proba >= best_threshold).astype(int)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Best Threshold={best_threshold:.2f}")

# Step 6: Aggregate Results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Summary
model_name = 'XGBoost-SMOTE-FineTuned-Balanced'
model_desc = f'OptimizedParams|DynamicThresh={best_threshold:.2f}|OneHot+Scaler'

print("\n--- Fine-Tuned XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best Parameters: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.01, 'model__max_depth': 4, 'model__min_child_weight': 5, 'model__n_estimators': 300, 'model__scale_pos_weight': 5, 'model__subsample': 0.8}
Fold 1: Accuracy=0.5517, Precision=0.2750, Recall=0.7586, F1=0.4037, Best Threshold=0.50
Fold 2: Accuracy=0.5793, Precision=0.3000, Recall=0.8276, F1=0.4404, Best Threshold=0.40
Fold 3: Accuracy=0.7310, Precision=0.3864, Recall=0.5862, F1=0.4658, Best Threshold=0.80
Fold 4: Accuracy=0.6069, Precision=0.3373, Recall=0.9333, F1=0.4956, Best Threshold=0.60
Fold 5: Accuracy=0.4861, Precision=0.2632, Recall=0.8621, F1=0.4032, Best Threshold=0.40

--- Fine-Tuned XGBoost Summary ---
Mean Accuracy : 0.5910
Mean Precision: 0.3124
Mean Recall   : 0.7936
Mean F1 Score : 0.4417


# LightGBM

In [15]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: LightGBM Pipeline (optimized to suppress warnings)
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(
        objective='binary',
        class_weight='balanced',
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        min_split_gain=0.01,
        min_child_samples=20,
        min_data_in_leaf=20,
        subsample=0.8,
        colsample_bytree=0.8,
        verbose=-1,              # suppress LightGBM internal logs
        random_state=42
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'LightGBM-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified+VerboseOff'

# Console summary
print("\n--- LightGBM Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7310, Precision=0.2727,  Recall=0.2069, F1=0.2353
Fold 2: Accuracy=0.7586, Precision=0.2500,  Recall=0.1034, F1=0.1463
Fold 3: Accuracy=0.7793, Precision=0.4444,  Recall=0.4138, F1=0.4286
Fold 4: Accuracy=0.7103, Precision=0.3125,  Recall=0.3333, F1=0.3226
Fold 5: Accuracy=0.6944, Precision=0.2857,  Recall=0.3448, F1=0.3125

--- LightGBM Summary ---
Mean Accuracy : 0.7348
Mean Precision: 0.3131
Mean Recall   : 0.2805
Mean F1 Score : 0.2891

CSV Row Format:
LightGBM-Balanced,OneHot+Scaler+5Fold-Stratified+VerboseOff,0.7348,0.3131,0.2805,0.2891


In [16]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Integer, Real

# Load your real df_clean before this step
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(objective='binary', class_weight='balanced', verbose=-1, random_state=42))
])

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

param_space = {
    'model__n_estimators': Integer(100, 500),
    'model__max_depth': Integer(3, 12),
    'model__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'model__min_child_samples': Integer(10, 100),
    'model__min_split_gain': Real(0.0, 0.2),
    'model__subsample': Real(0.6, 1.0),
    'model__colsample_bytree': Real(0.6, 1.0)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring=scoring,
    refit='recall',
    n_iter=40,
    cv=cv,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

opt.fit(X, y)

best_model = opt.best_estimator_
cv_results = cross_validate(best_model, X, y, cv=cv, scoring=scoring)

print("Final Tuned LightGBM Model Scores:")
print(f"Accuracy : {np.mean(cv_results['test_accuracy']):.4f}")
print(f"Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"Recall   : {np.mean(cv_results['test_recall']):.4f}")
print(f"F1 Score : {np.mean(cv_results['test_f1']):.4f}")
print("Best Parameters:", opt.best_params_)

Final Tuned LightGBM Model Scores:
Accuracy : 0.6491
Precision: 0.3264
Recall   : 0.6908
F1 Score : 0.4381
Best Parameters: OrderedDict({'model__colsample_bytree': 0.8531155960191492, 'model__learning_rate': 0.01, 'model__max_depth': 12, 'model__min_child_samples': 100, 'model__min_split_gain': 0.2, 'model__n_estimators': 100, 'model__subsample': 0.6})


In [17]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    precision_recall_curve
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: LightGBM Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(
        objective='binary',
        class_weight='balanced',
        learning_rate=0.01,
        max_depth=12,
        min_child_samples=100,
        min_split_gain=0.2,
        n_estimators=100,
        subsample=0.8664,
        colsample_bytree=1.0,
        verbose=-1,
        random_state=42
    ))
])

# Step 5: Cross-validation predictions
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_probs = cross_val_predict(pipeline, X, y, cv=cv, method='predict_proba')[:, 1]
y_true = y.copy()  # true labels for all folds

# Step 6: Find optimal threshold
precisions, recalls, thresholds = precision_recall_curve(y_true, y_probs)
valid = [(p, r, t) for p, r, t in zip(precisions, recalls, thresholds) if r >= 0.85 and p > 0.50]

if valid:
    best_prec, best_rec, best_thresh = max(valid, key=lambda x: 2*x[0]*x[1]/(x[0]+x[1]))
else:
    best_thresh = 0.5  # fallback
    best_prec = precision_score(y_true, y_probs >= best_thresh, zero_division=0)
    best_rec = recall_score(y_true, y_probs >= best_thresh)
    best_f1 = f1_score(y_true, y_probs >= best_thresh)
    print("No threshold met all conditions. Using default 0.5.")

# Step 7: Final metrics at optimal threshold
y_pred_final = (y_probs >= best_thresh).astype(int)
final_acc = accuracy_score(y_true, y_pred_final)
final_prec = precision_score(y_true, y_pred_final, zero_division=0)
final_rec = recall_score(y_true, y_pred_final)
final_f1 = f1_score(y_true, y_pred_final)

# Step 8: Print results
print("\n--- Threshold-Tuned LightGBM Results ---")
print(f"Threshold   : {best_thresh:.4f}")
print(f"Accuracy    : {final_acc:.4f}")
print(f"Precision   : {final_prec:.4f}")
print(f"Recall      : {final_rec:.4f}")
print(f"F1 Score    : {final_f1:.4f}")

# Step 9: Save to CSV
model_name = 'LightGBM-Tuned-Threshold'
model_desc = 'BayesOpt+Threshold@{:.4f}'.format(best_thresh)
csv_file = "risk_model_metrics.csv"

result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(final_acc, 4),
    'Precision': round(final_prec, 4),
    'Recall': round(final_rec, 4),
    'F1 Score': round(final_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

No threshold met all conditions. Using default 0.5.

--- Threshold-Tuned LightGBM Results ---
Threshold   : 0.5000
Accuracy    : 0.6464
Precision   : 0.3179
Recall      : 0.6575
F1 Score    : 0.4286


In [18]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: LightGBM Pipeline with tuned parameters
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(
        objective='binary',
        class_weight='balanced',
        n_estimators=200,           # Increased to allow more learning
        max_depth=8,                # Slightly deeper trees
        learning_rate=0.05,         # Lower for better convergence
        min_split_gain=0.01,
        min_child_samples=10,       # Lowered to capture smaller patterns
        min_data_in_leaf=10,        # Lowered to reduce overfitting
        subsample=0.8,
        colsample_bytree=0.7,      # Slightly reduced to increase diversity
        scale_pos_weight=3,         # Increase to prioritize positive class (tune based on imbalance)
        verbose=-1,
        random_state=42
    ))
])

# Step 5: Cross-validation with threshold tuning
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Get probability scores for threshold tuning
    y_prob = pipeline.predict_proba(X_val)[:, 1]

    # Find optimal threshold for recall >= 0.85
    precisions, recalls, thresholds = precision_recall_curve(y_val, y_prob)
    threshold = thresholds[np.argmax(recalls >= 0.85)] if np.any(recalls >= 0.85) else 0.5

    # Apply threshold to predictions
    y_pred = (y_prob >= threshold).astype(int)

    # Calculate metrics
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Threshold={threshold:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'LightGBM-Tuned-HighRecall'
model_desc = 'OneHot+Scaler+5Fold-Stratified+ThresholdTuned+VerboseOff'

# Console summary
print("\n--- LightGBM Tuned Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Threshold=0.0002
Fold 2: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Threshold=0.0002
Fold 3: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Threshold=0.0002
Fold 4: Accuracy=0.2069, Precision=0.2069, Recall=1.0000, F1=0.3429, Threshold=0.0002
Fold 5: Accuracy=0.2014, Precision=0.2014, Recall=1.0000, F1=0.3353, Threshold=0.0002

--- LightGBM Tuned Summary ---
Mean Accuracy : 0.2017
Mean Precision: 0.2017
Mean Recall   : 1.0000
Mean F1 Score : 0.3356

CSV Row Format:
LightGBM-Tuned-HighRecall,OneHot+Scaler+5Fold-Stratified+ThresholdTuned+VerboseOff,0.2017,0.2017,1.0000,0.3356


In [19]:
import os
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Classifier wrapper for threshold tuning
class ThresholdLGBMClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, **params):
        self.model = LGBMClassifier(**params)
        self.threshold = 0.5

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        probas = self.model.predict_proba(X)[:, 1]
        return (probas >= self.threshold).astype(int)

    def predict_proba(self, X):
        return self.model.predict_proba(X)

# Step 5: Objective function for Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 80),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "class_weight": "balanced",
        "random_state": 42,
        "verbose": -1
    }

    model = ThresholdLGBMClassifier(**params)
    pipeline = Pipeline([('prep', preprocessor), ('clf', model)])
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    recalls = []

    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train, y_train)
        probas = pipeline.predict_proba(X_val)[:, 1]

        # Find best threshold to maximize recall >= 0.85
        best_recall, best_thresh = 0, 0.5
        for thresh in np.arange(0.3, 0.8, 0.02):
            preds = (probas >= thresh).astype(int)
            rec = recall_score(y_val, preds)
            if rec > best_recall:
                best_recall, best_thresh = rec, thresh

        model.threshold = best_thresh
        preds = (probas >= best_thresh).astype(int)

        recalls.append(recall_score(y_val, preds))

    return np.mean(recalls)

# Step 6: Tune with Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)
best_params = study.best_trial.params

# Step 7: Final Evaluation
model = ThresholdLGBMClassifier(**best_params)
pipeline = Pipeline([('prep', preprocessor), ('clf', model)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    probas = pipeline.predict_proba(X_val)[:, 1]

    # Best threshold for this fold
    best_thresh, best_f1 = 0.5, 0
    for thresh in np.arange(0.3, 0.8, 0.01):
        preds = (probas >= thresh).astype(int)
        rec = recall_score(y_val, preds)
        prec = precision_score(y_val, preds, zero_division=0)
        f1_val = f1_score(y_val, preds)
        if rec >= 0.85 and prec > 0.5 and f1_val > best_f1:
            best_f1, best_thresh = f1_val, thresh

    model.threshold = best_thresh
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Threshold={best_thresh:.2f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

print("\n--- Final LightGBM Optimized ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# CSV logging
model_name = 'LightGBM-Optuna-Threshold'
model_desc = 'Optuna+ThresholdTuning+5Fold'
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

[I 2025-07-02 04:58:21,463] A new study created in memory with name: no-name-7304f115-d740-4b32-9eb4-eb75a2158218
Best trial: 0. Best value: 0.341839:   2%|▏         | 1/50 [00:00<00:42,  1.15it/s]

[I 2025-07-02 04:58:22,333] Trial 0 finished with value: 0.34183908045977013 and parameters: {'n_estimators': 159, 'learning_rate': 0.13627776146013754, 'max_depth': 6, 'num_leaves': 25, 'min_child_samples': 29, 'subsample': 0.8261631002750012, 'colsample_bytree': 0.6643099694045081}. Best is trial 0 with value: 0.34183908045977013.


Best trial: 1. Best value: 0.355632:   4%|▍         | 2/50 [00:02<00:50,  1.05s/it]

[I 2025-07-02 04:58:23,505] Trial 1 finished with value: 0.35563218390804596 and parameters: {'n_estimators': 205, 'learning_rate': 0.09436637336110772, 'max_depth': 12, 'num_leaves': 66, 'min_child_samples': 39, 'subsample': 0.8187424444656307, 'colsample_bytree': 0.8408100191337881}. Best is trial 1 with value: 0.35563218390804596.


Best trial: 1. Best value: 0.355632:   6%|▌         | 3/50 [00:03<00:51,  1.10s/it]

[I 2025-07-02 04:58:24,666] Trial 2 finished with value: 0.3218390804597701 and parameters: {'n_estimators': 242, 'learning_rate': 0.14276325580656257, 'max_depth': 11, 'num_leaves': 34, 'min_child_samples': 43, 'subsample': 0.8829354050223435, 'colsample_bytree': 0.6487889898305834}. Best is trial 1 with value: 0.35563218390804596.


Best trial: 3. Best value: 0.615862:   8%|▊         | 4/50 [00:04<00:46,  1.01s/it]

[I 2025-07-02 04:58:25,536] Trial 3 finished with value: 0.6158620689655172 and parameters: {'n_estimators': 132, 'learning_rate': 0.043145844491144954, 'max_depth': 10, 'num_leaves': 27, 'min_child_samples': 39, 'subsample': 0.9554534436028321, 'colsample_bytree': 0.9062852834323648}. Best is trial 3 with value: 0.6158620689655172.


Best trial: 3. Best value: 0.615862:  10%|█         | 5/50 [00:05<00:50,  1.13s/it]

[I 2025-07-02 04:58:26,888] Trial 4 finished with value: 0.5333333333333333 and parameters: {'n_estimators': 107, 'learning_rate': 0.12674304130723468, 'max_depth': 4, 'num_leaves': 52, 'min_child_samples': 46, 'subsample': 0.7591321936530777, 'colsample_bytree': 0.6128563389841789}. Best is trial 3 with value: 0.6158620689655172.


Best trial: 3. Best value: 0.615862:  12%|█▏        | 6/50 [00:07<01:03,  1.43s/it]

[I 2025-07-02 04:58:28,904] Trial 5 finished with value: 0.444367816091954 and parameters: {'n_estimators': 197, 'learning_rate': 0.04654568906402055, 'max_depth': 5, 'num_leaves': 61, 'min_child_samples': 14, 'subsample': 0.9564164715766118, 'colsample_bytree': 0.9452741680447155}. Best is trial 3 with value: 0.6158620689655172.


Best trial: 3. Best value: 0.615862:  14%|█▍        | 7/50 [00:08<01:00,  1.40s/it]

[I 2025-07-02 04:58:30,235] Trial 6 finished with value: 0.3625287356321839 and parameters: {'n_estimators': 211, 'learning_rate': 0.13391752548699587, 'max_depth': 7, 'num_leaves': 58, 'min_child_samples': 41, 'subsample': 0.8904134589036165, 'colsample_bytree': 0.8906460861661489}. Best is trial 3 with value: 0.6158620689655172.


Best trial: 3. Best value: 0.615862:  16%|█▌        | 8/50 [00:10<01:08,  1.62s/it]

[I 2025-07-02 04:58:32,339] Trial 7 finished with value: 0.40344827586206894 and parameters: {'n_estimators': 177, 'learning_rate': 0.03641315859121517, 'max_depth': 8, 'num_leaves': 29, 'min_child_samples': 11, 'subsample': 0.6157297294624066, 'colsample_bytree': 0.7238018075182002}. Best is trial 3 with value: 0.6158620689655172.


Best trial: 3. Best value: 0.615862:  18%|█▊        | 9/50 [00:12<01:01,  1.49s/it]

[I 2025-07-02 04:58:33,543] Trial 8 finished with value: 0.36206896551724144 and parameters: {'n_estimators': 223, 'learning_rate': 0.11487576818067502, 'max_depth': 8, 'num_leaves': 40, 'min_child_samples': 40, 'subsample': 0.7417528281753791, 'colsample_bytree': 0.7310302808892776}. Best is trial 3 with value: 0.6158620689655172.


Best trial: 3. Best value: 0.615862:  20%|██        | 10/50 [00:13<00:56,  1.41s/it]

[I 2025-07-02 04:58:34,762] Trial 9 finished with value: 0.4241379310344827 and parameters: {'n_estimators': 107, 'learning_rate': 0.09586917446086113, 'max_depth': 9, 'num_leaves': 37, 'min_child_samples': 31, 'subsample': 0.733094975499514, 'colsample_bytree': 0.9916072958726339}. Best is trial 3 with value: 0.6158620689655172.


Best trial: 10. Best value: 0.629425:  22%|██▏       | 11/50 [00:14<00:57,  1.48s/it]

[I 2025-07-02 04:58:36,412] Trial 10 finished with value: 0.6294252873563219 and parameters: {'n_estimators': 293, 'learning_rate': 0.010364737533907029, 'max_depth': 10, 'num_leaves': 80, 'min_child_samples': 24, 'subsample': 0.9998339481000317, 'colsample_bytree': 0.8130826462936027}. Best is trial 10 with value: 0.6294252873563219.


Best trial: 10. Best value: 0.629425:  24%|██▍       | 12/50 [00:16<00:59,  1.56s/it]

[I 2025-07-02 04:58:38,144] Trial 11 finished with value: 0.5811494252873564 and parameters: {'n_estimators': 296, 'learning_rate': 0.011906556460350971, 'max_depth': 10, 'num_leaves': 76, 'min_child_samples': 23, 'subsample': 0.9945345112002202, 'colsample_bytree': 0.8104769680194064}. Best is trial 10 with value: 0.6294252873563219.


Best trial: 10. Best value: 0.629425:  26%|██▌       | 13/50 [00:18<01:00,  1.63s/it]

[I 2025-07-02 04:58:39,938] Trial 12 finished with value: 0.30068965517241375 and parameters: {'n_estimators': 288, 'learning_rate': 0.051698689602942646, 'max_depth': 10, 'num_leaves': 79, 'min_child_samples': 23, 'subsample': 0.936352027532089, 'colsample_bytree': 0.8864168501157366}. Best is trial 10 with value: 0.6294252873563219.


Best trial: 13. Best value: 0.82092:  28%|██▊       | 14/50 [00:19<00:50,  1.40s/it] 

[I 2025-07-02 04:58:40,805] Trial 13 finished with value: 0.8209195402298851 and parameters: {'n_estimators': 150, 'learning_rate': 0.011942264759127703, 'max_depth': 12, 'num_leaves': 47, 'min_child_samples': 33, 'subsample': 0.9978211044347339, 'colsample_bytree': 0.7592869708201222}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  30%|███       | 15/50 [00:20<00:45,  1.31s/it]

[I 2025-07-02 04:58:41,917] Trial 14 finished with value: 0.6908045977011493 and parameters: {'n_estimators': 261, 'learning_rate': 0.011339715864860787, 'max_depth': 12, 'num_leaves': 45, 'min_child_samples': 31, 'subsample': 0.9951750387421777, 'colsample_bytree': 0.752479155864441}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  32%|███▏      | 16/50 [00:21<00:42,  1.26s/it]

[I 2025-07-02 04:58:43,042] Trial 15 finished with value: 0.35586206896551725 and parameters: {'n_estimators': 247, 'learning_rate': 0.06800257604670241, 'max_depth': 12, 'num_leaves': 47, 'min_child_samples': 34, 'subsample': 0.8912935198042182, 'colsample_bytree': 0.729698475160767}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  34%|███▍      | 17/50 [00:22<00:37,  1.15s/it]

[I 2025-07-02 04:58:43,935] Trial 16 finished with value: 0.7110344827586207 and parameters: {'n_estimators': 157, 'learning_rate': 0.017309160036320242, 'max_depth': 12, 'num_leaves': 46, 'min_child_samples': 34, 'subsample': 0.6589138193716573, 'colsample_bytree': 0.768076653335461}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  36%|███▌      | 18/50 [00:23<00:34,  1.07s/it]

[I 2025-07-02 04:58:44,829] Trial 17 finished with value: 0.6498850574712645 and parameters: {'n_estimators': 151, 'learning_rate': 0.02816616511520654, 'max_depth': 11, 'num_leaves': 54, 'min_child_samples': 35, 'subsample': 0.6495195473763238, 'colsample_bytree': 0.7675193504812903}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  38%|███▊      | 19/50 [00:24<00:29,  1.06it/s]

[I 2025-07-02 04:58:45,485] Trial 18 finished with value: 0.554022988505747 and parameters: {'n_estimators': 135, 'learning_rate': 0.06674931430176995, 'max_depth': 11, 'num_leaves': 20, 'min_child_samples': 48, 'subsample': 0.6774467005349321, 'colsample_bytree': 0.6956077647395321}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  40%|████      | 20/50 [00:24<00:26,  1.13it/s]

[I 2025-07-02 04:58:46,228] Trial 19 finished with value: 0.8209195402298851 and parameters: {'n_estimators': 179, 'learning_rate': 0.023274317881771634, 'max_depth': 3, 'num_leaves': 68, 'min_child_samples': 19, 'subsample': 0.6897546828738531, 'colsample_bytree': 0.7887034832909102}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  42%|████▏     | 21/50 [00:25<00:24,  1.19it/s]

[I 2025-07-02 04:58:46,968] Trial 20 finished with value: 0.7726436781609196 and parameters: {'n_estimators': 183, 'learning_rate': 0.031460131849839965, 'max_depth': 3, 'num_leaves': 62, 'min_child_samples': 16, 'subsample': 0.6997463462929113, 'colsample_bytree': 0.8558725879871181}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  44%|████▍     | 22/50 [00:26<00:23,  1.19it/s]

[I 2025-07-02 04:58:47,799] Trial 21 finished with value: 0.6901149425287356 and parameters: {'n_estimators': 181, 'learning_rate': 0.029573839221763504, 'max_depth': 4, 'num_leaves': 68, 'min_child_samples': 18, 'subsample': 0.6995545043047441, 'colsample_bytree': 0.851964746794313}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  46%|████▌     | 23/50 [00:27<00:21,  1.26it/s]

[I 2025-07-02 04:58:48,499] Trial 22 finished with value: 0.8002298850574713 and parameters: {'n_estimators': 179, 'learning_rate': 0.025266853321894045, 'max_depth': 3, 'num_leaves': 71, 'min_child_samples': 17, 'subsample': 0.780271511874257, 'colsample_bytree': 0.7928102397403849}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  48%|████▊     | 24/50 [00:27<00:21,  1.23it/s]

[I 2025-07-02 04:58:49,345] Trial 23 finished with value: 0.7041379310344827 and parameters: {'n_estimators': 137, 'learning_rate': 0.0579830068414369, 'max_depth': 3, 'num_leaves': 72, 'min_child_samples': 20, 'subsample': 0.7698107899829343, 'colsample_bytree': 0.7858323349122626}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  50%|█████     | 25/50 [00:28<00:21,  1.14it/s]

[I 2025-07-02 04:58:50,367] Trial 24 finished with value: 0.6770114942528735 and parameters: {'n_estimators': 168, 'learning_rate': 0.024497544730087334, 'max_depth': 5, 'num_leaves': 70, 'min_child_samples': 27, 'subsample': 0.7893482069062991, 'colsample_bytree': 0.8156869912184084}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  52%|█████▏    | 26/50 [00:29<00:21,  1.13it/s]

[I 2025-07-02 04:58:51,279] Trial 25 finished with value: 0.7181609195402299 and parameters: {'n_estimators': 192, 'learning_rate': 0.021753355566439828, 'max_depth': 4, 'num_leaves': 58, 'min_child_samples': 11, 'subsample': 0.8541721187250789, 'colsample_bytree': 0.6774959072711848}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  54%|█████▍    | 27/50 [00:30<00:20,  1.14it/s]

[I 2025-07-02 04:58:52,127] Trial 26 finished with value: 0.5611494252873562 and parameters: {'n_estimators': 145, 'learning_rate': 0.0383177077316776, 'max_depth': 6, 'num_leaves': 65, 'min_child_samples': 19, 'subsample': 0.717929667574809, 'colsample_bytree': 0.7843124281787008}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  56%|█████▌    | 28/50 [00:31<00:17,  1.26it/s]

[I 2025-07-02 04:58:52,743] Trial 27 finished with value: 0.6498850574712643 and parameters: {'n_estimators': 121, 'learning_rate': 0.08314518049497346, 'max_depth': 3, 'num_leaves': 74, 'min_child_samples': 26, 'subsample': 0.6068132155449868, 'colsample_bytree': 0.7034265339625672}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  58%|█████▊    | 29/50 [00:32<00:16,  1.24it/s]

[I 2025-07-02 04:58:53,572] Trial 28 finished with value: 0.4995402298850575 and parameters: {'n_estimators': 167, 'learning_rate': 0.05250964717063836, 'max_depth': 5, 'num_leaves': 55, 'min_child_samples': 14, 'subsample': 0.7815831744332618, 'colsample_bytree': 0.7460992736530924}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  60%|██████    | 30/50 [00:32<00:16,  1.24it/s]

[I 2025-07-02 04:58:54,372] Trial 29 finished with value: 0.40344827586206905 and parameters: {'n_estimators': 219, 'learning_rate': 0.06040176597695537, 'max_depth': 6, 'num_leaves': 50, 'min_child_samples': 29, 'subsample': 0.8262267175649801, 'colsample_bytree': 0.6491061851391156}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  62%|██████▏   | 31/50 [00:33<00:16,  1.19it/s]

[I 2025-07-02 04:58:55,304] Trial 30 finished with value: 0.6632183908045978 and parameters: {'n_estimators': 166, 'learning_rate': 0.019791794585346842, 'max_depth': 7, 'num_leaves': 42, 'min_child_samples': 21, 'subsample': 0.8523478207871052, 'colsample_bytree': 0.8492645501107189}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  64%|██████▍   | 32/50 [00:34<00:14,  1.26it/s]

[I 2025-07-02 04:58:55,983] Trial 31 finished with value: 0.7657471264367817 and parameters: {'n_estimators': 182, 'learning_rate': 0.0324761151935046, 'max_depth': 3, 'num_leaves': 62, 'min_child_samples': 16, 'subsample': 0.699420746795459, 'colsample_bytree': 0.8349626141952251}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  66%|██████▌   | 33/50 [00:35<00:13,  1.30it/s]

[I 2025-07-02 04:58:56,702] Trial 32 finished with value: 0.7108045977011495 and parameters: {'n_estimators': 191, 'learning_rate': 0.03691814771640688, 'max_depth': 3, 'num_leaves': 65, 'min_child_samples': 17, 'subsample': 0.6465855071919175, 'colsample_bytree': 0.7926303761898653}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  68%|██████▊   | 34/50 [00:36<00:12,  1.27it/s]

[I 2025-07-02 04:58:57,531] Trial 33 finished with value: 0.7045977011494253 and parameters: {'n_estimators': 206, 'learning_rate': 0.023218171906527037, 'max_depth': 4, 'num_leaves': 69, 'min_child_samples': 13, 'subsample': 0.8067718342422535, 'colsample_bytree': 0.8595309735128023}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  70%|███████   | 35/50 [00:36<00:11,  1.30it/s]

[I 2025-07-02 04:58:58,260] Trial 34 finished with value: 0.7519540229885058 and parameters: {'n_estimators': 156, 'learning_rate': 0.04440968717463899, 'max_depth': 3, 'num_leaves': 62, 'min_child_samples': 16, 'subsample': 0.6871584622893268, 'colsample_bytree': 0.9182756644834025}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  72%|███████▏  | 36/50 [00:37<00:10,  1.35it/s]

[I 2025-07-02 04:58:58,931] Trial 35 finished with value: 0.7659770114942528 and parameters: {'n_estimators': 124, 'learning_rate': 0.030166288990847296, 'max_depth': 4, 'num_leaves': 76, 'min_child_samples': 37, 'subsample': 0.7366881728890383, 'colsample_bytree': 0.8696577110955087}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  74%|███████▍  | 37/50 [00:38<00:10,  1.23it/s]

[I 2025-07-02 04:58:59,905] Trial 36 finished with value: 0.7386206896551725 and parameters: {'n_estimators': 178, 'learning_rate': 0.018530203058305952, 'max_depth': 5, 'num_leaves': 65, 'min_child_samples': 10, 'subsample': 0.7154236203362213, 'colsample_bytree': 0.8262241609093086}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  76%|███████▌  | 38/50 [00:39<00:10,  1.13it/s]

[I 2025-07-02 04:59:00,967] Trial 37 finished with value: 0.38275862068965516 and parameters: {'n_estimators': 232, 'learning_rate': 0.04333811793693937, 'max_depth': 9, 'num_leaves': 58, 'min_child_samples': 26, 'subsample': 0.8252173486452076, 'colsample_bytree': 0.600535785316496}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  78%|███████▊  | 39/50 [00:40<00:09,  1.12it/s]

[I 2025-07-02 04:59:01,881] Trial 38 finished with value: 0.38988505747126434 and parameters: {'n_estimators': 199, 'learning_rate': 0.08026973906548238, 'max_depth': 5, 'num_leaves': 32, 'min_child_samples': 21, 'subsample': 0.7517627389975092, 'colsample_bytree': 0.7679239097640882}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  80%|████████  | 40/50 [00:41<00:09,  1.04it/s]

[I 2025-07-02 04:59:02,998] Trial 39 finished with value: 0.29402298850574715 and parameters: {'n_estimators': 188, 'learning_rate': 0.09770724001103469, 'max_depth': 6, 'num_leaves': 50, 'min_child_samples': 13, 'subsample': 0.8566738418595651, 'colsample_bytree': 0.93787330924262}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  82%|████████▏ | 41/50 [00:42<00:07,  1.16it/s]

[I 2025-07-02 04:59:03,622] Trial 40 finished with value: 0.5133333333333334 and parameters: {'n_estimators': 145, 'learning_rate': 0.11043127535138782, 'max_depth': 3, 'num_leaves': 72, 'min_child_samples': 29, 'subsample': 0.7669186319236739, 'colsample_bytree': 0.7163567675018468}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  84%|████████▍ | 42/50 [00:42<00:06,  1.23it/s]

[I 2025-07-02 04:59:04,319] Trial 41 finished with value: 0.773103448275862 and parameters: {'n_estimators': 124, 'learning_rate': 0.03216231748451476, 'max_depth': 4, 'num_leaves': 76, 'min_child_samples': 44, 'subsample': 0.7251011482096529, 'colsample_bytree': 0.8676471498558985}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  86%|████████▌ | 43/50 [00:43<00:05,  1.29it/s]

[I 2025-07-02 04:59:05,007] Trial 42 finished with value: 0.7935632183908046 and parameters: {'n_estimators': 116, 'learning_rate': 0.034080042892384595, 'max_depth': 4, 'num_leaves': 76, 'min_child_samples': 44, 'subsample': 0.7135992224166877, 'colsample_bytree': 0.8816986884160202}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 13. Best value: 0.82092:  88%|████████▊ | 44/50 [00:44<00:04,  1.37it/s]

[I 2025-07-02 04:59:05,629] Trial 43 finished with value: 0.7799999999999999 and parameters: {'n_estimators': 102, 'learning_rate': 0.03867664900974165, 'max_depth': 4, 'num_leaves': 77, 'min_child_samples': 43, 'subsample': 0.6674963462292461, 'colsample_bytree': 0.9652446225174216}. Best is trial 13 with value: 0.8209195402298851.


Best trial: 44. Best value: 0.882759:  90%|█████████ | 45/50 [00:44<00:03,  1.39it/s]

[I 2025-07-02 04:59:06,330] Trial 44 finished with value: 0.8827586206896552 and parameters: {'n_estimators': 109, 'learning_rate': 0.015623341558174138, 'max_depth': 4, 'num_leaves': 80, 'min_child_samples': 50, 'subsample': 0.62999767238412, 'colsample_bytree': 0.9898406510914921}. Best is trial 44 with value: 0.8827586206896552.


Best trial: 44. Best value: 0.882759:  92%|█████████▏| 46/50 [00:45<00:02,  1.37it/s]

[I 2025-07-02 04:59:07,092] Trial 45 finished with value: 0.8071264367816091 and parameters: {'n_estimators': 113, 'learning_rate': 0.01739199907399299, 'max_depth': 5, 'num_leaves': 80, 'min_child_samples': 50, 'subsample': 0.6298277629293044, 'colsample_bytree': 0.9949348381973084}. Best is trial 44 with value: 0.8827586206896552.


Best trial: 44. Best value: 0.882759:  94%|█████████▍| 47/50 [00:46<00:02,  1.36it/s]

[I 2025-07-02 04:59:07,825] Trial 46 finished with value: 0.8485057471264368 and parameters: {'n_estimators': 109, 'learning_rate': 0.014025958221350443, 'max_depth': 9, 'num_leaves': 80, 'min_child_samples': 49, 'subsample': 0.6337534913800635, 'colsample_bytree': 0.9732630086990204}. Best is trial 44 with value: 0.8827586206896552.


Best trial: 44. Best value: 0.882759:  96%|█████████▌| 48/50 [00:47<00:01,  1.35it/s]

[I 2025-07-02 04:59:08,576] Trial 47 finished with value: 0.43747126436781614 and parameters: {'n_estimators': 111, 'learning_rate': 0.14819625477874637, 'max_depth': 9, 'num_leaves': 80, 'min_child_samples': 50, 'subsample': 0.6291796391424488, 'colsample_bytree': 0.9968727074428936}. Best is trial 44 with value: 0.8827586206896552.


Best trial: 44. Best value: 0.882759:  98%|█████████▊| 49/50 [00:47<00:00,  1.37it/s]

[I 2025-07-02 04:59:09,289] Trial 48 finished with value: 0.8554022988505746 and parameters: {'n_estimators': 102, 'learning_rate': 0.015193783825877761, 'max_depth': 8, 'num_leaves': 80, 'min_child_samples': 47, 'subsample': 0.6272147945478914, 'colsample_bytree': 0.9585272680488164}. Best is trial 44 with value: 0.8827586206896552.


Best trial: 44. Best value: 0.882759: 100%|██████████| 50/50 [00:48<00:00,  1.03it/s]


[I 2025-07-02 04:59:09,968] Trial 49 finished with value: 0.8760919540229886 and parameters: {'n_estimators': 100, 'learning_rate': 0.012076449358663734, 'max_depth': 8, 'num_leaves': 73, 'min_child_samples': 47, 'subsample': 0.6207672015548812, 'colsample_bytree': 0.970380465365926}. Best is trial 44 with value: 0.8827586206896552.
Fold 1: Accuracy=0.8000, Precision=0.5000, Recall=0.0690, F1=0.1212, Threshold=0.50
Fold 2: Accuracy=0.8069, Precision=1.0000, Recall=0.0345, F1=0.0667, Threshold=0.50
Fold 3: Accuracy=0.8138, Precision=1.0000, Recall=0.0690, F1=0.1290, Threshold=0.50
Fold 4: Accuracy=0.8069, Precision=0.6667, Recall=0.1333, F1=0.2222, Threshold=0.50
Fold 5: Accuracy=0.7917, Precision=0.4000, Recall=0.0690, F1=0.1176, Threshold=0.50

--- Final LightGBM Optimized ---
Mean Accuracy : 0.8039
Mean Precision: 0.7133
Mean Recall   : 0.0749
Mean F1 Score : 0.1314


# CatBoost

In [20]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: CatBoost Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', CatBoostClassifier(
        iterations=100,
        learning_rate=0.1,
        depth=6,
        auto_class_weights='Balanced',
        verbose=0,  # suppress CatBoost internal logs
        random_seed=42
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'CatBoost-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified+VerboseOff'

# Console summary
print("\n--- CatBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7448, Precision=0.2778,  Recall=0.1724, F1=0.2128
Fold 2: Accuracy=0.7586, Precision=0.3125,  Recall=0.1724, F1=0.2222
Fold 3: Accuracy=0.7379, Precision=0.3333,  Recall=0.3103, F1=0.3214
Fold 4: Accuracy=0.7724, Precision=0.4444,  Recall=0.4000, F1=0.4211
Fold 5: Accuracy=0.7153, Precision=0.2692,  Recall=0.2414, F1=0.2545

--- CatBoost Summary ---
Mean Accuracy : 0.7458
Mean Precision: 0.3275
Mean Recall   : 0.2593
Mean F1 Score : 0.2864

CSV Row Format:
CatBoost-Balanced,OneHot+Scaler+5Fold-Stratified+VerboseOff,0.7458,0.3275,0.2593,0.2864


In [21]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Tuned CatBoost Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', CatBoostClassifier(
        iterations=300,
        learning_rate=0.05,
        depth=8,
        l2_leaf_reg=5,
        border_count=128,
        bagging_temperature=1.0,
        auto_class_weights='Balanced',
        random_seed=42,
        verbose=0
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'CatBoost-Tuned'
model_desc = 'OneHot+Scaler+5Fold+Depth8+LR0.05+BagTemp1.0'

# Console summary
print("\n--- CatBoost Tuned Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7586, Precision=0.2857,  Recall=0.1379, F1=0.1860
Fold 2: Accuracy=0.7862, Precision=0.3750,  Recall=0.1034, F1=0.1622
Fold 3: Accuracy=0.7931, Precision=0.4783,  Recall=0.3793, F1=0.4231
Fold 4: Accuracy=0.7655, Precision=0.4000,  Recall=0.2667, F1=0.3200
Fold 5: Accuracy=0.7292, Precision=0.2917,  Recall=0.2414, F1=0.2642

--- CatBoost Tuned Summary ---
Mean Accuracy : 0.7665
Mean Precision: 0.3661
Mean Recall   : 0.2257
Mean F1 Score : 0.2711

CSV Row Format:
CatBoost-Tuned,OneHot+Scaler+5Fold+Depth8+LR0.05+BagTemp1.0,0.7665,0.3661,0.2257,0.2711


In [22]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Define column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Aggressively Tuned CatBoost
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', CatBoostClassifier(
        iterations=500,
        learning_rate=0.03,
        depth=10,
        l2_leaf_reg=3,
        border_count=128,
        bagging_temperature=0.25,
        auto_class_weights='Balanced',
        random_seed=42,
        verbose=0
    ))
])

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Description
model_name = 'CatBoost-Aggressive'
model_desc = 'OneHot+Scaler+500Iter+LR0.03+Depth10+Bag0.25'

print("\n--- CatBoost Aggressive Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7586, Precision=0.2500,  Recall=0.1034, F1=0.1463
Fold 2: Accuracy=0.7862, Precision=0.3333,  Recall=0.0690, F1=0.1143
Fold 3: Accuracy=0.8000, Precision=0.5000,  Recall=0.2759, F1=0.3556
Fold 4: Accuracy=0.7793, Precision=0.4375,  Recall=0.2333, F1=0.3043
Fold 5: Accuracy=0.7639, Precision=0.3810,  Recall=0.2759, F1=0.3200

--- CatBoost Aggressive Summary ---
Mean Accuracy : 0.7776
Mean Precision: 0.3804
Mean Recall   : 0.1915
Mean F1 Score : 0.2481

CSV Row Format:
CatBoost-Aggressive,OneHot+Scaler+500Iter+LR0.03+Depth10+Bag0.25,0.7776,0.3804,0.1915,0.2481


In [23]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Constant column check
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Categorical and numerical columns
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Define CatBoost with default settings (will be tuned)
cat_model = CatBoostClassifier(
    auto_class_weights='Balanced',
    random_seed=42,
    verbose=0
)

# Create pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', cat_model)
])

# Define parameter search space
param_space = {
    'model__iterations': Integer(300, 800),
    'model__learning_rate': Real(0.01, 0.2, prior='log-uniform'),
    'model__depth': Integer(4, 10),
    'model__l2_leaf_reg': Real(1, 10),
    'model__bagging_temperature': Real(0, 1.0),
    'model__border_count': Integer(32, 254)
}

# Setup Bayesian optimization with 5-fold stratified CV
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Fit the search
opt.fit(X, y)

# Extract best pipeline and evaluate manually
best_pipeline = opt.best_estimator_

# Manual 5-Fold Eval
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Averages
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'CatBoost-BayesTuned'
model_desc = 'OneHot+Scaler+BayesSearch+5Fold'

print("\n--- CatBoost Bayesian Tuning Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7793, Precision=0.3333, Recall=0.1034, F1=0.1579
Fold 2: Accuracy=0.8000, Precision=0.5000, Recall=0.0690, F1=0.1212
Fold 3: Accuracy=0.8414, Precision=0.7500, Recall=0.3103, F1=0.4390
Fold 4: Accuracy=0.7724, Precision=0.3846, Recall=0.1667, F1=0.2326
Fold 5: Accuracy=0.7847, Precision=0.4167, Recall=0.1724, F1=0.2439

--- CatBoost Bayesian Tuning Summary ---
Mean Accuracy : 0.7956
Mean Precision: 0.4769
Mean Recall   : 0.1644
Mean F1 Score : 0.2389


# SVM

In [24]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# SVM model inside pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42))
])

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Mean scores
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'SVC-RBF-Pipeline'
model_desc = 'OneHot+Scaler+5Fold+Balanced'

print("\n--- SVM (RBF) Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.6759, Precision=0.2857, Recall=0.4138, F1=0.3380
Fold 2: Accuracy=0.7103, Precision=0.3143, Recall=0.3793, F1=0.3438
Fold 3: Accuracy=0.6828, Precision=0.3559, Recall=0.7241, F1=0.4773
Fold 4: Accuracy=0.6138, Precision=0.3088, Recall=0.7000, F1=0.4286
Fold 5: Accuracy=0.5694, Precision=0.2295, Recall=0.4828, F1=0.3111

--- SVM (RBF) Summary ---
Name                          : SVC-RBF-Pipeline
Description                   : OneHot+Scaler+5Fold+Balanced
Accuracy                      : 0.6504
Precision                     : 0.2989
Recall                        : 0.5400
F1 Score                      : 0.3797

CSV Row Format:
SVC-RBF-Pipeline,OneHot+Scaler+5Fold+Balanced,0.6504,0.2989,0.5400,0.3797


In [25]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV
from skopt.space import Real
import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SVC pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(probability=True, random_state=42))
])

# Step 5: Define search space
param_space = {
    'model__C': Real(0.1, 100, prior='log-uniform'),
    'model__gamma': Real(1e-4, 1.0, prior='log-uniform'),
    'model__kernel': ['rbf'],
    'model__class_weight': ['balanced']
}

# Step 6: Bayesian optimization
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Step 7: Fit optimizer
opt.fit(X, y)
best_pipeline = opt.best_estimator_

# Step 8: Manual evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Aggregate metrics
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'SVC-BayesTuned'
model_desc = 'OneHot+Scaler+BayesSearch+RBF+Balanced'

print("\n--- Tuned SVM (RBF) Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 2: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 3: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 4: Accuracy=0.7931, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 5: Accuracy=0.7986, Precision=0.0000, Recall=0.0000, F1=0.0000

--- Tuned SVM (RBF) Summary ---
Mean Accuracy : 0.7983
Mean Precision: 0.0000
Mean Recall   : 0.0000
Mean F1 Score : 0.0000

CSV Row Format:
SVC-BayesTuned,OneHot+Scaler+BayesSearch+RBF+Balanced,0.7983,0.0000,0.0000,0.0000


In [26]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV
from skopt.space import Real
import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SVC pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(probability=True, random_state=42))
])

# Step 5: Define search space
param_space = {
    'model__C': Real(0.1, 100, prior='log-uniform'),
    'model__gamma': Real(1e-4, 1.0, prior='log-uniform'),
    'model__kernel': ['rbf'],
    'model__class_weight': ['balanced']
}

# Step 6: Bayesian optimization
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Step 7: Fit optimizer
opt.fit(X, y)
best_pipeline = opt.best_estimator_

# Step 8: Manual evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Aggregate metrics
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'SVC-BayesTuned'
model_desc = 'OneHot+Scaler+BayesSearch+RBF+Balanced'

print("\n--- Tuned SVM (RBF) Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 2: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 3: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 4: Accuracy=0.7931, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 5: Accuracy=0.7986, Precision=0.0000, Recall=0.0000, F1=0.0000

--- Tuned SVM (RBF) Summary ---
Mean Accuracy : 0.7983
Mean Precision: 0.0000
Mean Recall   : 0.0000
Mean F1 Score : 0.0000

CSV Row Format:
SVC-BayesTuned,OneHot+Scaler+BayesSearch+RBF+Balanced,0.7983,0.0000,0.0000,0.0000


In [27]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV
from skopt.space import Real
import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SVC pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(probability=True, random_state=42))
])

# Step 5: Define search space for BayesSearchCV
param_space = {
    'model__C': Real(0.1, 100, prior='log-uniform'),
    'model__gamma': Real(1e-4, 1.0, prior='log-uniform'),
    'model__kernel': ['rbf'],
    'model__class_weight': ['balanced']
}

# Step 6: Bayesian optimization focused on RECALL
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='recall',  # prioritize recall
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Step 7: Fit optimizer
opt.fit(X, y)
best_pipeline = opt.best_estimator_

# Step 8: Manual evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Aggregate metrics
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'SVC-BayesTuned-Recall'
model_desc = 'OneHot+Scaler+BayesSearch+RBF+Balanced+RecallOpt'

print("\n--- Tuned SVM (RBF) Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.2069, Precision=0.2014, Recall=1.0000, F1=0.3353
Fold 2: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333
Fold 3: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333
Fold 4: Accuracy=0.2069, Precision=0.2069, Recall=1.0000, F1=0.3429
Fold 5: Accuracy=0.7917, Precision=0.0000, Recall=0.0000, F1=0.0000

--- Tuned SVM (RBF) Summary ---
Mean Accuracy : 0.3211
Mean Precision: 0.1617
Mean Recall   : 0.8000
Mean F1 Score : 0.2690

CSV Row Format:
SVC-BayesTuned-Recall,OneHot+Scaler+BayesSearch+RBF+Balanced+RecallOpt,0.3211,0.1617,0.8000,0.2690


# Bagging

In [28]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Bagging Classifier pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=50,
        max_samples=0.8,
        max_features=1.0,
        bootstrap=True,
        random_state=42,
        n_jobs=-1
    ))
])

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'Bagging-DecisionTree'
model_desc = 'Bagging-with-Preprocessing-5Fold'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7724, Precision=0.3000, Recall=0.1034, F1=0.1538
Fold 2: Accuracy=0.8138, Precision=0.7500, Recall=0.1034, F1=0.1818
Fold 3: Accuracy=0.8069, Precision=0.5455, Recall=0.2069, F1=0.3000
Fold 4: Accuracy=0.8138, Precision=0.6667, Recall=0.2000, F1=0.3077
Fold 5: Accuracy=0.7847, Precision=0.4000, Recall=0.1379, F1=0.2051

--- Model Summary ---
Name                          : Bagging-DecisionTree
Description                   : Bagging-with-Preprocessing-5Fold
Accuracy                      : 0.7983
Precision                     : 0.5324
Recall                        : 0.1503
F1 Score                      : 0.2297

CSV Row Format:
Bagging-DecisionTree,Bagging-with-Preprocessing-5Fold,0.7983,0.5324,0.1503,0.2297


In [29]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')

# Data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        random_state=42,
        n_jobs=-1
    ))
])

# Parameter search space for Bagging + Decision Tree
search_space = {
    'model__n_estimators': Integer(10, 100),
    'model__max_samples': Real(0.5, 1.0),
    'model__max_features': Real(0.5, 1.0),
    'model__estimator__max_depth': Integer(2, 20),
    'model__estimator__min_samples_split': Integer(2, 10),
    'model__estimator__min_samples_leaf': Integer(1, 10)
}

# BayesSearchCV setup (recall as scoring metric)
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    scoring=make_scorer(recall_score),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=30,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# Fit
bayes_search.fit(X, y)

# Best model
best_model = bayes_search.best_estimator_

# 5-Fold Evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Metadata
model_name = 'Bagging+DT-Tuned'
model_desc = 'BayesCV-Tuned-Recall-Max-5Fold'

print("\n--- Final Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7931, Precision=0.4444, Recall=0.1379, F1=0.2105
Fold 2: Accuracy=0.8000, Precision=0.5000, Recall=0.1034, F1=0.1714
Fold 3: Accuracy=0.8207, Precision=0.5652, Recall=0.4483, F1=0.5000
Fold 4: Accuracy=0.7793, Precision=0.4375, Recall=0.2333, F1=0.3043
Fold 5: Accuracy=0.7847, Precision=0.4375, Recall=0.2414, F1=0.3111

--- Final Tuned Model Summary ---
Name                          : Bagging+DT-Tuned
Description                   : BayesCV-Tuned-Recall-Max-5Fold
Accuracy                      : 0.7956
Precision                     : 0.4769
Recall                        : 0.2329
F1 Score                      : 0.2995

CSV Row Format:
Bagging+DT-Tuned,BayesCV-Tuned-Recall-Max-5Fold,0.7956,0.4769,0.2329,0.2995


In [30]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')

# Data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        random_state=42,
        n_jobs=-1
    ))
])

# Parameter search space for Bagging + Decision Tree
search_space = {
    'model__n_estimators': Integer(10, 100),
    'model__max_samples': Real(0.5, 1.0),
    'model__max_features': Real(0.5, 1.0),
    'model__estimator__max_depth': Integer(2, 20),
    'model__estimator__min_samples_split': Integer(2, 10),
    'model__estimator__min_samples_leaf': Integer(1, 10)
}

# BayesSearchCV setup (recall as scoring metric)
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    scoring=make_scorer(recall_score),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=30,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# Fit
bayes_search.fit(X, y)

# Best model
best_model = bayes_search.best_estimator_

# 5-Fold Evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Metadata
model_name = 'Bagging+DT-Tuned'
model_desc = 'BayesCV-Tuned-Recall-Max-5Fold'

print("\n--- Final Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7931, Precision=0.4444, Recall=0.1379, F1=0.2105
Fold 2: Accuracy=0.8000, Precision=0.5000, Recall=0.1034, F1=0.1714
Fold 3: Accuracy=0.8207, Precision=0.5652, Recall=0.4483, F1=0.5000
Fold 4: Accuracy=0.7793, Precision=0.4375, Recall=0.2333, F1=0.3043
Fold 5: Accuracy=0.7847, Precision=0.4375, Recall=0.2414, F1=0.3111

--- Final Tuned Model Summary ---
Name                          : Bagging+DT-Tuned
Description                   : BayesCV-Tuned-Recall-Max-5Fold
Accuracy                      : 0.7956
Precision                     : 0.4769
Recall                        : 0.2329
F1 Score                      : 0.2995

CSV Row Format:
Bagging+DT-Tuned,BayesCV-Tuned-Recall-Max-5Fold,0.7956,0.4769,0.2329,0.2995


In [31]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')

# --- Data Setup ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# --- Preprocessing ---
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Pipeline ---
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(class_weight='balanced'),
        n_estimators=50,
        max_samples=0.8,
        max_features=1.0,
        bootstrap=True,
        random_state=42,
        n_jobs=-1
    ))
])

# --- Search Space ---
search_space = {
    'model__n_estimators': Integer(20, 100),
    'model__max_samples': Real(0.4, 1.0),
    'model__max_features': Real(0.4, 1.0),
    'model__estimator__max_depth': Integer(3, 20),
    'model__estimator__min_samples_split': Integer(2, 15),
    'model__estimator__min_samples_leaf': Integer(1, 10)
}

# --- Tuning ---
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    scoring=make_scorer(recall_score),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=30,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# --- Fit ---
bayes_search.fit(X, y)
best_model = bayes_search.best_estimator_

# --- Cross-Validation Evaluation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# --- Final Metrics ---
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# --- Output ---
model_name = 'Bagging+DT-Balanced-Tuned'
model_desc = 'BaggingDT+Balanced+BayesCV-Recall'

print("\n--- Final Tuned Bagging Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7103, Precision=0.3488, Recall=0.5172, F1=0.4167
Fold 2: Accuracy=0.7379, Precision=0.3846, Recall=0.5172, F1=0.4412
Fold 3: Accuracy=0.6621, Precision=0.3529, Recall=0.8276, F1=0.4948
Fold 4: Accuracy=0.6759, Precision=0.3768, Recall=0.8667, F1=0.5253
Fold 5: Accuracy=0.5833, Precision=0.2687, Recall=0.6207, F1=0.3750

--- Final Tuned Bagging Model Summary ---
Name                          : Bagging+DT-Balanced-Tuned
Description                   : BaggingDT+Balanced+BayesCV-Recall
Accuracy                      : 0.6739
Precision                     : 0.3464
Recall                        : 0.6699
F1 Score                      : 0.4506

CSV Row Format:
Bagging+DT-Balanced-Tuned,BaggingDT+Balanced+BayesCV-Recall,0.6739,0.3464,0.6699,0.4506


# AdaBoostClassifier

In [32]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ]
)

# AdaBoost model pipeline
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', AdaBoostClassifier(random_state=42))
])

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'AdaBoostClassifier'
model_desc = 'AdaBoost-5Fold-Preprocessed'

# Print formatted summary
print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV (append row, create file if not exists)
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7586, Precision=0.2000, Recall=0.0690, F1=0.1026
Fold 2: Accuracy=0.7931, Precision=0.4286, Recall=0.1034, F1=0.1667
Fold 3: Accuracy=0.8276, Precision=0.6667, Recall=0.2759, F1=0.3902
Fold 4: Accuracy=0.8069, Precision=0.6000, Recall=0.2000, F1=0.3000
Fold 5: Accuracy=0.7639, Precision=0.3684, Recall=0.2414, F1=0.2917

--- Model Summary ---
Name                          : AdaBoostClassifier
Description                   : AdaBoost-5Fold-Preprocessed
Accuracy                      : 0.7900
Precision                     : 0.4527
Recall                        : 0.1779
F1 Score                      : 0.2502

CSV Row Format:
AdaBoostClassifier,AdaBoost-5Fold-Preprocessed,0.7900,0.4527,0.1779,0.2502


In [33]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from skopt import BayesSearchCV
from skopt.space import Integer, Real

import warnings
warnings.filterwarnings('ignore')

# Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# AdaBoost pipeline
pipe = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', AdaBoostClassifier(algorithm='SAMME', random_state=42))
])

# Hyperparameter search space (no 'SAMME.R')
search_space = {
    'classifier__n_estimators': Integer(50, 300),
    'classifier__learning_rate': Real(0.01, 1.0, prior='log-uniform')
}

# CV and tuner
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
opt = BayesSearchCV(
    pipe,
    search_spaces=search_space,
    scoring='recall',
    n_iter=25,
    cv=cv,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Fit the tuner
opt.fit(X, y)

# Final best model
best_model = opt.best_estimator_

# CV metric evaluation using best model
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'AdaBoostClassifier-Tuned'
model_desc = f"AdaBoost-Tuned-SAMME-{opt.best_params_}"

# Print summary
print("\n--- Tuned AdaBoost Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# CSV write
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7793, Precision=0.3636, Recall=0.1379, F1=0.2000
Fold 2: Accuracy=0.7862, Precision=0.3750, Recall=0.1034, F1=0.1622
Fold 3: Accuracy=0.7793, Precision=0.4000, Recall=0.2069, F1=0.2727
Fold 4: Accuracy=0.7655, Precision=0.4091, Recall=0.3000, F1=0.3462
Fold 5: Accuracy=0.7847, Precision=0.4444, Recall=0.2759, F1=0.3404

--- Tuned AdaBoost Summary ---
Name                          : AdaBoostClassifier-Tuned
Description                   : AdaBoost-Tuned-SAMME-OrderedDict({'classifier__learning_rate': 1.0, 'classifier__n_estimators': 182})
Accuracy                      : 0.7790
Precision                     : 0.3984
Recall                        : 0.2048
F1 Score                      : 0.2643


# BalancedBaggingClassifier with a DecisionTreeClassifier(max_depth=6)

In [34]:
import os
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Assume df_clean is preloaded
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Preprocessing (same as your other pipelines)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numeric_cols)
])

# Classifier setup
base_estimator = DecisionTreeClassifier(max_depth=6, random_state=42)
clf = BalancedBaggingClassifier(
    estimator=base_estimator,
    n_estimators=50,
    sampling_strategy='auto',
    replacement=False,
    random_state=42,
    n_jobs=-1
)

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf)
])

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Aggregate results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model metadata
model_name = 'BalancedBagging-DecisionTree'
model_desc = 'Bagging+Balanced+DT(max_depth=6)+5Fold'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save results
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7103, Precision=0.3415, Recall=0.4828, F1=0.4000
Fold 2: Accuracy=0.7586, Precision=0.4118, Recall=0.4828, F1=0.4444
Fold 3: Accuracy=0.6069, Precision=0.2879, Recall=0.6552, F1=0.4000
Fold 4: Accuracy=0.7103, Precision=0.3966, Recall=0.7667, F1=0.5227
Fold 5: Accuracy=0.6111, Precision=0.2787, Recall=0.5862, F1=0.3778

--- Model Summary ---
Name                          : BalancedBagging-DecisionTree
Description                   : Bagging+Balanced+DT(max_depth=6)+5Fold
Accuracy                      : 0.6795
Precision                     : 0.3433
Recall                        : 0.5947
F1 Score                      : 0.4290

CSV Row Format:
BalancedBagging-DecisionTree,Bagging+Balanced+DT(max_depth=6)+5Fold,0.6795,0.3433,0.5947,0.4290


# EasyEnsembleClassifier

In [35]:
import os
import numpy as np
import pandas as pd
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Dataset
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Feature columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numeric_cols)
])

# Classifier: EasyEnsemble with default AdaBoost base
clf = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42,
    n_jobs=-1
)

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf)
])

# Stratified K-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Aggregate metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Metadata
model_name = 'EasyEnsembleClassifier'
model_desc = 'Ensemble+Undersampling+AdaBoost+5Fold'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.6690, Precision=0.3137, Recall=0.5517, F1=0.4000
Fold 2: Accuracy=0.7172, Precision=0.3571, Recall=0.5172, F1=0.4225
Fold 3: Accuracy=0.5931, Precision=0.2857, Recall=0.6897, F1=0.4040
Fold 4: Accuracy=0.6069, Precision=0.3247, Recall=0.8333, F1=0.4673
Fold 5: Accuracy=0.5486, Precision=0.2429, Recall=0.5862, F1=0.3434

--- Model Summary ---
Name                          : EasyEnsembleClassifier
Description                   : Ensemble+Undersampling+AdaBoost+5Fold
Accuracy                      : 0.6270
Precision                     : 0.3048
Recall                        : 0.6356
F1 Score                      : 0.4075

CSV Row Format:
EasyEnsembleClassifier,Ensemble+Undersampling+AdaBoost+5Fold,0.6270,0.3048,0.6356,0.4075


# EasyEnsembleClassifier

In [36]:
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.ensemble import EasyEnsembleClassifier
import warnings

warnings.filterwarnings('ignore')

# Target and features
current_df = df_clean.copy()  # Ensure `df_clean` is already cleaned
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Initialize EasyEnsembleClassifier
base_model = EasyEnsembleClassifier(random_state=42, n_estimators=10)

# Preprocessing for categorical and numeric features
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Feature selection
feature_selector = SelectFromModel(estimator=RandomForestClassifier(random_state=42), max_features=20)

# Threshold tuning range
thresholds = np.linspace(0.1, 0.5, 5)

# Pipeline setup
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('classifier', base_model)
])

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_probs = pipeline.predict_proba(X_val)[:, 1]

    # Threshold tuning
    best_metrics = {'acc': 0, 'prec': 0, 'rec': 0, 'f1': 0, 'threshold': 0}
    for thresh in thresholds:
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        
        if f1 > best_metrics['f1']:
            best_metrics = {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'threshold': thresh}
    
    accuracy_list.append(best_metrics['acc'])
    precision_list.append(best_metrics['prec'])
    recall_list.append(best_metrics['rec'])
    f1_list.append(best_metrics['f1'])
    
    print(f"Fold {fold}: Threshold={best_metrics['threshold']:.2f}, Accuracy={best_metrics['acc']:.4f}, "
          f"Precision={best_metrics['prec']:.4f}, Recall={best_metrics['rec']:.4f}, F1={best_metrics['f1']:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'EasyEnsembleClassifier+ThresholdTuning'
model_desc = 'EEC-ThresholdTuning-5Fold'

# Print formatted summary
print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Threshold=0.40, Accuracy=0.4276, Precision=0.2500, Recall=0.9310, F1=0.3942
Fold 2: Threshold=0.50, Accuracy=0.6966, Precision=0.3469, Recall=0.5862, F1=0.4359
Fold 3: Threshold=0.50, Accuracy=0.6000, Precision=0.2899, Recall=0.6897, F1=0.4082
Fold 4: Threshold=0.50, Accuracy=0.6552, Precision=0.3571, Recall=0.8333, F1=0.5000
Fold 5: Threshold=0.50, Accuracy=0.5833, Precision=0.2754, Recall=0.6552, F1=0.3878

--- Model Summary ---
Name                          : EasyEnsembleClassifier+ThresholdTuning
Description                   : EEC-ThresholdTuning-5Fold
Accuracy                      : 0.5925
Precision                     : 0.3039
Recall                        : 0.7391
F1 Score                      : 0.4252


In [37]:
import os
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

# --- Custom Transformer for Top 20 Features ---
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, model, k=20):
        self.model = model
        self.k = k
        self.top_indices = None

    def fit(self, X, y):
        self.model.fit(X, y)
        if hasattr(self.model, "feature_importances_"):
            importances = self.model.feature_importances_
        else:
            raise AttributeError("Model must have feature_importances_")
        self.top_indices = np.argsort(importances)[::-1][:self.k]
        return self

    def transform(self, X):
        return X[:, self.top_indices]

# --- Load data ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# --- Identify column types ---
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# --- Preprocessing ---
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Pipeline components ---
rf_for_selection = RandomForestClassifier(n_estimators=100, random_state=42)
top_k_selector = TopFeatureSelector(model=rf_for_selection, k=20)

model = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', top_k_selector),
    ('model', model)
])

# --- CV and threshold tuning ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.arange(0.1, 0.51, 0.05)

best_metrics = {'threshold': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0}

# Try all thresholds
for threshold in thresholds:
    acc_list, prec_list, rec_list, f1_list = [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train, y_train)
        y_proba = pipeline.predict_proba(X_val)[:, 1]
        y_pred = (y_proba >= threshold).astype(int)

        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        acc_list.append(acc)
        prec_list.append(prec)
        rec_list.append(rec)
        f1_list.append(f1)

    mean_acc = np.mean(acc_list)
    mean_prec = np.mean(prec_list)
    mean_rec = np.mean(rec_list)
    mean_f1 = np.mean(f1_list)

    print(f"Threshold={threshold:.2f} | Accuracy={mean_acc:.4f}, Precision={mean_prec:.4f}, Recall={mean_rec:.4f}, F1={mean_f1:.4f}")

    if (
        mean_rec > best_metrics['recall'] and
        mean_prec > 0.6 and
        mean_f1 > 0.7 and
        mean_acc > 0.8
    ):
        best_metrics.update({
            'threshold': threshold,
            'accuracy': mean_acc,
            'precision': mean_prec,
            'recall': mean_rec,
            'f1': mean_f1
        })

# --- Fallback if no threshold met all strict criteria ---
if best_metrics['f1'] == 0:
    print("\nNo threshold met strict criteria. Falling back to best F1 score.")
    best_f1 = 0
    for threshold in thresholds:
        acc_list, prec_list, rec_list, f1_list = [], [], [], []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            pipeline.fit(X_train, y_train)
            y_proba = pipeline.predict_proba(X_val)[:, 1]
            y_pred = (y_proba >= threshold).astype(int)

            acc_list.append(accuracy_score(y_val, y_pred))
            prec_list.append(precision_score(y_val, y_pred, zero_division=0))
            rec_list.append(recall_score(y_val, y_pred))
            f1_list.append(f1_score(y_val, y_pred))

        mean_f1 = np.mean(f1_list)
        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_metrics.update({
                'threshold': threshold,
                'accuracy': np.mean(acc_list),
                'precision': np.mean(prec_list),
                'recall': np.mean(rec_list),
                'f1': mean_f1
            })

# --- Reporting ---
model_name = 'EasyEnsemble-Top20Feat+Thresh'
model_desc = f'5Fold-EEC+Top20Selector+ThreshTuned-{best_metrics["threshold"]:.2f}'

print("\n--- Best Threshold Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {best_metrics['accuracy']:.4f}")
print(f"{'Precision':<30}: {best_metrics['precision']:.4f}")
print(f"{'Recall':<30}: {best_metrics['recall']:.4f}")
print(f"{'F1 Score':<30}: {best_metrics['f1']:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{best_metrics['accuracy']:.4f},{best_metrics['precision']:.4f},{best_metrics['recall']:.4f},{best_metrics['f1']:.4f}")

# --- Save results ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(best_metrics['accuracy'], 4),
    'Precision': round(best_metrics['precision'], 4),
    'Recall': round(best_metrics['recall'], 4),
    'F1 Score': round(best_metrics['f1'], 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Threshold=0.10 | Accuracy=0.2017, Precision=0.2017, Recall=1.0000, F1=0.3356
Threshold=0.15 | Accuracy=0.2017, Precision=0.2017, Recall=1.0000, F1=0.3356
Threshold=0.20 | Accuracy=0.2017, Precision=0.2017, Recall=1.0000, F1=0.3356
Threshold=0.25 | Accuracy=0.2030, Precision=0.2019, Recall=1.0000, F1=0.3360
Threshold=0.30 | Accuracy=0.2459, Precision=0.2113, Recall=1.0000, F1=0.3488
Threshold=0.35 | Accuracy=0.3080, Precision=0.2258, Recall=1.0000, F1=0.3684
Threshold=0.40 | Accuracy=0.3522, Precision=0.2349, Recall=0.9793, F1=0.3788
Threshold=0.45 | Accuracy=0.4848, Precision=0.2653, Recall=0.8761, F1=0.4067
Threshold=0.50 | Accuracy=0.6491, Precision=0.3220, Recall=0.6563, F1=0.4283

No threshold met strict criteria. Falling back to best F1 score.

--- Best Threshold Tuned Model Summary ---
Name                          : EasyEnsemble-Top20Feat+Thresh
Description                   : 5Fold-EEC+Top20Selector+ThreshTuned-0.50
Accuracy                      : 0.6491
Precision              

# BalancedBaggingClassifier + LightGBM

In [38]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# --- Data setup ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# --- Column types ---
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# --- Preprocessing ---
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Classifier setup ---
lgbm = LGBMClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

bbc = BalancedBaggingClassifier(
    estimator=lgbm,
    n_estimators=10,
    sampling_strategy='auto',
    replacement=False,
    random_state=42,
    n_jobs=-1
)

# --- Full pipeline ---
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', bbc)
])

# --- Evaluation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.arange(0.1, 0.51, 0.05)
best_metrics = {'threshold': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0}

for threshold in thresholds:
    acc_list, prec_list, rec_list, f1_list = [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train, y_train)
        y_proba = pipeline.predict_proba(X_val)[:, 1]
        y_pred = (y_proba >= threshold).astype(int)

        acc_list.append(accuracy_score(y_val, y_pred))
        prec_list.append(precision_score(y_val, y_pred, zero_division=0))
        rec_list.append(recall_score(y_val, y_pred))
        f1_list.append(f1_score(y_val, y_pred))

    mean_acc = np.mean(acc_list)
    mean_prec = np.mean(prec_list)
    mean_rec = np.mean(rec_list)
    mean_f1 = np.mean(f1_list)

    print(f"Threshold={threshold:.2f} | Accuracy={mean_acc:.4f}, Precision={mean_prec:.4f}, Recall={mean_rec:.4f}, F1={mean_f1:.4f}")

    if (
        mean_rec > best_metrics['recall'] and
        mean_prec > 0.6 and
        mean_f1 > 0.7 and
        mean_acc > 0.8
    ):
        best_metrics.update({
            'threshold': threshold,
            'accuracy': mean_acc,
            'precision': mean_prec,
            'recall': mean_rec,
            'f1': mean_f1
        })

# --- Fallback to best F1 if no strict threshold matched ---
if best_metrics['f1'] == 0:
    print("\nNo threshold met strict criteria. Falling back to best F1 score.")
    best_f1 = 0
    for threshold in thresholds:
        acc_list, prec_list, rec_list, f1_list = [], [], [], []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            pipeline.fit(X_train, y_train)
            y_proba = pipeline.predict_proba(X_val)[:, 1]
            y_pred = (y_proba >= threshold).astype(int)

            acc_list.append(accuracy_score(y_val, y_pred))
            prec_list.append(precision_score(y_val, y_pred, zero_division=0))
            rec_list.append(recall_score(y_val, y_pred))
            f1_list.append(f1_score(y_val, y_pred))

        mean_f1 = np.mean(f1_list)
        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_metrics.update({
                'threshold': threshold,
                'accuracy': np.mean(acc_list),
                'precision': np.mean(prec_list),
                'recall': np.mean(rec_list),
                'f1': mean_f1
            })

# --- Reporting ---
model_name = 'BalancedBagging-LGBM'
model_desc = f'5Fold-BBC+LGBM-ThresholdTuned-{best_metrics["threshold"]:.2f}'

print("\n--- Best Threshold Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {best_metrics['accuracy']:.4f}")
print(f"{'Precision':<30}: {best_metrics['precision']:.4f}")
print(f"{'Recall':<30}: {best_metrics['recall']:.4f}")
print(f"{'F1 Score':<30}: {best_metrics['f1']:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{best_metrics['accuracy']:.4f},{best_metrics['precision']:.4f},{best_metrics['recall']:.4f},{best_metrics['f1']:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(best_metrics['accuracy'], 4),
    'Precision': round(best_metrics['precision'], 4),
    'Recall': round(best_metrics['recall'], 4),
    'F1 Score': round(best_metrics['f1'], 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Threshold=0.10 | Accuracy=0.4019, Precision=0.2461, Recall=0.9517, F1=0.3910
Threshold=0.15 | Accuracy=0.4557, Precision=0.2591, Recall=0.9039, F1=0.4024
Threshold=0.20 | Accuracy=0.4957, Precision=0.2696, Recall=0.8625, F1=0.4099
Threshold=0.25 | Accuracy=0.5358, Precision=0.2777, Recall=0.8005, F1=0.4107
Threshold=0.30 | Accuracy=0.5634, Precision=0.2841, Recall=0.7593, F1=0.4120
Threshold=0.35 | Accuracy=0.5993, Precision=0.2997, Recall=0.7248, F1=0.4213
Threshold=0.40 | Accuracy=0.6214, Precision=0.3018, Recall=0.6630, F1=0.4114
Threshold=0.45 | Accuracy=0.6491, Precision=0.3142, Recall=0.6218, F1=0.4129
Threshold=0.50 | Accuracy=0.6684, Precision=0.3048, Recall=0.5062, F1=0.3758

No threshold met strict criteria. Falling back to best F1 score.

--- Best Threshold Tuned Model Summary ---
Name                          : BalancedBagging-LGBM
Description                   : 5Fold-BBC+LGBM-ThresholdTuned-0.35
Accuracy                      : 0.5993
Precision                     : 0.2997

# StackingClassifier with Threshold Tuning

In [39]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier

import warnings
warnings.filterwarnings('ignore')

# --- Data prep ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# --- Preprocessing ---
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Base models ---
cat = CatBoostClassifier(verbose=0, random_state=42)
lgbm = LGBMClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
svc = SVC(kernel='rbf', C=1, probability=True, random_state=42)
bbc = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=6, random_state=42),
    n_estimators=10,
    sampling_strategy='auto',
    replacement=False,
    random_state=42,
    n_jobs=-1
)

# --- Meta model ---
meta_model = LogisticRegression(max_iter=1000)

# --- Stacking ---
stacking_model = StackingClassifier(
    estimators=[
        ('cat', cat),
        ('lgbm', lgbm),
        ('svc', svc),
        ('bbc', bbc)
    ],
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=True
)

# --- Full pipeline ---
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', stacking_model)
])

# --- CV + Threshold tuning ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.arange(0.1, 0.51, 0.05)
best_metrics = {'threshold': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0}

for threshold in thresholds:
    acc_list, prec_list, rec_list, f1_list = [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train, y_train)
        y_proba = pipeline.predict_proba(X_val)[:, 1]
        y_pred = (y_proba >= threshold).astype(int)

        acc_list.append(accuracy_score(y_val, y_pred))
        prec_list.append(precision_score(y_val, y_pred, zero_division=0))
        rec_list.append(recall_score(y_val, y_pred))
        f1_list.append(f1_score(y_val, y_pred))

    mean_acc = np.mean(acc_list)
    mean_prec = np.mean(prec_list)
    mean_rec = np.mean(rec_list)
    mean_f1 = np.mean(f1_list)

    print(f"Threshold={threshold:.2f} | Accuracy={mean_acc:.4f}, Precision={mean_prec:.4f}, Recall={mean_rec:.4f}, F1={mean_f1:.4f}")

    if (
        mean_rec > best_metrics['recall'] and
        mean_prec > 0.6 and
        mean_f1 > 0.7 and
        mean_acc > 0.8
    ):
        best_metrics.update({
            'threshold': threshold,
            'accuracy': mean_acc,
            'precision': mean_prec,
            'recall': mean_rec,
            'f1': mean_f1
        })

# --- Fallback: best F1 if strict criteria fail ---
if best_metrics['f1'] == 0:
    print("\nNo threshold met strict criteria. Falling back to best F1 score.")
    best_f1 = 0
    for threshold in thresholds:
        acc_list, prec_list, rec_list, f1_list = [], [], [], []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            pipeline.fit(X_train, y_train)
            y_proba = pipeline.predict_proba(X_val)[:, 1]
            y_pred = (y_proba >= threshold).astype(int)

            acc_list.append(accuracy_score(y_val, y_pred))
            prec_list.append(precision_score(y_val, y_pred, zero_division=0))
            rec_list.append(recall_score(y_val, y_pred))
            f1_list.append(f1_score(y_val, y_pred))

        mean_f1 = np.mean(f1_list)
        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_metrics.update({
                'threshold': threshold,
                'accuracy': np.mean(acc_list),
                'precision': np.mean(prec_list),
                'recall': np.mean(rec_list),
                'f1': mean_f1
            })

# --- Reporting ---
model_name = 'Stacking-CatLGBMSVCBBC'
model_desc = f'StackingCatLGBMSVCBBC+LogRegMeta+ThreshTuned-{best_metrics["threshold"]:.2f}'

print("\n--- Best Threshold Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {best_metrics['accuracy']:.4f}")
print(f"{'Precision':<30}: {best_metrics['precision']:.4f}")
print(f"{'Recall':<30}: {best_metrics['recall']:.4f}")
print(f"{'F1 Score':<30}: {best_metrics['f1']:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{best_metrics['accuracy']:.4f},{best_metrics['precision']:.4f},{best_metrics['recall']:.4f},{best_metrics['f1']:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(best_metrics['accuracy'], 4),
    'Precision': round(best_metrics['precision'], 4),
    'Recall': round(best_metrics['recall'], 4),
    'F1 Score': round(best_metrics['f1'], 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Threshold=0.10 | Accuracy=0.4364, Precision=0.2368, Recall=0.8071, F1=0.3655
Threshold=0.15 | Accuracy=0.5676, Precision=0.2712, Recall=0.6701, F1=0.3848
Threshold=0.20 | Accuracy=0.6532, Precision=0.3174, Recall=0.5949, F1=0.4107
Threshold=0.25 | Accuracy=0.6988, Precision=0.3353, Recall=0.4589, F1=0.3843
Threshold=0.30 | Accuracy=0.7209, Precision=0.3338, Recall=0.3563, F1=0.3411
Threshold=0.35 | Accuracy=0.7320, Precision=0.3234, Recall=0.2883, F1=0.3022
Threshold=0.40 | Accuracy=0.7625, Precision=0.3769, Recall=0.2405, F1=0.2896
Threshold=0.45 | Accuracy=0.7790, Precision=0.4052, Recall=0.1784, F1=0.2443
Threshold=0.50 | Accuracy=0.7901, Precision=0.4481, Recall=0.1237, F1=0.1867

No threshold met strict criteria. Falling back to best F1 score.

--- Best Threshold Tuned Model Summary ---
Name                          : Stacking-CatLGBMSVCBBC
Description                   : StackingCatLGBMSVCBBC+LogRegMeta+ThreshTuned-0.20
Accuracy                      : 0.6532
Precision            

In [40]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# --- Prepare Data ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# --- Preprocessing ---
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Models ---
svc = Pipeline([
    ('pre', preprocessor),
    ('model', SVC(
        kernel='rbf',
        C=10, gamma=0.01,
        probability=True,
        random_state=42
    ))
])

adaboost = Pipeline([
    ('pre', preprocessor),
    ('model', AdaBoostClassifier(
        n_estimators=150,
        learning_rate=0.6,
        random_state=42
    ))
])

catboost = Pipeline([
    ('pre', preprocessor),
    ('model', CatBoostClassifier(
        iterations=250,
        learning_rate=0.04,
        depth=6,
        l2_leaf_reg=3,
        verbose=0,
        random_seed=42
    ))
])

# --- CV Setup ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# --- Weights & Threshold ---
weights = [1, 2, 2]  # svc, adaboost, catboost
threshold = 0.45

# --- 5-Fold Evaluation ---
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    svc.fit(X_train, y_train)
    adaboost.fit(X_train, y_train)
    catboost.fit(X_train, y_train)

    svc_proba = svc.predict_proba(X_val)[:, 1]
    ada_proba = adaboost.predict_proba(X_val)[:, 1]
    cat_proba = catboost.predict_proba(X_val)[:, 1]

    # Manual soft voting
    blended_proba = (
        weights[0] * svc_proba +
        weights[1] * ada_proba +
        weights[2] * cat_proba
    ) / sum(weights)

    y_pred = (blended_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# --- Final Metrics ---
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

model_name = 'ManualSoftVoting-Cat+Ada+SVC'
model_desc = 'ManualSoftVoting-Weights[1,2,2]-Thresh0.45'

print("\n--- Ensemble Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7862, Precision=0.3333, Recall=0.0690, F1=0.1143
Fold 2: Accuracy=0.8069, Precision=0.6000, Recall=0.1034, F1=0.1765
Fold 3: Accuracy=0.8276, Precision=0.7500, Recall=0.2069, F1=0.3243
Fold 4: Accuracy=0.7793, Precision=0.4286, Recall=0.2000, F1=0.2727
Fold 5: Accuracy=0.7639, Precision=0.3077, Recall=0.1379, F1=0.1905

--- Ensemble Model Summary ---
Name                          : ManualSoftVoting-Cat+Ada+SVC
Description                   : ManualSoftVoting-Weights[1,2,2]-Thresh0.45
Accuracy                      : 0.7928
Precision                     : 0.4839
Recall                        : 0.1434
F1 Score                      : 0.2157

CSV Row Format:
ManualSoftVoting-Cat+Ada+SVC,ManualSoftVoting-Weights[1,2,2]-Thresh0.45,0.7928,0.4839,0.1434,0.2157


In [41]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.ensemble import EasyEnsembleClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# --- Prepare Data ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# --- Preprocessing ---
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Models ---
xgb = Pipeline([
    ('pre', preprocessor),
    ('model', XGBClassifier(
        n_estimators=150,
        max_depth=5,
        learning_rate=0.08,
        subsample=0.85,
        colsample_bytree=0.8,
        scale_pos_weight=3,  # important for recall
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42
    ))
])

easy = Pipeline([
    ('pre', preprocessor),
    ('model', EasyEnsembleClassifier(
        n_estimators=10,
        sampling_strategy='auto',
        random_state=42
    ))
])

# --- Meta-Classifier ---
meta_clf = LogisticRegression(C=1.0, class_weight='balanced', solver='liblinear', random_state=42)

# --- CV Setup ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
threshold = 0.45  # Tune this if needed

# --- 5-Fold Manual Stacking ---
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    xgb.fit(X_train, y_train)
    easy.fit(X_train, y_train)

    xgb_proba = xgb.predict_proba(X_val)[:, 1]
    easy_proba = easy.predict_proba(X_val)[:, 1]

    # Stack probabilities for meta-classifier
    meta_X_train = np.vstack((xgb.predict_proba(X_train)[:, 1], easy.predict_proba(X_train)[:, 1])).T
    meta_y_train = y_train

    meta_X_val = np.vstack((xgb_proba, easy_proba)).T

    meta_clf.fit(meta_X_train, meta_y_train)
    meta_proba = meta_clf.predict_proba(meta_X_val)[:, 1]
    y_pred = (meta_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# --- Final Metrics ---
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

model_name = 'Stacking-XGB+EasyEnsemble'
model_desc = 'Base[XGB,Easy],Meta[LogReg],Thresh0.45'

print("\n--- Ensemble Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7793, Precision=0.4118, Recall=0.2414, F1=0.3043
Fold 2: Accuracy=0.7862, Precision=0.4000, Recall=0.1379, F1=0.2051
Fold 3: Accuracy=0.7931, Precision=0.4800, Recall=0.4138, F1=0.4444
Fold 4: Accuracy=0.7310, Precision=0.3200, Recall=0.2667, F1=0.2909
Fold 5: Accuracy=0.7292, Precision=0.3438, Recall=0.3793, F1=0.3607

--- Ensemble Model Summary ---
Name                          : Stacking-XGB+EasyEnsemble
Description                   : Base[XGB,Easy],Meta[LogReg],Thresh0.45
Accuracy                      : 0.7638
Precision                     : 0.3911
Recall                        : 0.2878
F1 Score                      : 0.3211

CSV Row Format:
Stacking-XGB+EasyEnsemble,Base[XGB,Easy],Meta[LogReg],Thresh0.45,0.7638,0.3911,0.2878,0.3211


In [42]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings

warnings.filterwarnings('ignore')

# --- Data prep ---
current_df = df_clean.copy()  # Make sure df_clean is defined already
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Column types
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# --- Base Learners ---
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)

eec = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42
)

# --- Meta Learner ---
meta_clf = LogisticRegression(C=1.0, class_weight='balanced', random_state=42)

# --- Stacking Classifier ---
stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', xgb),
        ('eec', eec)
    ],
    final_estimator=meta_clf,
    passthrough=True,
    cv=5,
    n_jobs=-1
)

# --- Final Pipeline ---
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', stacking_clf)
])

# --- Thresholds and Metrics ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.linspace(0.1, 0.5, 5)

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_probs = pipeline.predict_proba(X_val)[:, 1]

    best_metrics = {'acc': 0, 'prec': 0, 'rec': 0, 'f1': 0, 'threshold': 0}
    for thresh in thresholds:
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        if f1 > best_metrics['f1']:
            best_metrics = {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'threshold': thresh}

    accuracy_list.append(best_metrics['acc'])
    precision_list.append(best_metrics['prec'])
    recall_list.append(best_metrics['rec'])
    f1_list.append(best_metrics['f1'])

    print(f"Fold {fold}: Threshold={best_metrics['threshold']:.2f}, "
          f"Accuracy={best_metrics['acc']:.4f}, Precision={best_metrics['prec']:.4f}, "
          f"Recall={best_metrics['rec']:.4f}, F1={best_metrics['f1']:.4f}")

# --- Summary ---
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

model_name = 'Stacking-XGB+EEC'
model_desc = 'Hybrid-Stacking-XGB-EasyEnsemble-MetaLR'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Threshold=0.50, Accuracy=0.6690, Precision=0.2979, Recall=0.4828, F1=0.3684
Fold 2: Threshold=0.50, Accuracy=0.7310, Precision=0.3684, Recall=0.4828, F1=0.4179
Fold 3: Threshold=0.40, Accuracy=0.5448, Precision=0.2771, Recall=0.7931, F1=0.4107
Fold 4: Threshold=0.40, Accuracy=0.5517, Precision=0.2989, Recall=0.8667, F1=0.4444
Fold 5: Threshold=0.20, Accuracy=0.3611, Precision=0.2261, Recall=0.8966, F1=0.3611

--- Model Summary ---
Name                          : Stacking-XGB+EEC
Description                   : Hybrid-Stacking-XGB-EasyEnsemble-MetaLR
Accuracy                      : 0.5715
Precision                     : 0.2937
Recall                        : 0.7044
F1 Score                      : 0.4005


In [43]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings

warnings.filterwarnings('ignore')

# --- Data prep ---
current_df = df_clean.copy()  # Make sure df_clean is defined already
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Column types
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# --- Base Learners ---
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)

eec = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42
)

# --- Meta Learner ---
meta_clf = LogisticRegression(C=1.0, class_weight='balanced', random_state=42)

# --- Stacking Classifier ---
stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', xgb),
        ('eec', eec)
    ],
    final_estimator=meta_clf,
    passthrough=True,
    cv=5,
    n_jobs=-1
)

# --- Final Pipeline ---
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', stacking_clf)
])

# --- Thresholds and Metrics ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.linspace(0.1, 0.5, 5)

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_probs = pipeline.predict_proba(X_val)[:, 1]

    best_metrics = {'acc': 0, 'prec': 0, 'rec': 0, 'f1': 0, 'threshold': 0}
    for thresh in thresholds:
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        if f1 > best_metrics['f1']:
            best_metrics = {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'threshold': thresh}

    accuracy_list.append(best_metrics['acc'])
    precision_list.append(best_metrics['prec'])
    recall_list.append(best_metrics['rec'])
    f1_list.append(best_metrics['f1'])

    print(f"Fold {fold}: Threshold={best_metrics['threshold']:.2f}, "
          f"Accuracy={best_metrics['acc']:.4f}, Precision={best_metrics['prec']:.4f}, "
          f"Recall={best_metrics['rec']:.4f}, F1={best_metrics['f1']:.4f}")

# --- Summary ---
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

model_name = 'Stacking-XGB+EEC'
model_desc = 'Hybrid-Stacking-XGB-EasyEnsemble-MetaLR'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Threshold=0.50, Accuracy=0.6690, Precision=0.2979, Recall=0.4828, F1=0.3684
Fold 2: Threshold=0.50, Accuracy=0.7310, Precision=0.3684, Recall=0.4828, F1=0.4179
Fold 3: Threshold=0.40, Accuracy=0.5448, Precision=0.2771, Recall=0.7931, F1=0.4107
Fold 4: Threshold=0.40, Accuracy=0.5517, Precision=0.2989, Recall=0.8667, F1=0.4444
Fold 5: Threshold=0.20, Accuracy=0.3611, Precision=0.2261, Recall=0.8966, F1=0.3611

--- Model Summary ---
Name                          : Stacking-XGB+EEC
Description                   : Hybrid-Stacking-XGB-EasyEnsemble-MetaLR
Accuracy                      : 0.5715
Precision                     : 0.2937
Recall                        : 0.7044
F1 Score                      : 0.4005


In [44]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.utils.class_weight import compute_class_weight

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Integer

import warnings
warnings.filterwarnings('ignore')

# ------------------ Data Setup ------------------
X = df_clean.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = df_clean['Risk Flag'].astype(int)

# ------------------ Class Weights for CatBoost ------------------
class_weights_array = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
catboost_weights = {int(cls): weight for cls, weight in zip(np.unique(y), class_weights_array)}

# ------------------ Preprocessor ------------------
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=np.number).columns.tolist()

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", StandardScaler(), num_cols)
])

# ------------------ Base Estimators + BayesSearchCV ------------------

# CatBoost
catboost = CatBoostClassifier(verbose=0, class_weights=catboost_weights)
catboost_search = BayesSearchCV(
    catboost,
    search_spaces={
        'depth': Integer(3, 8),
        'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'iterations': Integer(100, 300)
    },
    n_iter=10,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    random_state=42
)

# LightGBM
lgbm = LGBMClassifier(class_weight='balanced')
lgbm_search = BayesSearchCV(
    lgbm,
    search_spaces={
        'num_leaves': Integer(20, 60),
        'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'n_estimators': Integer(100, 300)
    },
    n_iter=10,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    random_state=42
)

# RBF-SVC
svc = SVC(probability=True, kernel='rbf', class_weight='balanced')
svc_search = BayesSearchCV(
    svc,
    search_spaces={
        'C': Real(0.1, 100, prior='log-uniform'),
        'gamma': Real(1e-4, 1e-1, prior='log-uniform')
    },
    n_iter=10,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    random_state=42
)

# ------------------ Stacking Classifier ------------------

stacking_clf = StackingClassifier(
    estimators=[
        ('catboost', catboost_search),
        ('lgbm', lgbm_search),
        ('svc', svc_search)
    ],
    final_estimator=LogisticRegression(class_weight='balanced'),
    passthrough=True,
    cv=3,
    n_jobs=-1
)

# Full pipeline
model = Pipeline([
    ("pre", preprocessor),
    ("stack", stacking_clf)
])

# ------------------ CV Evaluation ------------------

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# ------------------ Final Metrics ------------------

mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

model_name = 'Stacking-CatBoost+LGBM+SVC'
model_desc = 'BayesTuned Base, Logistic Meta, 5Fold Stratified'

print("\n--- Ensemble Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.6690, Precision=0.2889, Recall=0.4483, F1=0.3514
Fold 2: Accuracy=0.7379, Precision=0.3846, Recall=0.5172, F1=0.4412
Fold 3: Accuracy=0.6000, Precision=0.2769, Recall=0.6207, F1=0.3830
Fold 4: Accuracy=0.6207, Precision=0.2951, Recall=0.6000, F1=0.3956
Fold 5: Accuracy=0.5417, Precision=0.2063, Recall=0.4483, F1=0.2826

--- Ensemble Model Summary ---
Name                          : Stacking-CatBoost+LGBM+SVC
Description                   : BayesTuned Base, Logistic Meta, 5Fold Stratified
Accuracy                      : 0.6339
Precision                     : 0.2904
Recall                        : 0.5269
F1 Score                      : 0.3707


In [45]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from imblearn.ensemble import EasyEnsembleClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.filterwarnings('ignore')

# ========== Dataset ==========
# Copy clean DataFrame
current_df = df_clean.copy()

# Separate features and target
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# ========== Feature Selection ==========

# Step 1: Drop low-variance numeric features
selector = VarianceThreshold(threshold=0.01)
X_numeric = X[numeric_cols]
X_var = selector.fit_transform(X_numeric)
kept_low_var_cols = X_numeric.columns[selector.get_support()].tolist()

# Step 2: Drop high-VIF features (threshold > 10)
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

X_vif = X[kept_low_var_cols].copy()
while True:
    vif = calculate_vif(X_vif)
    max_vif = vif['VIF'].max()
    if max_vif > 10:
        drop_feat = vif.sort_values('VIF', ascending=False).iloc[0]['feature']
        X_vif.drop(columns=[drop_feat], inplace=True)
    else:
        break

# Final selected numeric columns
selected_numeric_cols = X_vif.columns.tolist()

# Keep only selected features for modeling
X = pd.concat([X[selected_numeric_cols], X[categorical_cols]], axis=1)

# ========== Preprocessing Pipeline ==========
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), selected_numeric_cols)
])

# Classifier
clf = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42,
    n_jobs=-1
)

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf)
])

# ========== Cross-Validation ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# ========== Aggregate Metrics ==========
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Metadata
model_name = 'EasyEnsembleClassifier'
model_desc = 'DropLowVar+DropHighVIF+Ensemble+Undersample+5Fold'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# ========== Save to CSV ==========
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.6828, Precision=0.2927, Recall=0.4138, F1=0.3429
Fold 2: Accuracy=0.7655, Precision=0.4194, Recall=0.4483, F1=0.4333
Fold 3: Accuracy=0.7034, Precision=0.3654, Recall=0.6552, F1=0.4691
Fold 4: Accuracy=0.7379, Precision=0.4259, Recall=0.7667, F1=0.5476
Fold 5: Accuracy=0.6528, Precision=0.2766, Recall=0.4483, F1=0.3421

--- Model Summary ---
Name                          : EasyEnsembleClassifier
Description                   : DropLowVar+DropHighVIF+Ensemble+Undersample+5Fold
Accuracy                      : 0.7085
Precision                     : 0.3560
Recall                        : 0.5464
F1 Score                      : 0.4270

CSV Row Format:
EasyEnsembleClassifier,DropLowVar+DropHighVIF+Ensemble+Undersample+5Fold,0.7085,0.3560,0.5464,0.4270


In [46]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from imblearn.ensemble import EasyEnsembleClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.base import TransformerMixin

import warnings
warnings.filterwarnings('ignore')

# ========== Dataset ==========
# Copy clean DataFrame
current_df = df_clean.copy()

# Separate features and target
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# ========== Step 1: Drop low-variance + high-VIF ==========

# Drop low-variance numeric features
selector = VarianceThreshold(threshold=0.01)
X_numeric = X[numeric_cols]
X_var = selector.fit_transform(X_numeric)
kept_low_var_cols = X_numeric.columns[selector.get_support()].tolist()

# Drop high-VIF features
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

X_vif = X[kept_low_var_cols].copy()
while True:
    vif = calculate_vif(X_vif)
    max_vif = vif['VIF'].max()
    if max_vif > 10:
        drop_feat = vif.sort_values('VIF', ascending=False).iloc[0]['feature']
        X_vif.drop(columns=[drop_feat], inplace=True)
    else:
        break

selected_numeric_cols = X_vif.columns.tolist()

# ========== Step 2: Select top 25 features using mutual_info_classif ==========

# Temporarily preprocess categorical features
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_cat_encoded = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X.index
)

# Combine selected numeric and encoded categorical
X_combined = pd.concat([X[selected_numeric_cols], X_cat_encoded], axis=1)

# Compute mutual info
mi_scores = mutual_info_classif(X_combined, y, random_state=42)
mi_series = pd.Series(mi_scores, index=X_combined.columns)
top_25_features = mi_series.sort_values(ascending=False).head(25).index.tolist()

# Final selected feature set
X_final = X_combined[top_25_features]

# Identify final selected numeric and one-hot features
final_numeric = [col for col in top_25_features if col in selected_numeric_cols]
final_onehot = [col for col in top_25_features if col not in selected_numeric_cols]

# ========== Preprocessing Pipeline ==========
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), final_numeric)
], remainder='drop')

# Custom passthrough transformer for selected OHE columns
class SelectOHEColumns(TransformerMixin):
    def __init__(self, ohe_df, selected_cols):
        self.ohe_df = ohe_df
        self.selected_cols = selected_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return self.ohe_df.loc[X.index, self.selected_cols].values

# Combined transformer
from sklearn.pipeline import FeatureUnion

final_transformer = FeatureUnion([
    ("numeric", Pipeline([
        ("select", 'passthrough'),
        ("scale", StandardScaler())
    ])),
    ("onehot", SelectOHEColumns(X_cat_encoded, final_onehot))
])

# ========== Modeling ==========
clf = EasyEnsembleClassifier(n_estimators=10, random_state=42, n_jobs=-1)
pipeline = Pipeline([
    ('features', final_transformer),
    ('classifier', clf)
])

# ========== Cross-Validation ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_final, y), 1):
    X_train, X_val = X_final.iloc[train_idx], X_final.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# ========== Aggregate Metrics ==========
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Metadata
model_name = 'EasyEnsembleClassifier'
model_desc = 'LowVar+HighVIF+MI(top25)+5Fold'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# ========== Save to CSV ==========
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.6828, Precision=0.2927, Recall=0.4138, F1=0.3429
Fold 2: Accuracy=0.7724, Precision=0.4333, Recall=0.4483, F1=0.4407
Fold 3: Accuracy=0.7103, Precision=0.3725, Recall=0.6552, F1=0.4750
Fold 4: Accuracy=0.7310, Precision=0.4118, Recall=0.7000, F1=0.5185
Fold 5: Accuracy=0.6667, Precision=0.2889, Recall=0.4483, F1=0.3514

--- Model Summary ---
Name                          : EasyEnsembleClassifier
Description                   : LowVar+HighVIF+MI(top25)+5Fold
Accuracy                      : 0.7126
Precision                     : 0.3598
Recall                        : 0.5331
F1 Score                      : 0.4257

CSV Row Format:
EasyEnsembleClassifier,LowVar+HighVIF+MI(top25)+5Fold,0.7126,0.3598,0.5331,0.4257


In [47]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from imblearn.ensemble import EasyEnsembleClassifier, BalancedBaggingClassifier
from lightgbm import LGBMClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.base import TransformerMixin, BaseEstimator

import warnings
warnings.filterwarnings('ignore')

# ========== Dataset ==========
df = df_clean.copy()
X = df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = df['Risk Flag'].astype(int)

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# ========== Step 1: Drop Low-Variance + High-VIF ==========
selector = VarianceThreshold(threshold=0.01)
X_numeric = X[numeric_cols]
X_var = selector.fit_transform(X_numeric)
kept_low_var_cols = X_numeric.columns[selector.get_support()].tolist()

def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

X_vif = X[kept_low_var_cols].copy()
while True:
    vif = calculate_vif(X_vif)
    if vif['VIF'].max() > 10:
        to_drop = vif.sort_values('VIF', ascending=False).iloc[0]['feature']
        X_vif.drop(columns=[to_drop], inplace=True)
    else:
        break

selected_numeric_cols = X_vif.columns.tolist()

# ========== Step 2: Top 25 via mutual_info_classif ==========
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_cat_encoded = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X.index
)

X_combined = pd.concat([X[selected_numeric_cols], X_cat_encoded], axis=1)

mi_scores = mutual_info_classif(X_combined, y, random_state=42)
mi_series = pd.Series(mi_scores, index=X_combined.columns)
top_25_features = mi_series.sort_values(ascending=False).head(25).index.tolist()

X_final = X_combined[top_25_features]
final_numeric = [col for col in top_25_features if col in selected_numeric_cols]
final_onehot = [col for col in top_25_features if col not in selected_numeric_cols]

# ========== Transformer for Categorical Columns ==========
class SelectOHEColumns(BaseEstimator, TransformerMixin):
    def __init__(self, encoded_df, selected_cols):
        self.encoded_df = encoded_df
        self.selected_cols = selected_cols
    def fit(self, X, y=None): return self
    def transform(self, X): return self.encoded_df.loc[X.index, self.selected_cols].values

# ========== Feature Union ==========
final_transformer = FeatureUnion([
    ("numeric", Pipeline([
        ("scale", StandardScaler())
    ])),
    ("onehot", SelectOHEColumns(X_cat_encoded, final_onehot))
])

# ========== Models ==========
models = {
    'EasyEnsembleClassifier': EasyEnsembleClassifier(
        n_estimators=10,
        random_state=42,
        n_jobs=-1
    ),
    'BalancedBaggingClassifier_LGBM': BalancedBaggingClassifier(
        estimator=LGBMClassifier(
            n_estimators=100,
            learning_rate=0.05,
            class_weight='balanced',
            random_state=42
        ),
        n_estimators=10,
        sampling_strategy='auto',
        replacement=False,
        random_state=42,
        n_jobs=-1
    )
}

# ========== Cross-Validation + Logging ==========
csv_file = "risk_model_metrics.csv"
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rows = []

for model_name, clf in models.items():
    print(f"\n==== {model_name} ====")

    pipeline = Pipeline([
        ('features', final_transformer),
        ('clf', clf)
    ])

    acc_list, prec_list, rec_list, f1_list = [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X_final, y), 1):
        X_train, X_val = X_final.iloc[train_idx], X_final.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_val)

        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        acc_list.append(acc)
        prec_list.append(prec)
        rec_list.append(rec)
        f1_list.append(f1)

        print(f"Fold {fold}: Acc={acc:.4f}, Prec={prec:.4f}, Rec={rec:.4f}, F1={f1:.4f}")

    mean_acc = np.mean(acc_list)
    mean_prec = np.mean(prec_list)
    mean_rec = np.mean(rec_list)
    mean_f1 = np.mean(f1_list)

    print("--- Aggregate ---")
    print(f"Accuracy:  {mean_acc:.4f}")
    print(f"Precision: {mean_prec:.4f}")
    print(f"Recall:    {mean_rec:.4f}")
    print(f"F1 Score:  {mean_f1:.4f}")

    model_desc = 'LowVar+HighVIF+MI(top25)+5Fold'
    print("\nCSV Row Format:")
    print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

    rows.append({
        'Name': model_name,
        'Desc': model_desc,
        'Accuracy': round(mean_acc, 4),
        'Precision': round(mean_prec, 4),
        'Recall': round(mean_rec, 4),
        'F1 Score': round(mean_f1, 4)
    })

# ========== Save All Results ==========
df_result = pd.DataFrame(rows)
df_result.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))



==== EasyEnsembleClassifier ====
Fold 1: Acc=0.6828, Prec=0.2927, Rec=0.4138, F1=0.3429
Fold 2: Acc=0.7724, Prec=0.4333, Rec=0.4483, F1=0.4407
Fold 3: Acc=0.7103, Prec=0.3725, Rec=0.6552, F1=0.4750
Fold 4: Acc=0.7310, Prec=0.4118, Rec=0.7000, F1=0.5185
Fold 5: Acc=0.6667, Prec=0.2889, Rec=0.4483, F1=0.3514
--- Aggregate ---
Accuracy:  0.7126
Precision: 0.3598
Recall:    0.5331
F1 Score:  0.4257

CSV Row Format:
EasyEnsembleClassifier,LowVar+HighVIF+MI(top25)+5Fold,0.7126,0.3598,0.5331,0.4257

==== BalancedBaggingClassifier_LGBM ====
Fold 1: Acc=0.6483, Prec=0.2500, Rec=0.3793, F1=0.3014
Fold 2: Acc=0.7103, Prec=0.3333, Rec=0.4483, F1=0.3824
Fold 3: Acc=0.7103, Prec=0.3774, Rec=0.6897, F1=0.4878
Fold 4: Acc=0.6690, Prec=0.3500, Rec=0.7000, F1=0.4667
Fold 5: Acc=0.6181, Prec=0.2679, Rec=0.5172, F1=0.3529
--- Aggregate ---
Accuracy:  0.6712
Precision: 0.3157
Recall:    0.5469
F1 Score:  0.3982

CSV Row Format:
BalancedBaggingClassifier_LGBM,LowVar+HighVIF+MI(top25)+5Fold,0.6712,0.3157,0.

In [48]:
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.ensemble import EasyEnsembleClassifier
import warnings

warnings.filterwarnings('ignore')

# ========== Input ==========
current_df = df_clean.copy()  # Assumes cleaned DataFrame already exists
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# ========== CV Setup ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.linspace(0.1, 0.5, 5)

# ========== Preprocessing ==========
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# ========== Model Components ==========
feature_selector = SelectFromModel(estimator=RandomForestClassifier(random_state=42), max_features=20)
base_model = EasyEnsembleClassifier(random_state=42, n_estimators=10)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('classifier', base_model)
])

# ========== Metrics ==========
accuracy_list, precision_list, recall_list, f1_list, threshold_list = [], [], [], [], []

# ========== Cross-Validation ==========
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_probs = pipeline.predict_proba(X_val)[:, 1]

    best = {'thresh': None, 'acc': 0, 'prec': 0, 'rec': 0, 'f1': 0}

    for thresh in thresholds:
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        if f1 > best['f1']:
            best = {'thresh': thresh, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1}

    accuracy_list.append(best['acc'])
    precision_list.append(best['prec'])
    recall_list.append(best['rec'])
    f1_list.append(best['f1'])
    threshold_list.append(best['thresh'])

    print(f"Fold {fold}: Best Threshold={best['thresh']:.2f}, Acc={best['acc']:.4f}, "
          f"Prec={best['prec']:.4f}, Rec={best['rec']:.4f}, F1={best['f1']:.4f}")

# ========== Summary ==========
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)
mean_thresh = np.mean(threshold_list)

model_name = 'EasyEnsembleClassifier+ThresholdTuning'
model_desc = 'EEC-ThresholdTuning-5Fold'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Avg Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Avg Precision':<30}: {mean_prec:.4f}")
print(f"{'Avg Recall':<30}: {mean_rec:.4f}")
print(f"{'Avg F1 Score':<30}: {mean_f1:.4f}")
print(f"{'Avg Best Threshold':<30}: {mean_thresh:.2f}")

# ========== Save CSV ==========
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4),
    'Threshold': round(mean_thresh, 2)
}])

new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Best Threshold=0.40, Acc=0.4276, Prec=0.2500, Rec=0.9310, F1=0.3942
Fold 2: Best Threshold=0.50, Acc=0.6966, Prec=0.3469, Rec=0.5862, F1=0.4359
Fold 3: Best Threshold=0.50, Acc=0.6000, Prec=0.2899, Rec=0.6897, F1=0.4082
Fold 4: Best Threshold=0.50, Acc=0.6552, Prec=0.3571, Rec=0.8333, F1=0.5000
Fold 5: Best Threshold=0.50, Acc=0.5833, Prec=0.2754, Rec=0.6552, F1=0.3878

--- Model Summary ---
Name                          : EasyEnsembleClassifier+ThresholdTuning
Description                   : EEC-ThresholdTuning-5Fold
Avg Accuracy                  : 0.5925
Avg Precision                 : 0.3039
Avg Recall                    : 0.7391
Avg F1 Score                  : 0.4252
Avg Best Threshold            : 0.48


In [49]:
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore')

# ========== Input ==========
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# ========== Drop High VIF / Low Variance Columns (if known) ==========
# Optionally prefilter X here based on known analysis
# e.g., X = X.drop(columns=high_vif_cols)

# ========== CV Setup ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.arange(0.30, 0.71, 0.02)  # Finer sweep range

# ========== Preprocessing ==========
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# ========== Model Components ==========
feature_selector = SelectFromModel(estimator=RandomForestClassifier(random_state=42), max_features=20)
smote = SMOTE(random_state=42)
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('feature_selection', feature_selector),
    ('classifier', xgb_model)
])

# ========== Metrics ==========
accuracy_list, precision_list, recall_list, f1_list, threshold_list = [], [], [], [], []

# ========== Cross-Validation ==========
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_probs = pipeline.predict_proba(X_val)[:, 1]

    best = {'thresh': None, 'acc': 0, 'prec': 0, 'rec': 0, 'f1': 0}

    for thresh in thresholds:
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        if rec >= 0.85 and prec >= 0.5:  # Your hard target
            if f1 > best['f1']:
                best = {'thresh': thresh, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1}

    # If no threshold met both conditions, pick best F1 instead
    if best['thresh'] is None:
        for thresh in thresholds:
            y_pred = (y_probs >= thresh).astype(int)
            acc = accuracy_score(y_val, y_pred)
            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred)
            f1 = f1_score(y_val, y_pred)
            if f1 > best['f1']:
                best = {'thresh': thresh, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1}

    accuracy_list.append(best['acc'])
    precision_list.append(best['prec'])
    recall_list.append(best['rec'])
    f1_list.append(best['f1'])
    threshold_list.append(best['thresh'])

    print(f"Fold {fold}: Best Threshold={best['thresh']:.2f}, Acc={best['acc']:.4f}, "
          f"Prec={best['prec']:.4f}, Rec={best['rec']:.4f}, F1={best['f1']:.4f}")

# ========== Summary ==========
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)
mean_thresh = np.mean(threshold_list)

model_name = 'XGBoost-SMOTE-FineTuned-ThresholdSweep'
model_desc = 'XGB+SMOTE+FineTuned+ThreshSweep(0.30–0.70)'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Avg Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Avg Precision':<30}: {mean_prec:.4f}")
print(f"{'Avg Recall':<30}: {mean_rec:.4f}")
print(f"{'Avg F1 Score':<30}: {mean_f1:.4f}")
print(f"{'Avg Best Threshold':<30}: {mean_thresh:.2f}")

# ========== Save CSV ==========
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4),
    'Threshold': round(mean_thresh, 2)
}])

new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Best Threshold=0.38, Acc=0.7103, Prec=0.2903, Rec=0.3103, F1=0.3000
Fold 2: Best Threshold=0.34, Acc=0.7517, Prec=0.3704, Rec=0.3448, F1=0.3571
Fold 3: Best Threshold=0.62, Acc=0.7793, Prec=0.4400, Rec=0.3793, F1=0.4074
Fold 4: Best Threshold=0.30, Acc=0.6690, Prec=0.2857, Rec=0.4000, F1=0.3333
Fold 5: Best Threshold=0.32, Acc=0.6667, Prec=0.3208, Rec=0.5862, F1=0.4146

--- Model Summary ---
Name                          : XGBoost-SMOTE-FineTuned-ThresholdSweep
Description                   : XGB+SMOTE+FineTuned+ThreshSweep(0.30–0.70)
Avg Accuracy                  : 0.7154
Avg Precision                 : 0.3414
Avg Recall                    : 0.4041
Avg F1 Score                  : 0.3625
Avg Best Threshold            : 0.39


In [50]:
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore')

# ========== Input ==========
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# ========== CV Setup ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.linspace(0.05, 0.6, 12)  # More granular hit & try

# ========== Preprocessing ==========
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# ========== Model Components ==========
feature_selector = SelectFromModel(estimator=RandomForestClassifier(random_state=42), max_features=25)
smote = SMOTE(random_state=42)
xgb_model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.7,
    colsample_bytree=0.7,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('feature_selection', feature_selector),
    ('classifier', xgb_model)
])

# ========== Metrics ==========
accuracy_list, precision_list, recall_list, f1_list, threshold_list = [], [], [], [], []

# ========== Cross-Validation ==========
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_probs = pipeline.predict_proba(X_val)[:, 1]

    best = {'thresh': None, 'acc': 0, 'prec': 0, 'rec': 0, 'f1': 0}

    for thresh in thresholds:
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        if f1 > best['f1']:
            best = {'thresh': thresh, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1}

    accuracy_list.append(best['acc'])
    precision_list.append(best['prec'])
    recall_list.append(best['rec'])
    f1_list.append(best['f1'])
    threshold_list.append(best['thresh'])

    print(f"Fold {fold}: Best Threshold={best['thresh']:.2f}, Acc={best['acc']:.4f}, "
          f"Prec={best['prec']:.4f}, Rec={best['rec']:.4f}, F1={best['f1']:.4f}")

# ========== Summary ==========
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)
mean_thresh = np.mean(threshold_list)

model_name = 'XGBoost-HitAndTry-ThresholdSweep'
model_desc = 'XGB+SMOTE+WideThreshSweep+NoVIFDrop'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Avg Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Avg Precision':<30}: {mean_prec:.4f}")
print(f"{'Avg Recall':<30}: {mean_rec:.4f}")
print(f"{'Avg F1 Score':<30}: {mean_f1:.4f}")
print(f"{'Avg Best Threshold':<30}: {mean_thresh:.2f}")

# ========== Save CSV ==========
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4),
    'Threshold': round(mean_thresh, 2)
}])

new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Best Threshold=0.15, Acc=0.6483, Prec=0.3167, Rec=0.6552, F1=0.4270
Fold 2: Best Threshold=0.05, Acc=0.5517, Prec=0.2907, Rec=0.8621, F1=0.4348
Fold 3: Best Threshold=0.50, Acc=0.7793, Prec=0.4595, Rec=0.5862, F1=0.5152
Fold 4: Best Threshold=0.20, Acc=0.6552, Prec=0.3529, Rec=0.8000, F1=0.4898
Fold 5: Best Threshold=0.10, Acc=0.5208, Prec=0.2727, Rec=0.8276, F1=0.4103

--- Model Summary ---
Name                          : XGBoost-HitAndTry-ThresholdSweep
Description                   : XGB+SMOTE+WideThreshSweep+NoVIFDrop
Avg Accuracy                  : 0.6311
Avg Precision                 : 0.3385
Avg Recall                    : 0.7462
Avg F1 Score                  : 0.4554
Avg Best Threshold            : 0.20


In [51]:
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_selection import mutual_info_classif
import warnings

warnings.filterwarnings('ignore')

# ========== Input ==========
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# ========== Top 20 Features via MI ==========
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
X_enc = pd.get_dummies(X, columns=cat_cols)
mi_scores = mutual_info_classif(X_enc, y, discrete_features='auto', random_state=42)
mi_df = pd.Series(mi_scores, index=X_enc.columns).sort_values(ascending=False)
top20_features = mi_df.head(20).index.tolist()
X = X_enc[top20_features]

# ========== CV Setup ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.linspace(0.05, 0.6, 12)

# ========== Preprocessing ==========
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), top20_features)
])

# ========== Model ==========
model = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=6, random_state=42),
    n_estimators=10,
    sampling_strategy='auto',
    replacement=False,
    random_state=42
)

pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# ========== Metrics ==========
accuracy_list, precision_list, recall_list, f1_list, threshold_list = [], [], [], [], []

# ========== Cross-Validation ==========
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_probs = pipeline.predict_proba(X_val)[:, 1]

    best = {'thresh': None, 'acc': 0, 'prec': 0, 'rec': 0, 'f1': 0}

    for thresh in thresholds:
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        if f1 > best['f1']:
            best = {'thresh': thresh, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1}

    accuracy_list.append(best['acc'])
    precision_list.append(best['prec'])
    recall_list.append(best['rec'])
    f1_list.append(best['f1'])
    threshold_list.append(best['thresh'])

    print(f"Fold {fold}: Best Threshold={best['thresh']:.2f}, Acc={best['acc']:.4f}, "
          f"Prec={best['prec']:.4f}, Rec={best['rec']:.4f}, F1={best['f1']:.4f}")

# ========== Summary ==========
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)
mean_thresh = np.mean(threshold_list)

model_name = 'BaggingDT+Balanced+Top20+ThreshTuned'
model_desc = 'BaggingDT+Top20MI+ThreshSweep(0.05–0.6)'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Avg Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Avg Precision':<30}: {mean_prec:.4f}")
print(f"{'Avg Recall':<30}: {mean_rec:.4f}")
print(f"{'Avg F1 Score':<30}: {mean_f1:.4f}")
print(f"{'Avg Best Threshold':<30}: {mean_thresh:.2f}")

# ========== Save CSV ==========
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4),
    'Threshold': round(mean_thresh, 2)
}])

new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Best Threshold=0.35, Acc=0.5862, Prec=0.2987, Rec=0.7931, F1=0.4340
Fold 2: Best Threshold=0.25, Acc=0.5931, Prec=0.3125, Rec=0.8621, F1=0.4587
Fold 3: Best Threshold=0.40, Acc=0.5655, Prec=0.2821, Rec=0.7586, F1=0.4112
Fold 4: Best Threshold=0.45, Acc=0.5931, Prec=0.3165, Rec=0.8333, F1=0.4587
Fold 5: Best Threshold=0.20, Acc=0.4167, Prec=0.2430, Rec=0.8966, F1=0.3824

--- Model Summary ---
Name                          : BaggingDT+Balanced+Top20+ThreshTuned
Description                   : BaggingDT+Top20MI+ThreshSweep(0.05–0.6)
Avg Accuracy                  : 0.5509
Avg Precision                 : 0.2905
Avg Recall                    : 0.8287
Avg F1 Score                  : 0.4290
Avg Best Threshold            : 0.33


In [59]:
import os
import numpy as np
import pandas as pd
import warnings
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings("ignore")

# ========== Load & Prepare ==========
current_df = df_clean.copy()
X = current_df.drop(columns=["Risk Flag"])
y = current_df["Risk Flag"].astype(int)

# ========== Column Diagnostics ==========
obj_cols = X.select_dtypes(include="object").columns


# ========== One-Hot Encode for MI ==========
X_enc = pd.get_dummies(X, columns=obj_cols)
mi_scores = mutual_info_classif(X_enc, y, discrete_features='auto', random_state=42)
mi_df = pd.Series(mi_scores, index=X_enc.columns).sort_values(ascending=False)
top20_features = mi_df.head(20).index.tolist()

# Keep only top 20
X_top20 = X_enc[top20_features]

# ========== CV Setup ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.linspace(0.05, 0.6, 12)

# ========== Pipeline ==========
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), top20_features)
])

model = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=6, random_state=42),
    n_estimators=10,
    sampling_strategy='auto',
    replacement=False,
    random_state=42
)

pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# ========== Metrics ==========
accuracy_list, precision_list, recall_list, f1_list, threshold_list = [], [], [], [], []

# ========== CV Loop ==========
for fold, (train_idx, val_idx) in enumerate(cv.split(X_top20, y), 1):
    X_train, X_val = X_top20.iloc[train_idx], X_top20.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_probs = pipeline.predict_proba(X_val)[:, 1]

    best = {'thresh': None, 'acc': 0, 'prec': 0, 'rec': 0, 'f1': 0}

    for thresh in thresholds:
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        if f1 > best['f1']:
            best = {'thresh': thresh, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1}

    accuracy_list.append(best['acc'])
    precision_list.append(best['prec'])
    recall_list.append(best['rec'])
    f1_list.append(best['f1'])
    threshold_list.append(best['thresh'])

    print(f"Fold {fold}: Best Threshold = {best['thresh']:.2f}, "
          f"Acc = {best['acc']:.4f}, Prec = {best['prec']:.4f}, "
          f"Rec = {best['rec']:.4f}, F1 = {best['f1']:.4f}")

# ========== Summary ==========
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)
mean_thresh = np.mean(threshold_list)

model_name = 'BaggingDT+Balanced+Top20+ThreshTuned'
model_desc = 'BaggingDT+Top20MI+ThreshSweep(0.05–0.6)'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Avg Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Avg Precision':<30}: {mean_prec:.4f}")
print(f"{'Avg Recall':<30}: {mean_rec:.4f}")
print(f"{'Avg F1 Score':<30}: {mean_f1:.4f}")
print(f"{'Avg Best Threshold':<30}: {mean_thresh:.2f}")

# ========== Save CSV ==========
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4),
    'Threshold': round(mean_thresh, 2)
}])

new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Best Threshold = 0.35, Acc = 0.5862, Prec = 0.2987, Rec = 0.7931, F1 = 0.4340
Fold 2: Best Threshold = 0.25, Acc = 0.5931, Prec = 0.3125, Rec = 0.8621, F1 = 0.4587
Fold 3: Best Threshold = 0.40, Acc = 0.5655, Prec = 0.2821, Rec = 0.7586, F1 = 0.4112
Fold 4: Best Threshold = 0.45, Acc = 0.5931, Prec = 0.3165, Rec = 0.8333, F1 = 0.4587
Fold 5: Best Threshold = 0.20, Acc = 0.4167, Prec = 0.2430, Rec = 0.8966, F1 = 0.3824

--- Model Summary ---
Name                          : BaggingDT+Balanced+Top20+ThreshTuned
Description                   : BaggingDT+Top20MI+ThreshSweep(0.05–0.6)
Avg Accuracy                  : 0.5509
Avg Precision                 : 0.2905
Avg Recall                    : 0.8287
Avg F1 Score                  : 0.4290
Avg Best Threshold            : 0.33


In [60]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.feature_selection import mutual_info_classif
import warnings

warnings.filterwarnings("ignore")

# ========== Data ==========
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Encode categorical features
cat_cols = X.select_dtypes(include='object').columns.tolist()
X_enc = pd.get_dummies(X, columns=cat_cols)

# Mutual Information for top 20 features
mi_scores = mutual_info_classif(X_enc, y, discrete_features='auto', random_state=42)
top20_features = pd.Series(mi_scores, index=X_enc.columns).sort_values(ascending=False).head(20).index.tolist()
X = X_enc[top20_features]

# Preprocessing
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), top20_features)
])

# Model
model = EasyEnsembleClassifier(random_state=42)

pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# CV + Threshold Sweep
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.linspace(0.05, 0.6, 12)

accuracy_list, precision_list, recall_list, f1_list, threshold_list = [], [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_probs = pipeline.predict_proba(X_val)[:, 1]

    best = {'thresh': None, 'acc': 0, 'prec': 0, 'rec': 0, 'f1': 0}

    for thresh in thresholds:
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        if f1 > best['f1']:
            best = {'thresh': thresh, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1}

    accuracy_list.append(best['acc'])
    precision_list.append(best['prec'])
    recall_list.append(best['rec'])
    f1_list.append(best['f1'])
    threshold_list.append(best['thresh'])

    print(f"Fold {fold}: Best Threshold = {best['thresh']:.2f}, Acc = {best['acc']:.4f}, "
          f"Prec = {best['prec']:.4f}, Rec = {best['rec']:.4f}, F1 = {best['f1']:.4f}")

# Summary
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)
mean_thresh = np.mean(threshold_list)

model_name = 'EasyEnsemble-Top20-ThresholdTuned'
model_desc = f'Top20MI+ThreshTuned({round(mean_thresh, 2)})'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Avg Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Avg Precision':<30}: {mean_prec:.4f}")
print(f"{'Avg Recall':<30}: {mean_rec:.4f}")
print(f"{'Avg F1 Score':<30}: {mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Best Threshold = 0.50, Acc = 0.6897, Prec = 0.3519, Rec = 0.6552, F1 = 0.4578
Fold 2: Best Threshold = 0.50, Acc = 0.7034, Prec = 0.3409, Rec = 0.5172, F1 = 0.4110
Fold 3: Best Threshold = 0.55, Acc = 0.7793, Prec = 0.4545, Rec = 0.5172, F1 = 0.4839
Fold 4: Best Threshold = 0.50, Acc = 0.6414, Prec = 0.3333, Rec = 0.7333, F1 = 0.4583
Fold 5: Best Threshold = 0.50, Acc = 0.6181, Prec = 0.2969, Rec = 0.6552, F1 = 0.4086

--- Model Summary ---
Name                          : EasyEnsemble-Top20-ThresholdTuned
Description                   : Top20MI+ThreshTuned(0.51)
Avg Accuracy                  : 0.6864
Avg Precision                 : 0.3555
Avg Recall                    : 0.6156
Avg F1 Score                  : 0.4439


In [62]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import mutual_info_classif
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.ensemble import EasyEnsembleClassifier

# ========== Input ==========

current_df = df_clean.copy()
X_raw = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

cat_cols = X_raw.select_dtypes(include='object').columns.tolist()
num_cols = X_raw.select_dtypes(include=['int64', 'float64']).columns.tolist()

# ========== One-Hot Encode Categorical ==========

X_encoded = pd.get_dummies(X_raw, columns=cat_cols)

# ========== Feature Selection: MI + Boruta ==========

# Mutual Information (Top 40)
mi_scores = mutual_info_classif(X_encoded, y, random_state=42)
mi_series = pd.Series(mi_scores, index=X_encoded.columns).sort_values(ascending=False)
top_mi_features = mi_series.head(40).index.tolist()
X_mi = X_encoded[top_mi_features]

# Boruta on top MI features
rf_for_boruta = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5, random_state=42)
boruta_selector = BorutaPy(rf_for_boruta, n_estimators='auto', random_state=42)
boruta_selector.fit(X_mi.values, y.values)

final_features = X_mi.columns[boruta_selector.support_].tolist()
X_final = X_mi[final_features]

# ========== Preprocessing ==========

preprocessor = ColumnTransformer([
    ("scale", StandardScaler(), final_features)
])

# ========== Model Pipeline ==========

model = EasyEnsembleClassifier(n_estimators=10, random_state=42)

pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("classifier", model)
])

# ========== CV Setup & Threshold Tuning ==========

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.linspace(0.1, 0.7, 13)

acc_list, prec_list, rec_list, f1_list, thresh_list = [], [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_final, y), 1):
    X_train, X_val = X_final.iloc[train_idx], X_final.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_prob = pipeline.predict_proba(X_val)[:, 1]

    best = {'f1': 0, 'thresh': 0.5, 'acc': 0, 'prec': 0, 'rec': 0}

    for t in thresholds:
        preds = (y_prob >= t).astype(int)
        acc = accuracy_score(y_val, preds)
        prec = precision_score(y_val, preds, zero_division=0)
        rec = recall_score(y_val, preds)
        f1 = f1_score(y_val, preds)

        if f1 > best['f1']:
            best = {'thresh': t, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1}

    print(f"Fold {fold}: Best Threshold = {best['thresh']:.2f}, Acc = {best['acc']:.4f}, "
          f"Prec = {best['prec']:.4f}, Rec = {best['rec']:.4f}, F1 = {best['f1']:.4f}")

    acc_list.append(best['acc'])
    prec_list.append(best['prec'])
    rec_list.append(best['rec'])
    f1_list.append(best['f1'])
    thresh_list.append(best['thresh'])

# ========== Summary ==========

model_name = "EEC-Boruta-MI40-ThresholdSweep"
model_desc = f"EasyEnsemble+BorutaMI40+ThreshSweep({round(np.mean(thresh_list), 2)})"

summary = {
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(np.mean(acc_list), 4),
    'Precision': round(np.mean(prec_list), 4),
    'Recall': round(np.mean(rec_list), 4),
    'F1 Score': round(np.mean(f1_list), 4),
}

print("\n--- Model Summary ---")
for k, v in summary.items():
    print(f"{k:<30}: {v}")

# ========== Save to CSV ==========

csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([summary])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Best Threshold = 0.50, Acc = 0.7310, Prec = 0.3864, Rec = 0.5862, F1 = 0.4658
Fold 2: Best Threshold = 0.50, Acc = 0.7310, Prec = 0.3750, Rec = 0.5172, F1 = 0.4348
Fold 3: Best Threshold = 0.50, Acc = 0.6828, Prec = 0.3651, Rec = 0.7931, F1 = 0.5000
Fold 4: Best Threshold = 0.50, Acc = 0.6483, Prec = 0.3521, Rec = 0.8333, F1 = 0.4950
Fold 5: Best Threshold = 0.50, Acc = 0.6111, Prec = 0.3043, Rec = 0.7241, F1 = 0.4286

--- Model Summary ---
Name                          : EEC-Boruta-MI40-ThresholdSweep
Desc                          : EasyEnsemble+BorutaMI40+ThreshSweep(0.5)
Accuracy                      : 0.6808
Precision                     : 0.3566
Recall                        : 0.6908
F1 Score                      : 0.4648


In [63]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import mutual_info_classif
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.pipeline import Pipeline as ImbPipeline

# ========== Input ==========

current_df = df_clean.copy()
X_raw = current_df.drop(columns=['Risk Flag'])
y = current_df['Risk Flag'].astype(int)

cat_cols = X_raw.select_dtypes(include='object').columns.tolist()
num_cols = X_raw.select_dtypes(include=['int64', 'float64']).columns.tolist()

# ========== One-Hot Encode ==========

X_encoded = pd.get_dummies(X_raw, columns=cat_cols)

# ========== MI + Boruta Feature Selection ==========

mi_scores = mutual_info_classif(X_encoded, y, random_state=42)
mi_series = pd.Series(mi_scores, index=X_encoded.columns).sort_values(ascending=False)
top_mi = mi_series.head(40).index.tolist()
X_mi = X_encoded[top_mi]

rf_boruta = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5, random_state=42)
boruta_selector = BorutaPy(rf_boruta, n_estimators='auto', random_state=42)
boruta_selector.fit(X_mi.values, y.values)

final_features = X_mi.columns[boruta_selector.support_].tolist()
X_final = X_mi[final_features]

# ========== Preprocessor ==========

preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), final_features)
])

# ========== Balanced Bagging + LightGBM ==========

base_lgbm = LGBMClassifier(
    learning_rate=0.05,
    n_estimators=100,
    max_depth=6,
    class_weight='balanced',
    random_state=42,
    verbosity=-1
)

bbc = BalancedBaggingClassifier(
    estimator=base_lgbm,
    n_estimators=10,
    max_samples=0.8,
    replacement=False,
    random_state=42
)

pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("classifier", bbc)
])

# ========== CV + Threshold Tuning ==========

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.linspace(0.1, 0.7, 13)

acc_list, prec_list, rec_list, f1_list, thresh_list = [], [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_final, y), 1):
    X_train, X_val = X_final.iloc[train_idx], X_final.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_prob = pipeline.predict_proba(X_val)[:, 1]

    best = {'f1': 0, 'thresh': 0.5, 'acc': 0, 'prec': 0, 'rec': 0}

    for t in thresholds:
        preds = (y_prob >= t).astype(int)
        acc = accuracy_score(y_val, preds)
        prec = precision_score(y_val, preds, zero_division=0)
        rec = recall_score(y_val, preds)
        f1 = f1_score(y_val, preds)

        if f1 > best['f1']:
            best = {'thresh': t, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1}

    print(f"Fold {fold}: Best Threshold = {best['thresh']:.2f}, Acc = {best['acc']:.4f}, "
          f"Prec = {best['prec']:.4f}, Rec = {best['rec']:.4f}, F1 = {best['f1']:.4f}")

    acc_list.append(best['acc'])
    prec_list.append(best['prec'])
    rec_list.append(best['rec'])
    f1_list.append(best['f1'])
    thresh_list.append(best['thresh'])

# ========== Summary ==========

model_name = "BalancedBagging-LGBM-BorutaMI"
model_desc = f"LGBM+BorutaMI+ThreshSweep({round(np.mean(thresh_list), 2)})"

summary = {
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(np.mean(acc_list), 4),
    'Precision': round(np.mean(prec_list), 4),
    'Recall': round(np.mean(rec_list), 4),
    'F1 Score': round(np.mean(f1_list), 4),
}

print("\n--- Model Summary ---")
for k, v in summary.items():
    print(f"{k:<30}: {v}")

# ========== Save to CSV ==========

csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([summary])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Best Threshold = 0.55, Acc = 0.7241, Prec = 0.3590, Rec = 0.4828, F1 = 0.4118
Fold 2: Best Threshold = 0.40, Acc = 0.6690, Prec = 0.3333, Rec = 0.6552, F1 = 0.4419
Fold 3: Best Threshold = 0.65, Acc = 0.7931, Prec = 0.4848, Rec = 0.5517, F1 = 0.5161
Fold 4: Best Threshold = 0.55, Acc = 0.7586, Prec = 0.4359, Rec = 0.5667, F1 = 0.4928
Fold 5: Best Threshold = 0.45, Acc = 0.6042, Prec = 0.3056, Rec = 0.7586, F1 = 0.4356

--- Model Summary ---
Name                          : BalancedBagging-LGBM-BorutaMI
Desc                          : LGBM+BorutaMI+ThreshSweep(0.52)
Accuracy                      : 0.7098
Precision                     : 0.3837
Recall                        : 0.603
F1 Score                      : 0.4596


In [65]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from imblearn.ensemble import EasyEnsembleClassifier, BalancedBaggingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ========== Data Setup ==========
current_df = df_clean.copy()
X = current_df.drop(columns=["Risk Flag"])
y = current_df["Risk Flag"].astype(int)

cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# One-hot encode categorical features
X_encoded = pd.get_dummies(X, columns=cat_cols)

# ========== Feature Selection (Boruta + MI fallback) ==========
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

mi_top = X_encoded.columns[np.argsort(mutual_info_classif(X_encoded, y))[-40:]].tolist()
rf_selector = RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=42)
boruta_selector = BorutaPy(rf_selector, n_estimators='auto', random_state=42)
boruta_selector.fit(X_encoded[mi_top].values, y.values)
selected_features = [feat for feat, keep in zip(mi_top, boruta_selector.support_) if keep]
X_selected = X_encoded[selected_features]

# ========== Base Models ==========
eec = EasyEnsembleClassifier(random_state=42)
balanced_lgbm = BalancedBaggingClassifier(
    estimator=LGBMClassifier(
        n_estimators=300, learning_rate=0.03,
        max_depth=6, colsample_bytree=0.8, min_child_samples=20, random_state=42
    ),
    n_estimators=10, random_state=42
)
xgb = XGBClassifier(
    n_estimators=200, max_depth=4, learning_rate=0.05,
    scale_pos_weight=4, use_label_encoder=False, eval_metric="logloss", random_state=42
)

# ========== Voting Ensemble ==========
ensemble = VotingClassifier(
    estimators=[
        ("balanced_lgbm", balanced_lgbm),
        ("eec", eec),
        ("xgb", xgb)
    ],
    voting="soft",
    weights=[3, 2, 1],
    n_jobs=-1
)

# ========== CV + Threshold Sweep ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresh_range = np.linspace(0.1, 0.7, 13)

acc_list, prec_list, rec_list, f1_list, best_thresh_list = [], [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_selected, y), 1):
    X_train, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    ensemble.fit(X_train, y_train)
    y_proba = ensemble.predict_proba(X_val)[:, 1]

    best = {"thresh": None, "acc": 0, "prec": 0, "rec": 0, "f1": 0}
    for thresh in thresh_range:
        preds = (y_proba >= thresh).astype(int)
        acc = accuracy_score(y_val, preds)
        prec = precision_score(y_val, preds, zero_division=0)
        rec = recall_score(y_val, preds)
        f1 = f1_score(y_val, preds)

        if f1 > best["f1"]:
            best = {"thresh": thresh, "acc": acc, "prec": prec, "rec": rec, "f1": f1}

    acc_list.append(best["acc"])
    prec_list.append(best["prec"])
    rec_list.append(best["rec"])
    f1_list.append(best["f1"])
    best_thresh_list.append(best["thresh"])

    print(f"Fold {fold}: Best Threshold = {best['thresh']:.2f}, Acc = {best['acc']:.4f}, "
          f"Prec = {best['prec']:.4f}, Rec = {best['rec']:.4f}, F1 = {best['f1']:.4f}")

# ========== Summary ==========
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)
mean_thresh = np.mean(best_thresh_list)

model_name = "VotingEnsemble-LGBM-EEC-XGB"
model_desc = f"WeightedSoftVoting[3,2,1]+ThreshSweep({mean_thresh:.2f})"

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Desc':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# ========== Save CSV ==========
result_path = "risk_model_metrics.csv"
row = pd.DataFrame([{
    "Name": model_name,
    "Desc": model_desc,
    "Accuracy": round(mean_acc, 4),
    "Precision": round(mean_prec, 4),
    "Recall": round(mean_rec, 4),
    "F1 Score": round(mean_f1, 4),
}])

row.to_csv(result_path, mode="a", index=False, header=not os.path.exists(result_path))


Fold 1: Best Threshold = 0.45, Acc = 0.6759, Prec = 0.3500, Rec = 0.7241, F1 = 0.4719
Fold 2: Best Threshold = 0.55, Acc = 0.7793, Prec = 0.4483, Rec = 0.4483, F1 = 0.4483
Fold 3: Best Threshold = 0.45, Acc = 0.6552, Prec = 0.3478, Rec = 0.8276, F1 = 0.4898
Fold 4: Best Threshold = 0.60, Acc = 0.7310, Prec = 0.4043, Rec = 0.6333, F1 = 0.4935
Fold 5: Best Threshold = 0.40, Acc = 0.5625, Prec = 0.2927, Rec = 0.8276, F1 = 0.4324

--- Model Summary ---
Name                          : VotingEnsemble-LGBM-EEC-XGB
Desc                          : WeightedSoftVoting[3,2,1]+ThreshSweep(0.49)
Accuracy                      : 0.6808
Precision                     : 0.3686
Recall                        : 0.6922
F1 Score                      : 0.4672


In [68]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from imblearn.ensemble import EasyEnsembleClassifier, BalancedBaggingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# ========== Load and Prepare Data ==========
current_df = df_clean.copy()
X = current_df.drop(columns=["Risk Flag"])
y = current_df["Risk Flag"].astype(int)

cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# One-hot encode
X_encoded = pd.get_dummies(X, columns=cat_cols)

# ========== Feature Selection ==========
mi_top = X_encoded.columns[np.argsort(mutual_info_classif(X_encoded, y))[-40:]].tolist()
rf_selector = RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=42)
boruta_selector = BorutaPy(rf_selector, n_estimators='auto', random_state=42, verbose=0)
boruta_selector.fit(X_encoded[mi_top].values, y.values)
selected_features = [feat for feat, keep in zip(mi_top, boruta_selector.support_) if keep]
X_selected = X_encoded[selected_features]

# ========== Define Base Models ==========
eec = EasyEnsembleClassifier(random_state=42)

balanced_lgbm = BalancedBaggingClassifier(
    estimator=LGBMClassifier(
        n_estimators=300, learning_rate=0.03,
        max_depth=6, colsample_bytree=0.8,
        min_child_samples=20, random_state=42
    ),
    n_estimators=10, random_state=42
)

xgb = XGBClassifier(
    n_estimators=200, max_depth=4, learning_rate=0.05,
    scale_pos_weight=4, use_label_encoder=False,
    eval_metric="logloss", random_state=42
)

# ========== Voting Ensemble ==========
ensemble = VotingClassifier(
    estimators=[
        ("balanced_lgbm", balanced_lgbm),
        ("eec", eec),
        ("xgb", xgb)
    ],
    voting="soft",
    weights=[3, 2, 1],
    n_jobs=-1
)

# ========== Cross-Validation and Threshold Sweep ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresh_range = np.linspace(0.1, 0.7, 13)

acc_list, prec_list, rec_list, f1_list, best_thresh_list = [], [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_selected, y), 1):
    X_train, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    ensemble.fit(X_train, y_train)
    y_proba = ensemble.predict_proba(X_val)[:, 1]

    best = {"thresh": None, "acc": 0, "prec": 0, "rec": 0, "f1": 0}
    for thresh in thresh_range:
        preds = (y_proba >= thresh).astype(int)
        acc = accuracy_score(y_val, preds)
        prec = precision_score(y_val, preds, zero_division=0)
        rec = recall_score(y_val, preds)
        f1 = f1_score(y_val, preds)

        if f1 > best["f1"]:
            best = {"thresh": thresh, "acc": acc, "prec": prec, "rec": rec, "f1": f1}

    acc_list.append(best["acc"])
    prec_list.append(best["prec"])
    rec_list.append(best["rec"])
    f1_list.append(best["f1"])
    best_thresh_list.append(best["thresh"])

    print(f"Fold {fold}: Best Threshold = {best['thresh']:.2f}, Acc = {best['acc']:.4f}, "
          f"Prec = {best['prec']:.4f}, Rec = {best['rec']:.4f}, F1 = {best['f1']:.4f}")

# ========== Summary ==========
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)
mean_thresh = np.mean(best_thresh_list)

model_name = "VotingEnsemble-LGBM-EEC-XGB"
model_desc = f"WeightedSoftVoting[3,2,1]+ThreshSweep({mean_thresh:.2f})"

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Desc':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# ========== Save to CSV ==========
result_path = "risk_model_metrics.csv"
row = pd.DataFrame([{
    "Name": model_name,
    "Desc": model_desc,
    "Accuracy": round(mean_acc, 4),
    "Precision": round(mean_prec, 4),
    "Recall": round(mean_rec, 4),
    "F1 Score": round(mean_f1, 4),
}])

# Append safely
if os.path.exists(result_path):
    existing = pd.read_csv(result_path)
    combined = pd.concat([existing, row], ignore_index=True)
    combined.drop_duplicates(subset=["Name", "Desc"], keep="last").to_csv(result_path, index=False)
else:
    row.to_csv(result_path, index=False)


Fold 1: Best Threshold = 0.35, Acc = 0.5931, Prec = 0.2973, Rec = 0.7586, F1 = 0.4272
Fold 2: Best Threshold = 0.45, Acc = 0.7034, Prec = 0.3542, Rec = 0.5862, F1 = 0.4416
Fold 3: Best Threshold = 0.50, Acc = 0.6966, Prec = 0.3684, Rec = 0.7241, F1 = 0.4884
Fold 4: Best Threshold = 0.45, Acc = 0.6207, Prec = 0.3377, Rec = 0.8667, F1 = 0.4860
Fold 5: Best Threshold = 0.35, Acc = 0.5278, Prec = 0.2857, Rec = 0.8966, F1 = 0.4333

--- Model Summary ---
Name                          : VotingEnsemble-LGBM-EEC-XGB
Desc                          : WeightedSoftVoting[3,2,1]+ThreshSweep(0.42)
Accuracy                      : 0.6283
Precision                     : 0.3287
Recall                        : 0.7664
F1 Score                      : 0.4553


In [69]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier, EasyEnsembleClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.feature_selection import mutual_info_classif

# ========== Data Setup ==========
current_df = df_clean.copy()
X = current_df.drop(columns=["Risk Flag"])
y = current_df["Risk Flag"].astype(int)

cat_cols = X.select_dtypes(include='object').columns.tolist()
X_encoded = pd.get_dummies(X, columns=cat_cols)

# ========== Feature Selection ==========
mi_top = X_encoded.columns[np.argsort(mutual_info_classif(X_encoded, y))[-40:]].tolist()
rf_selector = RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=42)
boruta_selector = BorutaPy(rf_selector, n_estimators='auto', random_state=42)
boruta_selector.fit(X_encoded[mi_top].values, y.values)
selected_features = [feat for feat, keep in zip(mi_top, boruta_selector.support_) if keep]
X_selected = X_encoded[selected_features]

# ========== Base Models ==========
catboost = CatBoostClassifier(depth=8, learning_rate=0.05, iterations=300, verbose=0, random_state=42)
eec = EasyEnsembleClassifier(n_estimators=10, random_state=42)
bbc_lgbm = BalancedBaggingClassifier(
    estimator=LGBMClassifier(n_estimators=300, learning_rate=0.03, max_depth=6,
                             colsample_bytree=0.8, min_child_samples=20, random_state=42),
    n_estimators=10, random_state=42
)

# ========== Meta Model ==========
meta_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# ========== Stacking Ensemble ==========
stacking_model = StackingClassifier(
    estimators=[
        ("catboost", catboost),
        ("eec", eec),
        ("bbc_lgbm", bbc_lgbm)
    ],
    final_estimator=meta_model,
    cv=5,
    stack_method='predict_proba',
    passthrough=True,
    n_jobs=-1
)

# ========== CV + Threshold Sweep ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresh_range = np.linspace(0.2, 0.7, 11)

acc_list, prec_list, rec_list, f1_list, best_thresh_list = [], [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_selected, y), 1):
    X_train, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    stacking_model.fit(X_train, y_train)
    y_proba = stacking_model.predict_proba(X_val)[:, 1]

    best = {"thresh": None, "acc": 0, "prec": 0, "rec": 0, "f1": 0}
    for thresh in thresh_range:
        preds = (y_proba >= thresh).astype(int)
        acc = accuracy_score(y_val, preds)
        prec = precision_score(y_val, preds, zero_division=0)
        rec = recall_score(y_val, preds)
        f1 = f1_score(y_val, preds)

        if f1 > best["f1"]:
            best = {"thresh": thresh, "acc": acc, "prec": prec, "rec": rec, "f1": f1}

    acc_list.append(best["acc"])
    prec_list.append(best["prec"])
    rec_list.append(best["rec"])
    f1_list.append(best["f1"])
    best_thresh_list.append(best["thresh"])

    print(f"Fold {fold}: Best Threshold = {best['thresh']:.2f}, Acc = {best['acc']:.4f}, "
          f"Prec = {best['prec']:.4f}, Rec = {best['rec']:.4f}, F1 = {best['f1']:.4f}")

# ========== Summary ==========
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)
mean_thresh = np.mean(best_thresh_list)

model_name = "Stacking-CatBBC-EEC"
model_desc = f"CatBoost+BBC-LGBM+EEC | Meta=LogReg | ThreshSweep({mean_thresh:.2f})"

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Desc':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# ========== Save CSV ==========
result_path = "risk_model_metrics.csv"
row = pd.DataFrame([{
    "Name": model_name,
    "Desc": model_desc,
    "Accuracy": round(mean_acc, 4),
    "Precision": round(mean_prec, 4),
    "Recall": round(mean_rec, 4),
    "F1 Score": round(mean_f1, 4),
}])

row.to_csv(result_path, mode="a", index=False, header=not os.path.exists(result_path))


Fold 1: Best Threshold = 0.50, Acc = 0.7103, Prec = 0.3556, Rec = 0.5517, F1 = 0.4324
Fold 2: Best Threshold = 0.45, Acc = 0.7034, Prec = 0.3542, Rec = 0.5862, F1 = 0.4416
Fold 3: Best Threshold = 0.55, Acc = 0.7241, Prec = 0.3922, Rec = 0.6897, F1 = 0.5000
Fold 4: Best Threshold = 0.55, Acc = 0.7103, Prec = 0.4000, Rec = 0.8000, F1 = 0.5333
Fold 5: Best Threshold = 0.40, Acc = 0.5764, Prec = 0.3095, Rec = 0.8966, F1 = 0.4602

--- Model Summary ---
Name                          : Stacking-CatBBC-EEC
Desc                          : CatBoost+BBC-LGBM+EEC | Meta=LogReg | ThreshSweep(0.49)
Accuracy                      : 0.6849
Precision                     : 0.3623
Recall                        : 0.7048
F1 Score                      : 0.4735


In [70]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingClassifier
from imblearn.ensemble import EasyEnsembleClassifier, BalancedBaggingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from boruta import BorutaPy

# ========== Data Setup ==========
current_df = df_clean.copy()
X = current_df.drop(columns=["Risk Flag"])
y = current_df["Risk Flag"].astype(int)

cat_cols = X.select_dtypes(include='object').columns.tolist()
X_encoded = pd.get_dummies(X, columns=cat_cols)
mi_top = X_encoded.columns[np.argsort(mutual_info_classif(X_encoded, y))[-40:]].tolist()

# Boruta selection on top 40 MI
rf_selector = RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=42)
boruta_selector = BorutaPy(rf_selector, n_estimators='auto', random_state=42)
boruta_selector.fit(X_encoded[mi_top].values, y.values)
selected_features = [feat for feat, keep in zip(mi_top, boruta_selector.support_) if keep]
X_selected = X_encoded[selected_features]

# ========== Base Models ==========
cat = CatBoostClassifier(
    iterations=300, depth=6, learning_rate=0.05,
    bagging_temperature=1.0, verbose=0, random_state=42
)
bbc = BalancedBaggingClassifier(
    estimator=LGBMClassifier(
        n_estimators=300, learning_rate=0.03,
        max_depth=6, colsample_bytree=0.8,
        min_child_samples=20, random_state=42
    ),
    n_estimators=10, random_state=42
)
eec = EasyEnsembleClassifier(random_state=42)

# ========== Stacking ==========
stack = StackingClassifier(
    estimators=[
        ("cat", cat),
        ("bbc", bbc),
        ("eec", eec)
    ],
    final_estimator=LogisticRegression(max_iter=500),
    stack_method="predict_proba",
    passthrough=False,
    n_jobs=-1
)

# ========== CV + Threshold Sweep ==========
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresh_range = np.linspace(0.3, 0.7, 9)

acc_list, prec_list, rec_list, f1_list, best_thresh_list = [], [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_selected, y), 1):
    X_train, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    stack.fit(X_train, y_train)
    y_proba = stack.predict_proba(X_val)[:, 1]

    best = {"thresh": None, "acc": 0, "prec": 0, "rec": 0, "f1": 0}
    for thresh in thresh_range:
        preds = (y_proba >= thresh).astype(int)
        acc = accuracy_score(y_val, preds)
        prec = precision_score(y_val, preds, zero_division=0)
        rec = recall_score(y_val, preds)
        f1 = f1_score(y_val, preds)

        if f1 > best["f1"]:
            best = {"thresh": thresh, "acc": acc, "prec": prec, "rec": rec, "f1": f1}

    acc_list.append(best["acc"])
    prec_list.append(best["prec"])
    rec_list.append(best["rec"])
    f1_list.append(best["f1"])
    best_thresh_list.append(best["thresh"])

    print(f"Fold {fold}: Best Threshold = {best['thresh']:.2f}, Acc = {best['acc']:.4f}, "
          f"Prec = {best['prec']:.4f}, Rec = {best['rec']:.4f}, F1 = {best['f1']:.4f}")

# ========== Summary ==========
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)
mean_thresh = np.mean(best_thresh_list)

model_name = "Stacking-CatBBC-EEC"
model_desc = f"CatBoost+BBC-LGBM+EEC | Meta=LogReg | ThreshSweep({mean_thresh:.2f})"

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Desc':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# ========== Save to CSV ==========
result_path = "risk_model_metrics.csv"
row = pd.DataFrame([{
    "Name": model_name,
    "Desc": model_desc,
    "Accuracy": round(mean_acc, 4),
    "Precision": round(mean_prec, 4),
    "Recall": round(mean_rec, 4),
    "F1 Score": round(mean_f1, 4),
}])
row.to_csv(result_path, mode="a", index=False, header=not os.path.exists(result_path))


Fold 1: Best Threshold = 0.30, Acc = 0.7241, Prec = 0.2800, Rec = 0.2414, F1 = 0.2593
Fold 2: Best Threshold = 0.30, Acc = 0.7862, Prec = 0.4500, Rec = 0.3103, F1 = 0.3673
Fold 3: Best Threshold = 0.30, Acc = 0.7379, Prec = 0.3902, Rec = 0.5517, F1 = 0.4571
Fold 4: Best Threshold = 0.30, Acc = 0.7379, Prec = 0.4048, Rec = 0.5667, F1 = 0.4722
Fold 5: Best Threshold = 0.30, Acc = 0.6597, Prec = 0.2727, Rec = 0.4138, F1 = 0.3288

--- Model Summary ---
Name                          : Stacking-CatBBC-EEC
Desc                          : CatBoost+BBC-LGBM+EEC | Meta=LogReg | ThreshSweep(0.30)
Accuracy                      : 0.7292
Precision                     : 0.3595
Recall                        : 0.4168
F1 Score                      : 0.3769
