In [1]:
import pandas as pd
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Ensure the path to the DEModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/de_model"))
from de_handler import DEModelHandler  

# Ensure the path to the FSDModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/fsd_model"))
from fsd_handler import FSDModelHandler  

# Ensure the path to the Math3ModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/math3_model"))
from math3_handler import Math3ModelHandler  

# Ensure the path to the PythonModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/python_model"))
from python_handler import PythonModelHandler  

df = pd.read_csv("../dataset/train_dataset.csv")

# Drop the irrelevant, data leak columns
df_clean = df.drop(
    columns=[
        "Student ID",
        "Mentor-1",
        "Mentor-2",
        "Mentor-3",
        "Roll-2",
        "Roll-3",
        "Math-3 Theory",
        "DE Practical",
        "FSD Theory",
        "FSD Practical",
        "Python Theory",
        "Python Practical",
        "Communication Theory",
        "Law Theory",
    ]
)

# columns for Semester 1 core subjects
sem1_columns = [
    "Math-1 Theory",
    "Physics Theory",
    "Java-1 Theory",
    "Software Engineering Theory",
]

# Calculate Semester 1 Percentage as the average of core subject scores
# scores are numerical and out of 100
df_clean["Sem 1 Percentage"] = df_clean[sem1_columns].mean(axis=1).round(2)

# columns for Semester 2 core subjects
sem2_columns = [
    "Math-2 Theory",
    "Data Structures using Java Theory",
    "DBMS Theory",
    "Fundamental of Electronics and Electrical Theory",
    "Java-2 Theory",
]

# Calculate Semester 2 Percentage as the average of core subject scores
# scores are numerical and out of 100
df_clean["Sem 2 Percentage"] = df_clean[sem2_columns].mean(axis=1).round(2)

# Rename columns Div-1, Div-2, Div-3 to Section-1, Section-2, Section-3
df_clean = df_clean.rename(
    columns={"Div-1": "Section-1", "Div-2": "Section-2", "Div-3": "Section-3"}
)

# Transform values in Section-1, Section-2, Section-3 to keep only the first character
# Thus we get Only Department
for section in ["Section-1", "Section-2", "Section-3"]:
    df_clean[section] = df_clean[section].str[0]

# adding DE predicted column
preprocessor = DEModelHandler()
fe_de = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/de_model/de_model.joblib",
    return_type="df"
)

# Add the predicted DE Theory marks to df_clean
df_clean["Predicted DE Theory"] = fe_de["Predicted DE Theory"]


# adding FSD predicted column
preprocessor = FSDModelHandler()
fe_fsd = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/fsd_model/fsd_model.joblib",
    return_type="df"
)

# Add the predicted FSD Theory marks to df_clean
df_clean["Predicted FSD Theory"] = fe_fsd["Predicted FSD Theory"]


# adding Math3 predicted column
preprocessor = Math3ModelHandler()
fe_math3 = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/math3_model/math3_model.joblib",
    return_type="df"
)

# Add the predicted Math3 Theory marks to df_clean
df_clean["Predicted Math-3 Theory"] = fe_math3["Predicted Math-3 Theory"]


# adding Python predicted column
preprocessor = PythonModelHandler()
fe_python = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/python_model/python_model.joblib",
    return_type="df"
)

# Add the predicted Python Theory marks to df_clean
df_clean["Predicted Python Theory"] = fe_python["Predicted Python Theory"]

#  Calculate predicted Semester 3 percentage (mean of 4 predicted subject marks)
sem3_subjects = [
    "Predicted Math-3 Theory",
    "Predicted DE Theory",
    "Predicted FSD Theory",
    "Predicted Python Theory",
]

df_clean["Predicted Sem 3 Percentage"] = df_clean[sem3_subjects].mean(axis=1).round(2)

df_clean["Sem 1 Percentile"] = df_clean["Sem 1 Percentage"].rank(pct=True) * 100
df_clean["Sem 2 Percentile"] = df_clean["Sem 2 Percentage"].rank(pct=True) * 100
df_clean["Predicted Sem 3 Percentile"] = df_clean["Predicted Sem 3 Percentage"].rank(pct=True) * 100

# Round for consistency
df_clean[["Sem 1 Percentile", "Sem 2 Percentile", "Predicted Sem 3 Percentile"]] = df_clean[
    ["Sem 1 Percentile", "Sem 2 Percentile", "Predicted Sem 3 Percentile"]
].round(2)

df_clean["Predicted Percentile Drop"] = (
    df_clean["Sem 2 Percentile"] - df_clean["Predicted Sem 3 Percentile"]
).round(2)

df_clean["Predicted Risk Flag"] = df_clean["Predicted Percentile Drop"] > 10

# Columns for Semester 3 core theory subjects
sem3_columns = [
    "Math-3 Theory",
    "DE Theory",
    "FSD Theory",
    "Python Theory",
]

# Calculate Semester 3 Total as the sum of core subject scores
df["Sem 3 Percentage"] = (df[sem3_columns].sum(axis=1) / 4).round(2)

df_clean["Sem 3 Percentile"] = df["Sem 3 Percentage"].rank(pct=True) * 100

df_clean["Percentile Drop"] = (
    df_clean["Sem 2 Percentile"] - df_clean["Sem 3 Percentile"]
).round(2)

df_clean["Risk Flag"] = df_clean["Percentile Drop"] > 10

columns_to_drop = [
    "Sem 3 Percentile",
    "Percentile Drop"
]

df_clean.drop(columns=columns_to_drop, inplace=True)

# After all operations on df_clean are complete, drop other DataFrames
df = None
fe_de = None
fe_fsd = None
fe_math3 = None
fe_python = None

print(df_clean.head())

  Gender Religion Branch Section-1 Section-2 Section-3  Roll-1  Math-1 Theory  \
0      M    Hindu     CE         D         D         A     350             47   
1      F    Hindu    CST         B         B         D      18             84   
2      F    Hindu   AIML         A         A         C      23             74   
3      M    Hindu    CST         B         B         D     212             55   
4      M    Hindu    CST         B         B         D     208             38   

   Physics Theory  Physics Practical  ...  Predicted FSD Theory  \
0              48                 75  ...             72.266535   
1              83                 81  ...             87.523458   
2              85                 86  ...             89.409752   
3              69                 82  ...             79.807055   
4              59                 74  ...             56.474296   

   Predicted Math-3 Theory  Predicted Python Theory  \
0                56.352210                71.642156   


# Dummy

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

# Target and features
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# DummyClassifier – always predicts the most frequent class
dummy = DummyClassifier(strategy='most_frequent')

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dummy.fit(X_train, y_train)
    y_pred = dummy.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'DummyClassifier-MostFreq'
model_desc = 'Baseline-MostFrequent-5Fold'

# Print formatted summary
print("\n--- Baseline Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV (append row, create file if not exists)
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to file (header only if file doesn't exist)
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 2: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 3: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 4: Accuracy=0.7931, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 5: Accuracy=0.7986, Precision=0.0000, Recall=0.0000, F1=0.0000

--- Baseline Model Summary ---
Name                          : DummyClassifier-MostFreq
Description                   : Baseline-MostFrequent-5Fold
Accuracy                      : 0.7983
Precision                     : 0.0000
Recall                        : 0.0000
F1 Score                      : 0.0000

CSV Row Format:
DummyClassifier-MostFreq,Baseline-MostFrequent-5Fold,0.7983,0.0000,0.0000,0.0000


# Logistic Regression

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)  # convert bool to 0/1

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Full pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LogisticRegression(class_weight='balanced', max_iter=1000))
])

# Step 5: 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Step 6: Loop through CV
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'LogisticRegression-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Print: CSV-style with labels and formatting
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])


Fold 1: Accuracy=0.8207, Precision=0.5385, Recall=0.7241, F1=0.6176
Fold 2: Accuracy=0.8552, Precision=0.6176, Recall=0.7241, F1=0.6667
Fold 3: Accuracy=0.7931, Precision=0.4889, Recall=0.7586, F1=0.5946
Fold 4: Accuracy=0.8069, Precision=0.5217, Recall=0.8000, F1=0.6316
Fold 5: Accuracy=0.8125, Precision=0.5250, Recall=0.7241, F1=0.6087

--- Average Metrics Summary ---
Name                          : LogisticRegression-Balanced
Description                   : OneHot+Scaler+5Fold-Stratified
Accuracy                      : 0.8177
Precision                     : 0.5383
Recall                        : 0.7462
F1 Score                      : 0.6238

CSV Row Format:
LogisticRegression-Balanced,OneHot+Scaler+5Fold-Stratified,0.8177,0.5383,0.7462,0.6238


# DecisionTreeClassifier

In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'DecisionTreeClassifier-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- DecisionTreeClassifier Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8000, Precision=0.5000,  Recall=0.4138, F1=0.4528
Fold 2: Accuracy=0.8414, Precision=0.7143,  Recall=0.3448, F1=0.4651
Fold 3: Accuracy=0.7517, Precision=0.4103,  Recall=0.5517, F1=0.4706
Fold 4: Accuracy=0.7862, Precision=0.4848,  Recall=0.5333, F1=0.5079
Fold 5: Accuracy=0.8403, Precision=0.6250,  Recall=0.5172, F1=0.5660

--- DecisionTreeClassifier Summary ---
Mean Accuracy : 0.8039
Mean Precision: 0.5469
Mean Recall   : 0.4722
Mean F1 Score : 0.4925

CSV Row Format:
DecisionTreeClassifier-Balanced,OneHot+Scaler+5Fold-Stratified,0.8039,0.5469,0.4722,0.4925


In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Recall-tuned DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    ))
])

# Step 5: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 6: Custom threshold
threshold = 0.35

# Step 7: Cross-validation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)

    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 8: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 9: Model info
model_name = 'DecisionTree-RecallTuned'
model_desc = f'Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold'

# Console output
print("\n--- DecisionTree_Recall_Tuned Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.6828, Precision=0.3455,  Recall=0.6552, F1=0.4524
Fold 2: Accuracy=0.8138, Precision=0.5357,  Recall=0.5172, F1=0.5263
Fold 3: Accuracy=0.7034, Precision=0.3750,  Recall=0.7241, F1=0.4941
Fold 4: Accuracy=0.7517, Precision=0.4348,  Recall=0.6667, F1=0.5263
Fold 5: Accuracy=0.7222, Precision=0.3878,  Recall=0.6552, F1=0.4872

--- DecisionTree_Recall_Tuned Summary ---
Mean Accuracy : 0.7348
Mean Precision: 0.4157
Mean Recall   : 0.6437
Mean F1 Score : 0.4973

CSV Row Format:
DecisionTree-RecallTuned,Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold,0.7348,0.4157,0.6437,0.4973


In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Flexible (deep) DecisionTreeClassifier
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=None,             # no limit
        min_samples_split=2,        # fine splits
        min_samples_leaf=1,         # small leaves allowed
        random_state=42
    ))
])

# Step 4: Cross-validation config
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 5: Threshold
threshold = 0.25  # aggressive threshold to maximize recall

# Step 6: CV loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Aggregate results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 8: Metadata
model_name = 'DecisionTree-MaxRecall'
model_desc = 'Depth=None|Split=2|Leaf=1|Thresh=0.25|5Fold'

print("\n--- DecisionTree_MaxRecall Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 9: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8000, Precision=0.5000,  Recall=0.4138, F1=0.4528
Fold 2: Accuracy=0.8414, Precision=0.7143,  Recall=0.3448, F1=0.4651
Fold 3: Accuracy=0.7517, Precision=0.4103,  Recall=0.5517, F1=0.4706
Fold 4: Accuracy=0.7862, Precision=0.4848,  Recall=0.5333, F1=0.5079
Fold 5: Accuracy=0.8403, Precision=0.6250,  Recall=0.5172, F1=0.5660

--- DecisionTree_MaxRecall Summary ---
Mean Accuracy : 0.8039
Mean Precision: 0.5469
Mean Recall   : 0.4722
Mean F1 Score : 0.4925

CSV Row Format:
DecisionTree-MaxRecall,Depth=None|Split=2|Leaf=1|Thresh=0.25|5Fold,0.8039,0.5469,0.4722,0.4925


In [7]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SMOTE setup
smote = SMOTE(random_state=42)

# Step 5: Recall-tuned DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    ))
])

# Step 6: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 7: Custom threshold
threshold = 0.35

# Step 8: Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Apply SMOTE to training data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed, y_train)

    # Fit the model on SMOTE-resampled data
    pipeline.named_steps['model'].fit(X_train_smote, y_train_smote)

    # Transform validation data
    X_val_preprocessed = preprocessor.transform(X_val)
    y_proba = pipeline.named_steps['model'].predict_proba(X_val_preprocessed)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 10: Model info
model_name = 'DecisionTree-SMOTE'
model_desc = f'Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold|SMOTE'

# Console output
print("\n--- DecisionTree_SMOTE Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 11: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7586, Precision=0.4400, Recall=0.7586, F1=0.5570
Fold 2: Accuracy=0.7517, Precision=0.4255, Recall=0.6897, F1=0.5263
Fold 3: Accuracy=0.7241, Precision=0.3922, Recall=0.6897, F1=0.5000
Fold 4: Accuracy=0.8069, Precision=0.5200, Recall=0.8667, F1=0.6500
Fold 5: Accuracy=0.7500, Precision=0.4340, Recall=0.7931, F1=0.5610

--- DecisionTree_SMOTE Summary ---
Mean Accuracy : 0.7583
Mean Precision: 0.4423
Mean Recall   : 0.7595
Mean F1 Score : 0.5589

CSV Row Format:
DecisionTree-SMOTE,Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold|SMOTE,0.7583,0.4423,0.7595,0.5589


In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SMOTE setup with adjusted sampling strategy
smote = SMOTE(sampling_strategy=0.8, random_state=42)

# Step 5: Recall-optimized DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=3,
        random_state=42
    ))
])

# Step 6: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 7: Lowered threshold for higher recall
threshold = 0.25

# Step 8: Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Apply SMOTE to training data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed, y_train)

    # Fit the model on SMOTE-resampled data
    pipeline.named_steps['model'].fit(X_train_smote, y_train_smote)

    # Transform validation data
    X_val_preprocessed = preprocessor.transform(X_val)
    y_proba = pipeline.named_steps['model'].predict_proba(X_val_preprocessed)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 10: Model info
model_name = 'DecisionTree-SMOTE-RecallOptimized'
model_desc = f'Thresh=0.25|Depth=6|Split=10|Leaf=3|5Fold|SMOTE_0.8'

# Console output
print("\n--- DecisionTree_SMOTE_RecallOptimized Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 11: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7931, Precision=0.4878, Recall=0.6897, F1=0.5714
Fold 2: Accuracy=0.7586, Precision=0.4167, Recall=0.5172, F1=0.4615
Fold 3: Accuracy=0.7103, Precision=0.3818, Recall=0.7241, F1=0.5000
Fold 4: Accuracy=0.7241, Precision=0.4107, Recall=0.7667, F1=0.5349
Fold 5: Accuracy=0.8125, Precision=0.5238, Recall=0.7586, F1=0.6197

--- DecisionTree_SMOTE_RecallOptimized Summary ---
Mean Accuracy : 0.7597
Mean Precision: 0.4442
Mean Recall   : 0.6913
Mean F1 Score : 0.5375

CSV Row Format:
DecisionTree-SMOTE-RecallOptimized,Thresh=0.25|Depth=6|Split=10|Leaf=3|5Fold|SMOTE_0.8,0.7597,0.4442,0.6913,0.5375


# RandomForestClassifier 

In [9]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline with Random Forest
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'RandomForestClassifier-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- RandomForestClassifier Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8138, Precision=0.7500,  Recall=0.1034, F1=0.1818
Fold 2: Accuracy=0.8276, Precision=0.8333,  Recall=0.1724, F1=0.2857
Fold 3: Accuracy=0.8345, Precision=0.8571,  Recall=0.2069, F1=0.3333
Fold 4: Accuracy=0.8345, Precision=0.8000,  Recall=0.2667, F1=0.4000
Fold 5: Accuracy=0.8403, Precision=0.8750,  Recall=0.2414, F1=0.3784

--- RandomForestClassifier Summary ---
Mean Accuracy : 0.8301
Mean Precision: 0.8231
Mean Recall   : 0.1982
Mean F1 Score : 0.3158

CSV Row Format:
RandomForestClassifier-Balanced,OneHot+Scaler+5Fold-Stratified,0.8301,0.8231,0.1982,0.3158


In [10]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)  # convert bool to 0/1

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Full pipeline with SMOTE + RandomForest
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight='balanced',
        random_state=42
    ))
])

# Step 5: 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

threshold = 0.3  # Custom threshold to maximize recall

# Step 6: Loop through CV
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]  # Get probability for class 1

    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'RandomForest-SMOTE-Threshold0.3'
model_desc = 'OneHot+Scaler+SMOTE+RF+Threshold=0.3'

# Print: CSV-style with labels and formatting
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])


Fold 1: Accuracy=0.7586, Precision=0.4348, Recall=0.6897, F1=0.5333
Fold 2: Accuracy=0.7724, Precision=0.4545, Recall=0.6897, F1=0.5479
Fold 3: Accuracy=0.6690, Precision=0.3582, Recall=0.8276, F1=0.5000
Fold 4: Accuracy=0.6552, Precision=0.3649, Recall=0.9000, F1=0.5192
Fold 5: Accuracy=0.6875, Precision=0.3621, Recall=0.7241, F1=0.4828

--- Average Metrics Summary ---
Name                          : RandomForest-SMOTE-Threshold0.3
Description                   : OneHot+Scaler+SMOTE+RF+Threshold=0.3
Accuracy                      : 0.7085
Precision                     : 0.3949
Recall                        : 0.7662
F1 Score                      : 0.5167

CSV Row Format:
RandomForest-SMOTE-Threshold0.3,OneHot+Scaler+SMOTE+RF+Threshold=0.3,0.7085,0.3949,0.7662,0.5167


# XGBoost

In [11]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: XGBoost Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        scale_pos_weight=np.sum(y == 0) / np.sum(y == 1),  # Handles class imbalance
        use_label_encoder=False,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'XGBoost-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8414, Precision=0.6500,  Recall=0.4483, F1=0.5306
Fold 2: Accuracy=0.8759, Precision=0.7619,  Recall=0.5517, F1=0.6400
Fold 3: Accuracy=0.8138, Precision=0.5278,  Recall=0.6552, F1=0.5846
Fold 4: Accuracy=0.8414, Precision=0.6061,  Recall=0.6667, F1=0.6349
Fold 5: Accuracy=0.8958, Precision=0.7917,  Recall=0.6552, F1=0.7170

--- XGBoost Summary ---
Mean Accuracy : 0.8536
Mean Precision: 0.6675
Mean Recall   : 0.5954
Mean F1 Score : 0.6214

CSV Row Format:
XGBoost-Balanced,OneHot+Scaler+5Fold-Stratified,0.8536,0.6675,0.5954,0.6214


In [12]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline with XGBoost + SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=4,  # 80:20 class balance
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    ))
])

# Step 5: Stratified 5-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 6: Metrics storage
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Threshold for classification
threshold = 0.25

# Step 7: CV loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 8: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'XGBoost-SMOTE-Threshold0.25'
model_desc = 'OneHot+Scaler+SMOTE+XGB+Threshold=0.25'

# Step 9: Print metrics
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])


Fold 1: Accuracy=0.8690, Precision=0.6250, Recall=0.8621, F1=0.7246
Fold 2: Accuracy=0.8621, Precision=0.6452, Recall=0.6897, F1=0.6667
Fold 3: Accuracy=0.7586, Precision=0.4400, Recall=0.7586, F1=0.5570
Fold 4: Accuracy=0.8345, Precision=0.5750, Recall=0.7667, F1=0.6571
Fold 5: Accuracy=0.8125, Precision=0.5238, Recall=0.7586, F1=0.6197

--- Average Metrics Summary ---
Name                          : XGBoost-SMOTE-Threshold0.25
Description                   : OneHot+Scaler+SMOTE+XGB+Threshold=0.25
Accuracy                      : 0.8273
Precision                     : 0.5618
Recall                        : 0.7671
F1 Score                      : 0.6450

CSV Row Format:
XGBoost-SMOTE-Threshold0.25,OneHot+Scaler+SMOTE+XGB+Threshold=0.25,0.8273,0.5618,0.7671,0.6450


In [13]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Pipeline with SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        random_state=42
    ))
])

# Step 4: Hyperparameter Optimization
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [4, 6, 8],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X, y)

# Step 5: Dynamic Threshold Tuning
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
thresholds = np.linspace(0.1, 0.9, 9)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    # Dynamic threshold tuning for recall
    recalls = [recall_score(y_val, (y_proba >= t).astype(int)) for t in thresholds]
    best_threshold = thresholds[np.argmax(recalls)]

    y_pred = (y_proba >= best_threshold).astype(int)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Best Threshold={best_threshold:.2f}")

# Step 6: Aggregate Results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Summary
model_name = 'XGBoost-SMOTE-FineTuned'
model_desc = f'OptimizedParams|DynamicThresh={best_threshold:.2f}|OneHot+Scaler'

print("\n--- Fine-Tuned XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.01, 'model__max_depth': 6, 'model__min_child_weight': 3, 'model__n_estimators': 300, 'model__subsample': 0.8}
Fold 1: Accuracy=0.6207, Precision=0.3415, Recall=0.9655, F1=0.5045, Best Threshold=0.10
Fold 2: Accuracy=0.6138, Precision=0.3333, Recall=0.9310, F1=0.4909, Best Threshold=0.10
Fold 3: Accuracy=0.5586, Precision=0.3034, Recall=0.9310, F1=0.4576, Best Threshold=0.10
Fold 4: Accuracy=0.5448, Precision=0.3125, Recall=1.0000, F1=0.4762, Best Threshold=0.10
Fold 5: Accuracy=0.5000, Precision=0.2828, Recall=0.9655, F1=0.4375, Best Threshold=0.10

--- Fine-Tuned XGBoost Summary ---
Mean Accuracy : 0.5676
Mean Precision: 0.3147
Mean Recall   : 0.9586
Mean F1 Score : 0.4733


In [14]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Pipeline with SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        random_state=42
    ))
])

# Step 4: Hyperparameter Optimization
param_grid = {
    'model__n_estimators': [200, 300, 400],
    'model__max_depth': [4, 6, 8],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
    'model__scale_pos_weight': [4, 5, 6]  # Adjusted for imbalance
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='balanced_accuracy',  # Balances true positive/negative rates
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X, y)

# Step 5: Dynamic Threshold Tuning
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
thresholds = np.linspace(0.1, 0.9, 9)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    # Dynamic threshold tuning for F1
    f1_scores = [f1_score(y_val, (y_proba >= t).astype(int)) for t in thresholds]
    best_threshold = thresholds[np.argmax(f1_scores)]

    y_pred = (y_proba >= best_threshold).astype(int)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Best Threshold={best_threshold:.2f}")

# Step 6: Aggregate Results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Summary
model_name = 'XGBoost-SMOTE-FineTuned-Balanced'
model_desc = f'OptimizedParams|DynamicThresh={best_threshold:.2f}|OneHot+Scaler'

print("\n--- Fine-Tuned XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.05, 'model__max_depth': 8, 'model__min_child_weight': 5, 'model__n_estimators': 300, 'model__scale_pos_weight': 4, 'model__subsample': 0.8}
Fold 1: Accuracy=0.9241, Precision=0.8462, Recall=0.7586, F1=0.8000, Best Threshold=0.60
Fold 2: Accuracy=0.8759, Precision=0.6897, Recall=0.6897, F1=0.6897, Best Threshold=0.40
Fold 3: Accuracy=0.8207, Precision=0.5405, Recall=0.6897, F1=0.6061, Best Threshold=0.60
Fold 4: Accuracy=0.8552, Precision=0.6452, Recall=0.6667, F1=0.6557, Best Threshold=0.70
Fold 5: Accuracy=0.8681, Precision=0.6786, Recall=0.6552, F1=0.6667, Best Threshold=0.70

--- Fine-Tuned XGBoost Summary ---
Mean Accuracy : 0.8688
Mean Precision: 0.6800
Mean Recall   : 0.6920
Mean F1 Score : 0.6836


# LightGBM

In [15]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: LightGBM Pipeline (optimized to suppress warnings)
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(
        objective='binary',
        class_weight='balanced',
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        min_split_gain=0.01,
        min_child_samples=20,
        min_data_in_leaf=20,
        subsample=0.8,
        colsample_bytree=0.8,
        verbose=-1,              # suppress LightGBM internal logs
        random_state=42
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'LightGBM-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified+VerboseOff'

# Console summary
print("\n--- LightGBM Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8828, Precision=0.8000,  Recall=0.5517, F1=0.6531
Fold 2: Accuracy=0.8690, Precision=0.7273,  Recall=0.5517, F1=0.6275
Fold 3: Accuracy=0.8138, Precision=0.5294,  Recall=0.6207, F1=0.5714
Fold 4: Accuracy=0.8483, Precision=0.6333,  Recall=0.6333, F1=0.6333
Fold 5: Accuracy=0.8681, Precision=0.6786,  Recall=0.6552, F1=0.6667

--- LightGBM Summary ---
Mean Accuracy : 0.8564
Mean Precision: 0.6737
Mean Recall   : 0.6025
Mean F1 Score : 0.6304

CSV Row Format:
LightGBM-Balanced,OneHot+Scaler+5Fold-Stratified+VerboseOff,0.8564,0.6737,0.6025,0.6304


In [16]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Integer, Real

# Load your real df_clean before this step
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(objective='binary', class_weight='balanced', verbose=-1, random_state=42))
])

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

param_space = {
    'model__n_estimators': Integer(100, 500),
    'model__max_depth': Integer(3, 12),
    'model__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'model__min_child_samples': Integer(10, 100),
    'model__min_split_gain': Real(0.0, 0.2),
    'model__subsample': Real(0.6, 1.0),
    'model__colsample_bytree': Real(0.6, 1.0)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring=scoring,
    refit='recall',
    n_iter=40,
    cv=cv,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

opt.fit(X, y)

best_model = opt.best_estimator_
cv_results = cross_validate(best_model, X, y, cv=cv, scoring=scoring)

print("Final Tuned LightGBM Model Scores:")
print(f"Accuracy : {np.mean(cv_results['test_accuracy']):.4f}")
print(f"Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"Recall   : {np.mean(cv_results['test_recall']):.4f}")
print(f"F1 Score : {np.mean(cv_results['test_f1']):.4f}")
print("Best Parameters:", opt.best_params_)


Final Tuned LightGBM Model Scores:
Accuracy : 0.7459
Precision: 0.4328
Recall   : 0.8283
F1 Score : 0.5676
Best Parameters: OrderedDict({'model__colsample_bytree': 1.0, 'model__learning_rate': 0.01, 'model__max_depth': 12, 'model__min_child_samples': 100, 'model__min_split_gain': 0.2, 'model__n_estimators': 100, 'model__subsample': 0.8664183933116096})


In [19]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    precision_recall_curve
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: LightGBM Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(
        objective='binary',
        class_weight='balanced',
        learning_rate=0.01,
        max_depth=12,
        min_child_samples=100,
        min_split_gain=0.2,
        n_estimators=100,
        subsample=0.8664,
        colsample_bytree=1.0,
        verbose=-1,
        random_state=42
    ))
])

# Step 5: Cross-validation predictions
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_probs = cross_val_predict(pipeline, X, y, cv=cv, method='predict_proba')[:, 1]
y_true = y.copy()  # true labels for all folds

# Step 6: Find optimal threshold
precisions, recalls, thresholds = precision_recall_curve(y_true, y_probs)
valid = [(p, r, t) for p, r, t in zip(precisions, recalls, thresholds) if r >= 0.85 and p > 0.50]

if valid:
    best_prec, best_rec, best_thresh = max(valid, key=lambda x: 2*x[0]*x[1]/(x[0]+x[1]))
else:
    best_thresh = 0.5  # fallback
    best_prec = precision_score(y_true, y_probs >= best_thresh, zero_division=0)
    best_rec = recall_score(y_true, y_probs >= best_thresh)
    best_f1 = f1_score(y_true, y_probs >= best_thresh)
    print("No threshold met all conditions. Using default 0.5.")

# Step 7: Final metrics at optimal threshold
y_pred_final = (y_probs >= best_thresh).astype(int)
final_acc = accuracy_score(y_true, y_pred_final)
final_prec = precision_score(y_true, y_pred_final, zero_division=0)
final_rec = recall_score(y_true, y_pred_final)
final_f1 = f1_score(y_true, y_pred_final)

# Step 8: Print results
print("\n--- Threshold-Tuned LightGBM Results ---")
print(f"Threshold   : {best_thresh:.4f}")
print(f"Accuracy    : {final_acc:.4f}")
print(f"Precision   : {final_prec:.4f}")
print(f"Recall      : {final_rec:.4f}")
print(f"F1 Score    : {final_f1:.4f}")

# Step 9: Save to CSV
model_name = 'LightGBM-Tuned-Threshold'
model_desc = 'BayesOpt+Threshold@{:.4f}'.format(best_thresh)
csv_file = "risk_model_metrics.csv"

result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(final_acc, 4),
    'Precision': round(final_prec, 4),
    'Recall': round(final_rec, 4),
    'F1 Score': round(final_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


No threshold met all conditions. Using default 0.5.

--- Threshold-Tuned LightGBM Results ---
Threshold   : 0.5000
Accuracy    : 0.7459
Precision   : 0.4321
Recall      : 0.8288
F1 Score    : 0.5681


In [20]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: LightGBM Pipeline with tuned parameters
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(
        objective='binary',
        class_weight='balanced',
        n_estimators=200,           # Increased to allow more learning
        max_depth=8,                # Slightly deeper trees
        learning_rate=0.05,         # Lower for better convergence
        min_split_gain=0.01,
        min_child_samples=10,       # Lowered to capture smaller patterns
        min_data_in_leaf=10,        # Lowered to reduce overfitting
        subsample=0.8,
        colsample_bytree=0.7,      # Slightly reduced to increase diversity
        scale_pos_weight=3,         # Increase to prioritize positive class (tune based on imbalance)
        verbose=-1,
        random_state=42
    ))
])

# Step 5: Cross-validation with threshold tuning
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Get probability scores for threshold tuning
    y_prob = pipeline.predict_proba(X_val)[:, 1]

    # Find optimal threshold for recall >= 0.85
    precisions, recalls, thresholds = precision_recall_curve(y_val, y_prob)
    threshold = thresholds[np.argmax(recalls >= 0.85)] if np.any(recalls >= 0.85) else 0.5

    # Apply threshold to predictions
    y_pred = (y_prob >= threshold).astype(int)

    # Calculate metrics
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Threshold={threshold:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'LightGBM-Tuned-HighRecall'
model_desc = 'OneHot+Scaler+5Fold-Stratified+ThresholdTuned+VerboseOff'

# Console summary
print("\n--- LightGBM Tuned Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Threshold=0.0000
Fold 2: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Threshold=0.0000
Fold 3: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Threshold=0.0000
Fold 4: Accuracy=0.2069, Precision=0.2069, Recall=1.0000, F1=0.3429, Threshold=0.0001
Fold 5: Accuracy=0.2014, Precision=0.2014, Recall=1.0000, F1=0.3353, Threshold=0.0000

--- LightGBM Tuned Summary ---
Mean Accuracy : 0.2017
Mean Precision: 0.2017
Mean Recall   : 1.0000
Mean F1 Score : 0.3356

CSV Row Format:
LightGBM-Tuned-HighRecall,OneHot+Scaler+5Fold-Stratified+ThresholdTuned+VerboseOff,0.2017,0.2017,1.0000,0.3356


In [21]:
import os
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Classifier wrapper for threshold tuning
class ThresholdLGBMClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, **params):
        self.model = LGBMClassifier(**params)
        self.threshold = 0.5

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        probas = self.model.predict_proba(X)[:, 1]
        return (probas >= self.threshold).astype(int)

    def predict_proba(self, X):
        return self.model.predict_proba(X)

# Step 5: Objective function for Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 80),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "class_weight": "balanced",
        "random_state": 42,
        "verbose": -1
    }

    model = ThresholdLGBMClassifier(**params)
    pipeline = Pipeline([('prep', preprocessor), ('clf', model)])
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    recalls = []

    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train, y_train)
        probas = pipeline.predict_proba(X_val)[:, 1]

        # Find best threshold to maximize recall >= 0.85
        best_recall, best_thresh = 0, 0.5
        for thresh in np.arange(0.3, 0.8, 0.02):
            preds = (probas >= thresh).astype(int)
            rec = recall_score(y_val, preds)
            if rec > best_recall:
                best_recall, best_thresh = rec, thresh

        model.threshold = best_thresh
        preds = (probas >= best_thresh).astype(int)

        recalls.append(recall_score(y_val, preds))

    return np.mean(recalls)

# Step 6: Tune with Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)
best_params = study.best_trial.params

# Step 7: Final Evaluation
model = ThresholdLGBMClassifier(**best_params)
pipeline = Pipeline([('prep', preprocessor), ('clf', model)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    probas = pipeline.predict_proba(X_val)[:, 1]

    # Best threshold for this fold
    best_thresh, best_f1 = 0.5, 0
    for thresh in np.arange(0.3, 0.8, 0.01):
        preds = (probas >= thresh).astype(int)
        rec = recall_score(y_val, preds)
        prec = precision_score(y_val, preds, zero_division=0)
        f1_val = f1_score(y_val, preds)
        if rec >= 0.85 and prec > 0.5 and f1_val > best_f1:
            best_f1, best_thresh = f1_val, thresh

    model.threshold = best_thresh
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Threshold={best_thresh:.2f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

print("\n--- Final LightGBM Optimized ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# CSV logging
model_name = 'LightGBM-Optuna-Threshold'
model_desc = 'Optuna+ThresholdTuning+5Fold'
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


[I 2025-06-30 18:56:04,711] A new study created in memory with name: no-name-ea097395-081d-45b5-9c72-69a950a23895
Best trial: 0. Best value: 0.623218:   2%|▏         | 1/50 [00:01<01:02,  1.27s/it]

[I 2025-06-30 18:56:05,984] Trial 0 finished with value: 0.6232183908045976 and parameters: {'n_estimators': 274, 'learning_rate': 0.12943432035516703, 'max_depth': 7, 'num_leaves': 37, 'min_child_samples': 41, 'subsample': 0.7726104962620636, 'colsample_bytree': 0.7055135162673605}. Best is trial 0 with value: 0.6232183908045976.


Best trial: 0. Best value: 0.623218:   4%|▍         | 2/50 [00:02<01:02,  1.31s/it]

[I 2025-06-30 18:56:07,324] Trial 1 finished with value: 0.6022988505747126 and parameters: {'n_estimators': 171, 'learning_rate': 0.1254065378413808, 'max_depth': 7, 'num_leaves': 64, 'min_child_samples': 14, 'subsample': 0.8845647153000895, 'colsample_bytree': 0.63654892338312}. Best is trial 0 with value: 0.6232183908045976.


Best trial: 2. Best value: 0.704828:   6%|▌         | 3/50 [00:03<00:49,  1.06s/it]

[I 2025-06-30 18:56:08,079] Trial 2 finished with value: 0.7048275862068966 and parameters: {'n_estimators': 119, 'learning_rate': 0.141743005637849, 'max_depth': 3, 'num_leaves': 41, 'min_child_samples': 34, 'subsample': 0.8528944368742704, 'colsample_bytree': 0.7339201716544633}. Best is trial 2 with value: 0.7048275862068966.


Best trial: 2. Best value: 0.704828:   8%|▊         | 4/50 [00:04<00:51,  1.13s/it]

[I 2025-06-30 18:56:09,316] Trial 3 finished with value: 0.636551724137931 and parameters: {'n_estimators': 120, 'learning_rate': 0.08963186433686471, 'max_depth': 9, 'num_leaves': 64, 'min_child_samples': 16, 'subsample': 0.9295288542951307, 'colsample_bytree': 0.628843182173922}. Best is trial 2 with value: 0.7048275862068966.


Best trial: 2. Best value: 0.704828:  10%|█         | 5/50 [00:05<00:50,  1.12s/it]

[I 2025-06-30 18:56:10,427] Trial 4 finished with value: 0.6572413793103448 and parameters: {'n_estimators': 187, 'learning_rate': 0.09697859288208036, 'max_depth': 10, 'num_leaves': 42, 'min_child_samples': 45, 'subsample': 0.6752632648140585, 'colsample_bytree': 0.9556743770005429}. Best is trial 2 with value: 0.7048275862068966.


Best trial: 5. Best value: 0.910575:  12%|█▏        | 6/50 [00:06<00:44,  1.02s/it]

[I 2025-06-30 18:56:11,253] Trial 5 finished with value: 0.9105747126436782 and parameters: {'n_estimators': 128, 'learning_rate': 0.014554857766166909, 'max_depth': 4, 'num_leaves': 69, 'min_child_samples': 32, 'subsample': 0.8104465472368176, 'colsample_bytree': 0.6276939094146502}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  14%|█▍        | 7/50 [00:08<00:58,  1.36s/it]

[I 2025-06-30 18:56:13,319] Trial 6 finished with value: 0.5882758620689655 and parameters: {'n_estimators': 250, 'learning_rate': 0.06532001433077816, 'max_depth': 9, 'num_leaves': 41, 'min_child_samples': 13, 'subsample': 0.9010052104549857, 'colsample_bytree': 0.7220910479151434}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  16%|█▌        | 8/50 [00:09<00:50,  1.20s/it]

[I 2025-06-30 18:56:14,168] Trial 7 finished with value: 0.7597701149425288 and parameters: {'n_estimators': 119, 'learning_rate': 0.06904038886159725, 'max_depth': 4, 'num_leaves': 60, 'min_child_samples': 45, 'subsample': 0.9806880464832293, 'colsample_bytree': 0.9302937671632896}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  18%|█▊        | 9/50 [00:10<00:45,  1.12s/it]

[I 2025-06-30 18:56:15,105] Trial 8 finished with value: 0.7459770114942529 and parameters: {'n_estimators': 121, 'learning_rate': 0.0416616521060133, 'max_depth': 12, 'num_leaves': 31, 'min_child_samples': 34, 'subsample': 0.6016043891441043, 'colsample_bytree': 0.8754014143882602}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  20%|██        | 10/50 [00:11<00:44,  1.11s/it]

[I 2025-06-30 18:56:16,214] Trial 9 finished with value: 0.6505747126436782 and parameters: {'n_estimators': 161, 'learning_rate': 0.10421615080365149, 'max_depth': 8, 'num_leaves': 55, 'min_child_samples': 34, 'subsample': 0.7257048544526679, 'colsample_bytree': 0.8320440023185659}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  22%|██▏       | 11/50 [00:12<00:46,  1.18s/it]

[I 2025-06-30 18:56:17,558] Trial 10 finished with value: 0.8691954022988506 and parameters: {'n_estimators': 224, 'learning_rate': 0.011201939058785387, 'max_depth': 5, 'num_leaves': 80, 'min_child_samples': 24, 'subsample': 0.8090623131897657, 'colsample_bytree': 0.6071485725444605}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  24%|██▍       | 12/50 [00:13<00:44,  1.16s/it]

[I 2025-06-30 18:56:18,661] Trial 11 finished with value: 0.8485057471264369 and parameters: {'n_estimators': 232, 'learning_rate': 0.013143212494575757, 'max_depth': 5, 'num_leaves': 80, 'min_child_samples': 24, 'subsample': 0.8060618220680413, 'colsample_bytree': 0.6072039811138609}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  26%|██▌       | 13/50 [00:15<00:43,  1.18s/it]

[I 2025-06-30 18:56:19,871] Trial 12 finished with value: 0.8075862068965517 and parameters: {'n_estimators': 226, 'learning_rate': 0.015439095116811578, 'max_depth': 5, 'num_leaves': 79, 'min_child_samples': 26, 'subsample': 0.8112480561603898, 'colsample_bytree': 0.6696315551757601}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  28%|██▊       | 14/50 [00:16<00:42,  1.19s/it]

[I 2025-06-30 18:56:21,085] Trial 13 finished with value: 0.6910344827586207 and parameters: {'n_estimators': 203, 'learning_rate': 0.03917044866376569, 'max_depth': 5, 'num_leaves': 72, 'min_child_samples': 21, 'subsample': 0.7440027187309903, 'colsample_bytree': 0.7672579188009249}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  30%|███       | 15/50 [00:17<00:41,  1.19s/it]

[I 2025-06-30 18:56:22,277] Trial 14 finished with value: 0.7735632183908047 and parameters: {'n_estimators': 299, 'learning_rate': 0.04356326461928324, 'max_depth': 3, 'num_leaves': 73, 'min_child_samples': 29, 'subsample': 0.8379346336390906, 'colsample_bytree': 0.670636785008359}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  32%|███▏      | 16/50 [00:18<00:40,  1.20s/it]

[I 2025-06-30 18:56:23,510] Trial 15 finished with value: 0.7255172413793105 and parameters: {'n_estimators': 152, 'learning_rate': 0.027245410807788507, 'max_depth': 6, 'num_leaves': 70, 'min_child_samples': 20, 'subsample': 0.6775109406725263, 'colsample_bytree': 0.806913433609824}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  34%|███▍      | 17/50 [00:19<00:37,  1.13s/it]

[I 2025-06-30 18:56:24,461] Trial 16 finished with value: 0.7186206896551723 and parameters: {'n_estimators': 212, 'learning_rate': 0.06011534159188067, 'max_depth': 4, 'num_leaves': 22, 'min_child_samples': 38, 'subsample': 0.9646574993772924, 'colsample_bytree': 0.6095221727731172}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  36%|███▌      | 18/50 [00:20<00:35,  1.12s/it]

[I 2025-06-30 18:56:25,560] Trial 17 finished with value: 0.9105747126436782 and parameters: {'n_estimators': 144, 'learning_rate': 0.010905939425416952, 'max_depth': 6, 'num_leaves': 54, 'min_child_samples': 29, 'subsample': 0.7634433117944978, 'colsample_bytree': 0.6729903841902101}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  38%|███▊      | 19/50 [00:21<00:32,  1.05s/it]

[I 2025-06-30 18:56:26,460] Trial 18 finished with value: 0.8213793103448275 and parameters: {'n_estimators': 145, 'learning_rate': 0.03347137487598999, 'max_depth': 6, 'num_leaves': 48, 'min_child_samples': 49, 'subsample': 0.7035817205778234, 'colsample_bytree': 0.6814603886107802}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  40%|████      | 20/50 [00:22<00:30,  1.02s/it]

[I 2025-06-30 18:56:27,410] Trial 19 finished with value: 0.8349425287356322 and parameters: {'n_estimators': 103, 'learning_rate': 0.02523955604738498, 'max_depth': 12, 'num_leaves': 55, 'min_child_samples': 30, 'subsample': 0.6152396279457476, 'colsample_bytree': 0.7547015858834994}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  42%|████▏     | 21/50 [00:23<00:28,  1.01it/s]

[I 2025-06-30 18:56:28,324] Trial 20 finished with value: 0.7117241379310345 and parameters: {'n_estimators': 136, 'learning_rate': 0.0526929272357087, 'max_depth': 6, 'num_leaves': 49, 'min_child_samples': 39, 'subsample': 0.7639552521801275, 'colsample_bytree': 0.8662991916920607}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  44%|████▍     | 22/50 [00:24<00:28,  1.03s/it]

[I 2025-06-30 18:56:29,442] Trial 21 finished with value: 0.9036781609195403 and parameters: {'n_estimators': 180, 'learning_rate': 0.012343831018930121, 'max_depth': 4, 'num_leaves': 75, 'min_child_samples': 29, 'subsample': 0.7946188126589926, 'colsample_bytree': 0.6512200536060809}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 5. Best value: 0.910575:  46%|████▌     | 23/50 [00:25<00:27,  1.04s/it]

[I 2025-06-30 18:56:30,500] Trial 22 finished with value: 0.8078160919540229 and parameters: {'n_estimators': 187, 'learning_rate': 0.02332212383044825, 'max_depth': 4, 'num_leaves': 67, 'min_child_samples': 28, 'subsample': 0.8556817929639109, 'colsample_bytree': 0.6738934289164392}. Best is trial 5 with value: 0.9105747126436782.


Best trial: 23. Best value: 0.931034:  48%|████▊     | 24/50 [00:26<00:26,  1.00s/it]

[I 2025-06-30 18:56:31,418] Trial 23 finished with value: 0.9310344827586207 and parameters: {'n_estimators': 176, 'learning_rate': 0.010870704349832255, 'max_depth': 3, 'num_leaves': 59, 'min_child_samples': 33, 'subsample': 0.7805541334893635, 'colsample_bytree': 0.6435270716715916}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  50%|█████     | 25/50 [00:27<00:23,  1.05it/s]

[I 2025-06-30 18:56:32,243] Trial 24 finished with value: 0.9036781609195403 and parameters: {'n_estimators': 137, 'learning_rate': 0.026108261769540238, 'max_depth': 3, 'num_leaves': 57, 'min_child_samples': 32, 'subsample': 0.765319488074004, 'colsample_bytree': 0.698127858617611}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  52%|█████▏    | 26/50 [00:28<00:22,  1.05it/s]

[I 2025-06-30 18:56:33,195] Trial 25 finished with value: 0.8351724137931035 and parameters: {'n_estimators': 164, 'learning_rate': 0.0497978719627479, 'max_depth': 3, 'num_leaves': 62, 'min_child_samples': 37, 'subsample': 0.7208371092888796, 'colsample_bytree': 0.6440497998361633}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  54%|█████▍    | 27/50 [00:29<00:21,  1.08it/s]

[I 2025-06-30 18:56:34,071] Trial 26 finished with value: 0.7114942528735633 and parameters: {'n_estimators': 102, 'learning_rate': 0.0741146079916432, 'max_depth': 6, 'num_leaves': 49, 'min_child_samples': 32, 'subsample': 0.6890942327754204, 'colsample_bytree': 0.7579351647160979}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  56%|█████▌    | 28/50 [00:30<00:20,  1.06it/s]

[I 2025-06-30 18:56:35,052] Trial 27 finished with value: 0.7806896551724136 and parameters: {'n_estimators': 153, 'learning_rate': 0.035441953085649645, 'max_depth': 4, 'num_leaves': 53, 'min_child_samples': 19, 'subsample': 0.8905482421508466, 'colsample_bytree': 0.7089151729599528}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  58%|█████▊    | 29/50 [00:31<00:19,  1.05it/s]

[I 2025-06-30 18:56:36,017] Trial 28 finished with value: 0.8694252873563219 and parameters: {'n_estimators': 139, 'learning_rate': 0.021065033356895514, 'max_depth': 7, 'num_leaves': 68, 'min_child_samples': 42, 'subsample': 0.6501159739761548, 'colsample_bytree': 0.7883728493094412}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  60%|██████    | 30/50 [00:33<00:24,  1.21s/it]

[I 2025-06-30 18:56:37,833] Trial 29 finished with value: 0.5680459770114943 and parameters: {'n_estimators': 179, 'learning_rate': 0.11680807123692225, 'max_depth': 8, 'num_leaves': 59, 'min_child_samples': 10, 'subsample': 0.7833757702159005, 'colsample_bytree': 0.6981586603311583}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  62%|██████▏   | 31/50 [00:34<00:21,  1.14s/it]

[I 2025-06-30 18:56:38,819] Trial 30 finished with value: 0.7117241379310345 and parameters: {'n_estimators': 131, 'learning_rate': 0.05344199981554171, 'max_depth': 5, 'num_leaves': 33, 'min_child_samples': 26, 'subsample': 0.8309190681945167, 'colsample_bytree': 0.6484034545168335}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  64%|██████▍   | 32/50 [00:35<00:19,  1.09s/it]

[I 2025-06-30 18:56:39,784] Trial 31 finished with value: 0.8967816091954024 and parameters: {'n_estimators': 196, 'learning_rate': 0.011533010280979535, 'max_depth': 4, 'num_leaves': 77, 'min_child_samples': 32, 'subsample': 0.7825695252588017, 'colsample_bytree': 0.6511430490833743}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  66%|██████▌   | 33/50 [00:36<00:17,  1.06s/it]

[I 2025-06-30 18:56:40,765] Trial 32 finished with value: 0.8760919540229886 and parameters: {'n_estimators': 174, 'learning_rate': 0.03153850523777705, 'max_depth': 3, 'num_leaves': 75, 'min_child_samples': 36, 'subsample': 0.7454515694643884, 'colsample_bytree': 0.6269770767012512}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  68%|██████▊   | 34/50 [00:37<00:16,  1.06s/it]

[I 2025-06-30 18:56:41,831] Trial 33 finished with value: 0.8691954022988506 and parameters: {'n_estimators': 166, 'learning_rate': 0.019040589846820295, 'max_depth': 4, 'num_leaves': 66, 'min_child_samples': 27, 'subsample': 0.8628459544946647, 'colsample_bytree': 0.6489876127375321}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  70%|███████   | 35/50 [00:37<00:14,  1.01it/s]

[I 2025-06-30 18:56:42,660] Trial 34 finished with value: 0.6914942528735633 and parameters: {'n_estimators': 153, 'learning_rate': 0.14181392687047628, 'max_depth': 3, 'num_leaves': 62, 'min_child_samples': 30, 'subsample': 0.8303483916332124, 'colsample_bytree': 0.7186606693901026}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  72%|███████▏  | 36/50 [00:39<00:14,  1.02s/it]

[I 2025-06-30 18:56:43,733] Trial 35 finished with value: 0.8416091954022988 and parameters: {'n_estimators': 177, 'learning_rate': 0.01866674161737824, 'max_depth': 7, 'num_leaves': 70, 'min_child_samples': 41, 'subsample': 0.7936772287730781, 'colsample_bytree': 0.6297187184370182}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  74%|███████▍  | 37/50 [00:40<00:14,  1.09s/it]

[I 2025-06-30 18:56:45,007] Trial 36 finished with value: 0.8829885057471264 and parameters: {'n_estimators': 198, 'learning_rate': 0.010018954167034935, 'max_depth': 5, 'num_leaves': 46, 'min_child_samples': 22, 'subsample': 0.7493599852759565, 'colsample_bytree': 0.7315184020792771}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  76%|███████▌  | 38/50 [00:41<00:12,  1.08s/it]

[I 2025-06-30 18:56:46,066] Trial 37 finished with value: 0.7324137931034483 and parameters: {'n_estimators': 126, 'learning_rate': 0.08304270090690188, 'max_depth': 4, 'num_leaves': 64, 'min_child_samples': 36, 'subsample': 0.7234910241654162, 'colsample_bytree': 0.6865955283890041}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  78%|███████▊  | 39/50 [00:42<00:12,  1.14s/it]

[I 2025-06-30 18:56:47,352] Trial 38 finished with value: 0.670344827586207 and parameters: {'n_estimators': 188, 'learning_rate': 0.031551209721302605, 'max_depth': 11, 'num_leaves': 75, 'min_child_samples': 24, 'subsample': 0.765688210780348, 'colsample_bytree': 0.6007297361764834}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  80%|████████  | 40/50 [00:43<00:10,  1.06s/it]

[I 2025-06-30 18:56:48,220] Trial 39 finished with value: 0.732183908045977 and parameters: {'n_estimators': 115, 'learning_rate': 0.12828214572693278, 'max_depth': 3, 'num_leaves': 52, 'min_child_samples': 34, 'subsample': 0.867350917800929, 'colsample_bytree': 0.6260194957326859}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  82%|████████▏ | 41/50 [00:44<00:10,  1.18s/it]

[I 2025-06-30 18:56:49,687] Trial 40 finished with value: 0.6710344827586207 and parameters: {'n_estimators': 260, 'learning_rate': 0.04368365079000415, 'max_depth': 6, 'num_leaves': 43, 'min_child_samples': 31, 'subsample': 0.826978232531919, 'colsample_bytree': 0.6626651975008017}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  84%|████████▍ | 42/50 [00:45<00:08,  1.12s/it]

[I 2025-06-30 18:56:50,657] Trial 41 finished with value: 0.9036781609195403 and parameters: {'n_estimators': 147, 'learning_rate': 0.02665982991743007, 'max_depth': 3, 'num_leaves': 55, 'min_child_samples': 33, 'subsample': 0.7705282664269228, 'colsample_bytree': 0.6959036215875297}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  86%|████████▌ | 43/50 [00:46<00:07,  1.04s/it]

[I 2025-06-30 18:56:51,499] Trial 42 finished with value: 0.9310344827586207 and parameters: {'n_estimators': 110, 'learning_rate': 0.019171971714250294, 'max_depth': 3, 'num_leaves': 58, 'min_child_samples': 28, 'subsample': 0.9164304279938748, 'colsample_bytree': 0.7377654018202351}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  88%|████████▊ | 44/50 [00:47<00:06,  1.00s/it]

[I 2025-06-30 18:56:52,417] Trial 43 finished with value: 0.869655172413793 and parameters: {'n_estimators': 120, 'learning_rate': 0.017749426685932296, 'max_depth': 4, 'num_leaves': 60, 'min_child_samples': 28, 'subsample': 0.9542919541610017, 'colsample_bytree': 0.9907781787990794}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  90%|█████████ | 45/50 [00:48<00:05,  1.02s/it]

[I 2025-06-30 18:56:53,471] Trial 44 finished with value: 0.8694252873563219 and parameters: {'n_estimators': 111, 'learning_rate': 0.016255706835430542, 'max_depth': 5, 'num_leaves': 65, 'min_child_samples': 17, 'subsample': 0.999843834203986, 'colsample_bytree': 0.7327655310253693}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  92%|█████████▏| 46/50 [00:49<00:03,  1.01it/s]

[I 2025-06-30 18:56:54,395] Trial 45 finished with value: 0.9241379310344827 and parameters: {'n_estimators': 108, 'learning_rate': 0.01126257296120688, 'max_depth': 9, 'num_leaves': 59, 'min_child_samples': 24, 'subsample': 0.9158913528956754, 'colsample_bytree': 0.6645812044458386}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  94%|█████████▍| 47/50 [00:50<00:02,  1.01it/s]

[I 2025-06-30 18:56:55,384] Trial 46 finished with value: 0.7112643678160919 and parameters: {'n_estimators': 108, 'learning_rate': 0.03780495984664625, 'max_depth': 9, 'num_leaves': 58, 'min_child_samples': 25, 'subsample': 0.9336939207444835, 'colsample_bytree': 0.6261069519815493}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  96%|█████████▌| 48/50 [00:51<00:02,  1.02s/it]

[I 2025-06-30 18:56:56,488] Trial 47 finished with value: 0.6229885057471265 and parameters: {'n_estimators': 126, 'learning_rate': 0.10311678878046253, 'max_depth': 10, 'num_leaves': 52, 'min_child_samples': 23, 'subsample': 0.9169590804649255, 'colsample_bytree': 0.7480113330906584}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034:  98%|█████████▊| 49/50 [00:52<00:01,  1.01s/it]

[I 2025-06-30 18:56:57,468] Trial 48 finished with value: 0.8763218390804598 and parameters: {'n_estimators': 113, 'learning_rate': 0.022434961238292944, 'max_depth': 8, 'num_leaves': 62, 'min_child_samples': 35, 'subsample': 0.9079749918497606, 'colsample_bytree': 0.7783721326586229}. Best is trial 23 with value: 0.9310344827586207.


Best trial: 23. Best value: 0.931034: 100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


[I 2025-06-30 18:56:58,566] Trial 49 finished with value: 0.9172413793103449 and parameters: {'n_estimators': 128, 'learning_rate': 0.010455705284692135, 'max_depth': 10, 'num_leaves': 45, 'min_child_samples': 26, 'subsample': 0.8855321732976328, 'colsample_bytree': 0.7170246110540526}. Best is trial 23 with value: 0.9310344827586207.
Fold 1: Accuracy=0.8483, Precision=1.0000, Recall=0.2414, F1=0.3889, Threshold=0.50
Fold 2: Accuracy=0.8621, Precision=1.0000, Recall=0.3103, F1=0.4737, Threshold=0.50
Fold 3: Accuracy=0.8759, Precision=0.9231, Recall=0.4138, F1=0.5714, Threshold=0.50
Fold 4: Accuracy=0.8552, Precision=0.7647, Recall=0.4333, F1=0.5532, Threshold=0.50
Fold 5: Accuracy=0.8611, Precision=1.0000, Recall=0.3103, F1=0.4737, Threshold=0.50

--- Final LightGBM Optimized ---
Mean Accuracy : 0.8605
Mean Precision: 0.9376
Mean Recall   : 0.3418
Mean F1 Score : 0.4922


# CatBoost

In [22]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: CatBoost Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', CatBoostClassifier(
        iterations=100,
        learning_rate=0.1,
        depth=6,
        auto_class_weights='Balanced',
        verbose=0,  # suppress CatBoost internal logs
        random_seed=42
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'CatBoost-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified+VerboseOff'

# Console summary
print("\n--- CatBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8690, Precision=0.7083,  Recall=0.5862, F1=0.6415
Fold 2: Accuracy=0.8483, Precision=0.7059,  Recall=0.4138, F1=0.5217
Fold 3: Accuracy=0.8414, Precision=0.5938,  Recall=0.6552, F1=0.6230
Fold 4: Accuracy=0.8483, Precision=0.6250,  Recall=0.6667, F1=0.6452
Fold 5: Accuracy=0.8750, Precision=0.6774,  Recall=0.7241, F1=0.7000

--- CatBoost Summary ---
Mean Accuracy : 0.8564
Mean Precision: 0.6621
Mean Recall   : 0.6092
Mean F1 Score : 0.6263

CSV Row Format:
CatBoost-Balanced,OneHot+Scaler+5Fold-Stratified+VerboseOff,0.8564,0.6621,0.6092,0.6263


In [23]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Tuned CatBoost Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', CatBoostClassifier(
        iterations=300,
        learning_rate=0.05,
        depth=8,
        l2_leaf_reg=5,
        border_count=128,
        bagging_temperature=1.0,
        auto_class_weights='Balanced',
        random_seed=42,
        verbose=0
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'CatBoost-Tuned'
model_desc = 'OneHot+Scaler+5Fold+Depth8+LR0.05+BagTemp1.0'

# Console summary
print("\n--- CatBoost Tuned Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8759, Precision=0.7895,  Recall=0.5172, F1=0.6250
Fold 2: Accuracy=0.8483, Precision=0.7059,  Recall=0.4138, F1=0.5217
Fold 3: Accuracy=0.8345, Precision=0.5806,  Recall=0.6207, F1=0.6000
Fold 4: Accuracy=0.8414, Precision=0.6207,  Recall=0.6000, F1=0.6102
Fold 5: Accuracy=0.8681, Precision=0.7273,  Recall=0.5517, F1=0.6275

--- CatBoost Tuned Summary ---
Mean Accuracy : 0.8536
Mean Precision: 0.6848
Mean Recall   : 0.5407
Mean F1 Score : 0.5969

CSV Row Format:
CatBoost-Tuned,OneHot+Scaler+5Fold+Depth8+LR0.05+BagTemp1.0,0.8536,0.6848,0.5407,0.5969


In [24]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Define column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Aggressively Tuned CatBoost
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', CatBoostClassifier(
        iterations=500,
        learning_rate=0.03,
        depth=10,
        l2_leaf_reg=3,
        border_count=128,
        bagging_temperature=0.25,
        auto_class_weights='Balanced',
        random_seed=42,
        verbose=0
    ))
])

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Description
model_name = 'CatBoost-Aggressive'
model_desc = 'OneHot+Scaler+500Iter+LR0.03+Depth10+Bag0.25'

print("\n--- CatBoost Aggressive Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8828, Precision=0.8750,  Recall=0.4828, F1=0.6222
Fold 2: Accuracy=0.8414, Precision=0.6875,  Recall=0.3793, F1=0.4889
Fold 3: Accuracy=0.8483, Precision=0.6296,  Recall=0.5862, F1=0.6071
Fold 4: Accuracy=0.8552, Precision=0.6667,  Recall=0.6000, F1=0.6316
Fold 5: Accuracy=0.8472, Precision=0.6667,  Recall=0.4828, F1=0.5600

--- CatBoost Aggressive Summary ---
Mean Accuracy : 0.8550
Mean Precision: 0.7051
Mean Recall   : 0.5062
Mean F1 Score : 0.5820

CSV Row Format:
CatBoost-Aggressive,OneHot+Scaler+500Iter+LR0.03+Depth10+Bag0.25,0.8550,0.7051,0.5062,0.5820


In [25]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Constant column check
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Categorical and numerical columns
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Define CatBoost with default settings (will be tuned)
cat_model = CatBoostClassifier(
    auto_class_weights='Balanced',
    random_seed=42,
    verbose=0
)

# Create pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', cat_model)
])

# Define parameter search space
param_space = {
    'model__iterations': Integer(300, 800),
    'model__learning_rate': Real(0.01, 0.2, prior='log-uniform'),
    'model__depth': Integer(4, 10),
    'model__l2_leaf_reg': Real(1, 10),
    'model__bagging_temperature': Real(0, 1.0),
    'model__border_count': Integer(32, 254)
}

# Setup Bayesian optimization with 5-fold stratified CV
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Fit the search
opt.fit(X, y)

# Extract best pipeline and evaluate manually
best_pipeline = opt.best_estimator_

# Manual 5-Fold Eval
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Averages
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'CatBoost-BayesTuned'
model_desc = 'OneHot+Scaler+BayesSearch+5Fold'

print("\n--- CatBoost Bayesian Tuning Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8897, Precision=0.7826, Recall=0.6207, F1=0.6923
Fold 2: Accuracy=0.8621, Precision=0.7143, Recall=0.5172, F1=0.6000
Fold 3: Accuracy=0.8138, Precision=0.5263, Recall=0.6897, F1=0.5970
Fold 4: Accuracy=0.8621, Precision=0.6471, Recall=0.7333, F1=0.6875
Fold 5: Accuracy=0.8819, Precision=0.7000, Recall=0.7241, F1=0.7119

--- CatBoost Bayesian Tuning Summary ---
Mean Accuracy : 0.8619
Mean Precision: 0.6741
Mean Recall   : 0.6570
Mean F1 Score : 0.6577


# SVM

In [26]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# SVM model inside pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42))
])

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Mean scores
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'SVC-RBF-Pipeline'
model_desc = 'OneHot+Scaler+5Fold+Balanced'

print("\n--- SVM (RBF) Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8069, Precision=0.5143, Recall=0.6207, F1=0.5625
Fold 2: Accuracy=0.8207, Precision=0.5556, Recall=0.5172, F1=0.5357
Fold 3: Accuracy=0.7655, Precision=0.4444, Recall=0.6897, F1=0.5405
Fold 4: Accuracy=0.7793, Precision=0.4783, Recall=0.7333, F1=0.5789
Fold 5: Accuracy=0.7222, Precision=0.3878, Recall=0.6552, F1=0.4872

--- SVM (RBF) Summary ---
Name                          : SVC-RBF-Pipeline
Description                   : OneHot+Scaler+5Fold+Balanced
Accuracy                      : 0.7789
Precision                     : 0.4761
Recall                        : 0.6432
F1 Score                      : 0.5410

CSV Row Format:
SVC-RBF-Pipeline,OneHot+Scaler+5Fold+Balanced,0.7789,0.4761,0.6432,0.5410


In [27]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV
from skopt.space import Real
import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SVC pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(probability=True, random_state=42))
])

# Step 5: Define search space
param_space = {
    'model__C': Real(0.1, 100, prior='log-uniform'),
    'model__gamma': Real(1e-4, 1.0, prior='log-uniform'),
    'model__kernel': ['rbf'],
    'model__class_weight': ['balanced']
}

# Step 6: Bayesian optimization
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Step 7: Fit optimizer
opt.fit(X, y)
best_pipeline = opt.best_estimator_

# Step 8: Manual evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Aggregate metrics
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'SVC-BayesTuned'
model_desc = 'OneHot+Scaler+BayesSearch+RBF+Balanced'

print("\n--- Tuned SVM (RBF) Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8414, Precision=0.6000, Recall=0.6207, F1=0.6102
Fold 2: Accuracy=0.8414, Precision=0.6364, Recall=0.4828, F1=0.5490
Fold 3: Accuracy=0.8552, Precision=0.6429, Recall=0.6207, F1=0.6316
Fold 4: Accuracy=0.8069, Precision=0.5357, Recall=0.5000, F1=0.5172
Fold 5: Accuracy=0.8333, Precision=0.5862, Recall=0.5862, F1=0.5862

--- Tuned SVM (RBF) Summary ---
Mean Accuracy : 0.8356
Mean Precision: 0.6002
Mean Recall   : 0.5621
Mean F1 Score : 0.5788

CSV Row Format:
SVC-BayesTuned,OneHot+Scaler+BayesSearch+RBF+Balanced,0.8356,0.6002,0.5621,0.5788


In [28]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV
from skopt.space import Real
import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SVC pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(probability=True, random_state=42))
])

# Step 5: Define search space
param_space = {
    'model__C': Real(0.1, 100, prior='log-uniform'),
    'model__gamma': Real(1e-4, 1.0, prior='log-uniform'),
    'model__kernel': ['rbf'],
    'model__class_weight': ['balanced']
}

# Step 6: Bayesian optimization
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Step 7: Fit optimizer
opt.fit(X, y)
best_pipeline = opt.best_estimator_

# Step 8: Manual evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Aggregate metrics
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'SVC-BayesTuned'
model_desc = 'OneHot+Scaler+BayesSearch+RBF+Balanced'

print("\n--- Tuned SVM (RBF) Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8414, Precision=0.6000, Recall=0.6207, F1=0.6102
Fold 2: Accuracy=0.8414, Precision=0.6364, Recall=0.4828, F1=0.5490
Fold 3: Accuracy=0.8552, Precision=0.6429, Recall=0.6207, F1=0.6316
Fold 4: Accuracy=0.8069, Precision=0.5357, Recall=0.5000, F1=0.5172
Fold 5: Accuracy=0.8333, Precision=0.5862, Recall=0.5862, F1=0.5862

--- Tuned SVM (RBF) Summary ---
Mean Accuracy : 0.8356
Mean Precision: 0.6002
Mean Recall   : 0.5621
Mean F1 Score : 0.5788

CSV Row Format:
SVC-BayesTuned,OneHot+Scaler+BayesSearch+RBF+Balanced,0.8356,0.6002,0.5621,0.5788


In [29]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV
from skopt.space import Real
import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SVC pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(probability=True, random_state=42))
])

# Step 5: Define search space for BayesSearchCV
param_space = {
    'model__C': Real(0.1, 100, prior='log-uniform'),
    'model__gamma': Real(1e-4, 1.0, prior='log-uniform'),
    'model__kernel': ['rbf'],
    'model__class_weight': ['balanced']
}

# Step 6: Bayesian optimization focused on RECALL
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='recall',  # prioritize recall
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Step 7: Fit optimizer
opt.fit(X, y)
best_pipeline = opt.best_estimator_

# Step 8: Manual evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Aggregate metrics
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'SVC-BayesTuned-Recall'
model_desc = 'OneHot+Scaler+BayesSearch+RBF+Balanced+RecallOpt'

print("\n--- Tuned SVM (RBF) Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333
Fold 2: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333
Fold 3: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333
Fold 4: Accuracy=0.2069, Precision=0.2069, Recall=1.0000, F1=0.3429
Fold 5: Accuracy=0.2014, Precision=0.2014, Recall=1.0000, F1=0.3353

--- Tuned SVM (RBF) Summary ---
Mean Accuracy : 0.2017
Mean Precision: 0.2017
Mean Recall   : 1.0000
Mean F1 Score : 0.3356

CSV Row Format:
SVC-BayesTuned-Recall,OneHot+Scaler+BayesSearch+RBF+Balanced+RecallOpt,0.2017,0.2017,1.0000,0.3356


# Bagging

In [30]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Bagging Classifier pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=50,
        max_samples=0.8,
        max_features=1.0,
        bootstrap=True,
        random_state=42,
        n_jobs=-1
    ))
])

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'Bagging-DecisionTree'
model_desc = 'Bagging-with-Preprocessing-5Fold'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8828, Precision=0.8750, Recall=0.4828, F1=0.6222
Fold 2: Accuracy=0.8552, Precision=0.7857, Recall=0.3793, F1=0.5116
Fold 3: Accuracy=0.8759, Precision=0.7391, Recall=0.5862, F1=0.6538
Fold 4: Accuracy=0.8759, Precision=0.8000, Recall=0.5333, F1=0.6400
Fold 5: Accuracy=0.8472, Precision=0.7333, Recall=0.3793, F1=0.5000

--- Model Summary ---
Name                          : Bagging-DecisionTree
Description                   : Bagging-with-Preprocessing-5Fold
Accuracy                      : 0.8674
Precision                     : 0.7866
Recall                        : 0.4722
F1 Score                      : 0.5855

CSV Row Format:
Bagging-DecisionTree,Bagging-with-Preprocessing-5Fold,0.8674,0.7866,0.4722,0.5855


In [31]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')

# Data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        random_state=42,
        n_jobs=-1
    ))
])

# Parameter search space for Bagging + Decision Tree
search_space = {
    'model__n_estimators': Integer(10, 100),
    'model__max_samples': Real(0.5, 1.0),
    'model__max_features': Real(0.5, 1.0),
    'model__estimator__max_depth': Integer(2, 20),
    'model__estimator__min_samples_split': Integer(2, 10),
    'model__estimator__min_samples_leaf': Integer(1, 10)
}

# BayesSearchCV setup (recall as scoring metric)
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    scoring=make_scorer(recall_score),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=30,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# Fit
bayes_search.fit(X, y)

# Best model
best_model = bayes_search.best_estimator_

# 5-Fold Evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Metadata
model_name = 'Bagging+DT-Tuned'
model_desc = 'BayesCV-Tuned-Recall-Max-5Fold'

print("\n--- Final Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8690, Precision=0.7778, Recall=0.4828, F1=0.5957
Fold 2: Accuracy=0.8552, Precision=0.7857, Recall=0.3793, F1=0.5116
Fold 3: Accuracy=0.8621, Precision=0.6800, Recall=0.5862, F1=0.6296
Fold 4: Accuracy=0.8828, Precision=0.7600, Recall=0.6333, F1=0.6909
Fold 5: Accuracy=0.8750, Precision=0.8235, Recall=0.4828, F1=0.6087

--- Final Tuned Model Summary ---
Name                          : Bagging+DT-Tuned
Description                   : BayesCV-Tuned-Recall-Max-5Fold
Accuracy                      : 0.8688
Precision                     : 0.7654
Recall                        : 0.5129
F1 Score                      : 0.6073

CSV Row Format:
Bagging+DT-Tuned,BayesCV-Tuned-Recall-Max-5Fold,0.8688,0.7654,0.5129,0.6073


In [32]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')

# Data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        random_state=42,
        n_jobs=-1
    ))
])

# Parameter search space for Bagging + Decision Tree
search_space = {
    'model__n_estimators': Integer(10, 100),
    'model__max_samples': Real(0.5, 1.0),
    'model__max_features': Real(0.5, 1.0),
    'model__estimator__max_depth': Integer(2, 20),
    'model__estimator__min_samples_split': Integer(2, 10),
    'model__estimator__min_samples_leaf': Integer(1, 10)
}

# BayesSearchCV setup (recall as scoring metric)
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    scoring=make_scorer(recall_score),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=30,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# Fit
bayes_search.fit(X, y)

# Best model
best_model = bayes_search.best_estimator_

# 5-Fold Evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Metadata
model_name = 'Bagging+DT-Tuned'
model_desc = 'BayesCV-Tuned-Recall-Max-5Fold'

print("\n--- Final Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8690, Precision=0.7778, Recall=0.4828, F1=0.5957
Fold 2: Accuracy=0.8552, Precision=0.7857, Recall=0.3793, F1=0.5116
Fold 3: Accuracy=0.8621, Precision=0.6800, Recall=0.5862, F1=0.6296
Fold 4: Accuracy=0.8828, Precision=0.7600, Recall=0.6333, F1=0.6909
Fold 5: Accuracy=0.8750, Precision=0.8235, Recall=0.4828, F1=0.6087

--- Final Tuned Model Summary ---
Name                          : Bagging+DT-Tuned
Description                   : BayesCV-Tuned-Recall-Max-5Fold
Accuracy                      : 0.8688
Precision                     : 0.7654
Recall                        : 0.5129
F1 Score                      : 0.6073

CSV Row Format:
Bagging+DT-Tuned,BayesCV-Tuned-Recall-Max-5Fold,0.8688,0.7654,0.5129,0.6073


In [33]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')

# --- Data Setup ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# --- Preprocessing ---
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Pipeline ---
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(class_weight='balanced'),
        n_estimators=50,
        max_samples=0.8,
        max_features=1.0,
        bootstrap=True,
        random_state=42,
        n_jobs=-1
    ))
])

# --- Search Space ---
search_space = {
    'model__n_estimators': Integer(20, 100),
    'model__max_samples': Real(0.4, 1.0),
    'model__max_features': Real(0.4, 1.0),
    'model__estimator__max_depth': Integer(3, 20),
    'model__estimator__min_samples_split': Integer(2, 15),
    'model__estimator__min_samples_leaf': Integer(1, 10)
}

# --- Tuning ---
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    scoring=make_scorer(recall_score),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=30,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# --- Fit ---
bayes_search.fit(X, y)
best_model = bayes_search.best_estimator_

# --- Cross-Validation Evaluation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# --- Final Metrics ---
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# --- Output ---
model_name = 'Bagging+DT-Balanced-Tuned'
model_desc = 'BaggingDT+Balanced+BayesCV-Recall'

print("\n--- Final Tuned Bagging Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8345, Precision=0.5610, Recall=0.7931, F1=0.6571
Fold 2: Accuracy=0.8000, Precision=0.5000, Recall=0.7241, F1=0.5915
Fold 3: Accuracy=0.7379, Precision=0.4211, Recall=0.8276, F1=0.5581
Fold 4: Accuracy=0.7586, Precision=0.4576, Recall=0.9000, F1=0.6067
Fold 5: Accuracy=0.7431, Precision=0.4231, Recall=0.7586, F1=0.5432

--- Final Tuned Bagging Model Summary ---
Name                          : Bagging+DT-Balanced-Tuned
Description                   : BaggingDT+Balanced+BayesCV-Recall
Accuracy                      : 0.7748
Precision                     : 0.4725
Recall                        : 0.8007
F1 Score                      : 0.5914

CSV Row Format:
Bagging+DT-Balanced-Tuned,BaggingDT+Balanced+BayesCV-Recall,0.7748,0.4725,0.8007,0.5914


# AdaBoostClassifier

In [34]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ]
)

# AdaBoost model pipeline
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', AdaBoostClassifier(random_state=42))
])

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'AdaBoostClassifier'
model_desc = 'AdaBoost-5Fold-Preprocessed'

# Print formatted summary
print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV (append row, create file if not exists)
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8690, Precision=0.8125, Recall=0.4483, F1=0.5778
Fold 2: Accuracy=0.8690, Precision=0.9167, Recall=0.3793, F1=0.5366
Fold 3: Accuracy=0.8690, Precision=0.7273, Recall=0.5517, F1=0.6275
Fold 4: Accuracy=0.8552, Precision=0.6957, Recall=0.5333, F1=0.6038
Fold 5: Accuracy=0.8819, Precision=0.7500, Recall=0.6207, F1=0.6792

--- Model Summary ---
Name                          : AdaBoostClassifier
Description                   : AdaBoost-5Fold-Preprocessed
Accuracy                      : 0.8688
Precision                     : 0.7804
Recall                        : 0.5067
F1 Score                      : 0.6050

CSV Row Format:
AdaBoostClassifier,AdaBoost-5Fold-Preprocessed,0.8688,0.7804,0.5067,0.6050


In [35]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from skopt import BayesSearchCV
from skopt.space import Integer, Real

import warnings
warnings.filterwarnings('ignore')

# Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# AdaBoost pipeline
pipe = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', AdaBoostClassifier(algorithm='SAMME', random_state=42))
])

# Hyperparameter search space (no 'SAMME.R')
search_space = {
    'classifier__n_estimators': Integer(50, 300),
    'classifier__learning_rate': Real(0.01, 1.0, prior='log-uniform')
}

# CV and tuner
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
opt = BayesSearchCV(
    pipe,
    search_spaces=search_space,
    scoring='recall',
    n_iter=25,
    cv=cv,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Fit the tuner
opt.fit(X, y)

# Final best model
best_model = opt.best_estimator_

# CV metric evaluation using best model
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'AdaBoostClassifier-Tuned'
model_desc = f"AdaBoost-Tuned-SAMME-{opt.best_params_}"

# Print summary
print("\n--- Tuned AdaBoost Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# CSV write
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8828, Precision=0.8000, Recall=0.5517, F1=0.6531
Fold 2: Accuracy=0.8621, Precision=0.8000, Recall=0.4138, F1=0.5455
Fold 3: Accuracy=0.8621, Precision=0.7143, Recall=0.5172, F1=0.6000
Fold 4: Accuracy=0.8483, Precision=0.6538, Recall=0.5667, F1=0.6071
Fold 5: Accuracy=0.8542, Precision=0.6538, Recall=0.5862, F1=0.6182

--- Tuned AdaBoost Summary ---
Name                          : AdaBoostClassifier-Tuned
Description                   : AdaBoost-Tuned-SAMME-OrderedDict({'classifier__learning_rate': 1.0, 'classifier__n_estimators': 180})
Accuracy                      : 0.8619
Precision                     : 0.7244
Recall                        : 0.5271
F1 Score                      : 0.6048


# BalancedBaggingClassifier with a DecisionTreeClassifier(max_depth=6)

In [38]:
import os
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Assume df_clean is preloaded
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Preprocessing (same as your other pipelines)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numeric_cols)
])

# Classifier setup
base_estimator = DecisionTreeClassifier(max_depth=6, random_state=42)
clf = BalancedBaggingClassifier(
    estimator=base_estimator,
    n_estimators=50,
    sampling_strategy='auto',
    replacement=False,
    random_state=42,
    n_jobs=-1
)

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf)
])

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Aggregate results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model metadata
model_name = 'BalancedBagging-DecisionTree'
model_desc = 'Bagging+Balanced+DT(max_depth=6)+5Fold'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save results
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8276, Precision=0.5526, Recall=0.7241, F1=0.6269
Fold 2: Accuracy=0.8690, Precision=0.6786, Recall=0.6552, F1=0.6667
Fold 3: Accuracy=0.7448, Precision=0.4259, Recall=0.7931, F1=0.5542
Fold 4: Accuracy=0.8069, Precision=0.5217, Recall=0.8000, F1=0.6316
Fold 5: Accuracy=0.8542, Precision=0.6053, Recall=0.7931, F1=0.6866

--- Model Summary ---
Name                          : BalancedBagging-DecisionTree
Description                   : Bagging+Balanced+DT(max_depth=6)+5Fold
Accuracy                      : 0.8205
Precision                     : 0.5568
Recall                        : 0.7531
F1 Score                      : 0.6332

CSV Row Format:
BalancedBagging-DecisionTree,Bagging+Balanced+DT(max_depth=6)+5Fold,0.8205,0.5568,0.7531,0.6332


# EasyEnsembleClassifier

In [39]:
import os
import numpy as np
import pandas as pd
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Dataset
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Feature columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numeric_cols)
])

# Classifier: EasyEnsemble with default AdaBoost base
clf = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42,
    n_jobs=-1
)

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf)
])

# Stratified K-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Aggregate metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Metadata
model_name = 'EasyEnsembleClassifier'
model_desc = 'Ensemble+Undersampling+AdaBoost+5Fold'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8345, Precision=0.5581, Recall=0.8276, F1=0.6667
Fold 2: Accuracy=0.8207, Precision=0.5429, Recall=0.6552, F1=0.5938
Fold 3: Accuracy=0.7448, Precision=0.4231, Recall=0.7586, F1=0.5432
Fold 4: Accuracy=0.8069, Precision=0.5208, Recall=0.8333, F1=0.6410
Fold 5: Accuracy=0.7986, Precision=0.5000, Recall=0.9310, F1=0.6506

--- Model Summary ---
Name                          : EasyEnsembleClassifier
Description                   : Ensemble+Undersampling+AdaBoost+5Fold
Accuracy                      : 0.8011
Precision                     : 0.5090
Recall                        : 0.8011
F1 Score                      : 0.6191

CSV Row Format:
EasyEnsembleClassifier,Ensemble+Undersampling+AdaBoost+5Fold,0.8011,0.5090,0.8011,0.6191


# EasyEnsembleClassifier

In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.ensemble import EasyEnsembleClassifier
import warnings

warnings.filterwarnings('ignore')

# Target and features
current_df = df_clean.copy()  # Ensure `df_clean` is already cleaned
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Initialize EasyEnsembleClassifier
base_model = EasyEnsembleClassifier(random_state=42, n_estimators=10)

# Preprocessing for categorical and numeric features
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Feature selection
feature_selector = SelectFromModel(estimator=RandomForestClassifier(random_state=42), max_features=20)

# Threshold tuning range
thresholds = np.linspace(0.1, 0.5, 5)

# Pipeline setup
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('classifier', base_model)
])

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_probs = pipeline.predict_proba(X_val)[:, 1]

    # Threshold tuning
    best_metrics = {'acc': 0, 'prec': 0, 'rec': 0, 'f1': 0, 'threshold': 0}
    for thresh in thresholds:
        y_pred = (y_probs >= thresh).astype(int)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        
        if f1 > best_metrics['f1']:
            best_metrics = {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'threshold': thresh}
    
    accuracy_list.append(best_metrics['acc'])
    precision_list.append(best_metrics['prec'])
    recall_list.append(best_metrics['rec'])
    f1_list.append(best_metrics['f1'])
    
    print(f"Fold {fold}: Threshold={best_metrics['threshold']:.2f}, Accuracy={best_metrics['acc']:.4f}, "
          f"Precision={best_metrics['prec']:.4f}, Recall={best_metrics['rec']:.4f}, F1={best_metrics['f1']:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'EasyEnsembleClassifier+ThresholdTuning'
model_desc = 'EEC-ThresholdTuning-5Fold'

# Print formatted summary
print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Threshold=0.50, Accuracy=0.8483, Precision=0.5854, Recall=0.8276, F1=0.6857
Fold 2: Threshold=0.50, Accuracy=0.8207, Precision=0.5429, Recall=0.6552, F1=0.5938
Fold 3: Threshold=0.50, Accuracy=0.7379, Precision=0.4118, Recall=0.7241, F1=0.5250
Fold 4: Threshold=0.50, Accuracy=0.8000, Precision=0.5094, Recall=0.9000, F1=0.6506
Fold 5: Threshold=0.50, Accuracy=0.8194, Precision=0.5283, Recall=0.9655, F1=0.6829

--- Model Summary ---
Name                          : EasyEnsembleClassifier+ThresholdTuning
Description                   : EEC-ThresholdTuning-5Fold
Accuracy                      : 0.8053
Precision                     : 0.5155
Recall                        : 0.8145
F1 Score                      : 0.6276


In [11]:
import os
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

# --- Custom Transformer for Top 20 Features ---
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, model, k=20):
        self.model = model
        self.k = k
        self.top_indices = None

    def fit(self, X, y):
        self.model.fit(X, y)
        if hasattr(self.model, "feature_importances_"):
            importances = self.model.feature_importances_
        else:
            raise AttributeError("Model must have feature_importances_")
        self.top_indices = np.argsort(importances)[::-1][:self.k]
        return self

    def transform(self, X):
        return X[:, self.top_indices]

# --- Load data ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# --- Identify column types ---
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# --- Preprocessing ---
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Pipeline components ---
rf_for_selection = RandomForestClassifier(n_estimators=100, random_state=42)
top_k_selector = TopFeatureSelector(model=rf_for_selection, k=20)

model = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', top_k_selector),
    ('model', model)
])

# --- CV and threshold tuning ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.arange(0.1, 0.51, 0.05)

best_metrics = {'threshold': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0}

# Try all thresholds
for threshold in thresholds:
    acc_list, prec_list, rec_list, f1_list = [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train, y_train)
        y_proba = pipeline.predict_proba(X_val)[:, 1]
        y_pred = (y_proba >= threshold).astype(int)

        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        acc_list.append(acc)
        prec_list.append(prec)
        rec_list.append(rec)
        f1_list.append(f1)

    mean_acc = np.mean(acc_list)
    mean_prec = np.mean(prec_list)
    mean_rec = np.mean(rec_list)
    mean_f1 = np.mean(f1_list)

    print(f"Threshold={threshold:.2f} | Accuracy={mean_acc:.4f}, Precision={mean_prec:.4f}, Recall={mean_rec:.4f}, F1={mean_f1:.4f}")

    if (
        mean_rec > best_metrics['recall'] and
        mean_prec > 0.6 and
        mean_f1 > 0.7 and
        mean_acc > 0.8
    ):
        best_metrics.update({
            'threshold': threshold,
            'accuracy': mean_acc,
            'precision': mean_prec,
            'recall': mean_rec,
            'f1': mean_f1
        })

# --- Fallback if no threshold met all strict criteria ---
if best_metrics['f1'] == 0:
    print("\nNo threshold met strict criteria. Falling back to best F1 score.")
    best_f1 = 0
    for threshold in thresholds:
        acc_list, prec_list, rec_list, f1_list = [], [], [], []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            pipeline.fit(X_train, y_train)
            y_proba = pipeline.predict_proba(X_val)[:, 1]
            y_pred = (y_proba >= threshold).astype(int)

            acc_list.append(accuracy_score(y_val, y_pred))
            prec_list.append(precision_score(y_val, y_pred, zero_division=0))
            rec_list.append(recall_score(y_val, y_pred))
            f1_list.append(f1_score(y_val, y_pred))

        mean_f1 = np.mean(f1_list)
        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_metrics.update({
                'threshold': threshold,
                'accuracy': np.mean(acc_list),
                'precision': np.mean(prec_list),
                'recall': np.mean(rec_list),
                'f1': mean_f1
            })

# --- Reporting ---
model_name = 'EasyEnsemble-Top20Feat+Thresh'
model_desc = f'5Fold-EEC+Top20Selector+ThreshTuned-{best_metrics["threshold"]:.2f}'

print("\n--- Best Threshold Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {best_metrics['accuracy']:.4f}")
print(f"{'Precision':<30}: {best_metrics['precision']:.4f}")
print(f"{'Recall':<30}: {best_metrics['recall']:.4f}")
print(f"{'F1 Score':<30}: {best_metrics['f1']:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{best_metrics['accuracy']:.4f},{best_metrics['precision']:.4f},{best_metrics['recall']:.4f},{best_metrics['f1']:.4f}")

# --- Save results ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(best_metrics['accuracy'], 4),
    'Precision': round(best_metrics['precision'], 4),
    'Recall': round(best_metrics['recall'], 4),
    'F1 Score': round(best_metrics['f1'], 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Threshold=0.10 | Accuracy=0.2017, Precision=0.2017, Recall=1.0000, F1=0.3356
Threshold=0.15 | Accuracy=0.2017, Precision=0.2017, Recall=1.0000, F1=0.3356
Threshold=0.20 | Accuracy=0.2017, Precision=0.2017, Recall=1.0000, F1=0.3356
Threshold=0.25 | Accuracy=0.2044, Precision=0.2022, Recall=1.0000, F1=0.3364
Threshold=0.30 | Accuracy=0.2058, Precision=0.2025, Recall=1.0000, F1=0.3368
Threshold=0.35 | Accuracy=0.2472, Precision=0.2113, Recall=1.0000, F1=0.3489
Threshold=0.40 | Accuracy=0.4378, Precision=0.2644, Recall=1.0000, F1=0.4181
Threshold=0.45 | Accuracy=0.6243, Precision=0.3458, Recall=0.9586, F1=0.5077
Threshold=0.50 | Accuracy=0.7970, Precision=0.5018, Recall=0.8152, F1=0.6186

No threshold met strict criteria. Falling back to best F1 score.

--- Best Threshold Tuned Model Summary ---
Name                          : EasyEnsemble-Top20Feat+Thresh
Description                   : 5Fold-EEC+Top20Selector+ThreshTuned-0.50
Accuracy                      : 0.7970
Precision              

# BalancedBaggingClassifier + LightGBM

In [12]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# --- Data setup ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# --- Column types ---
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# --- Preprocessing ---
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Classifier setup ---
lgbm = LGBMClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

bbc = BalancedBaggingClassifier(
    estimator=lgbm,
    n_estimators=10,
    sampling_strategy='auto',
    replacement=False,
    random_state=42,
    n_jobs=-1
)

# --- Full pipeline ---
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', bbc)
])

# --- Evaluation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.arange(0.1, 0.51, 0.05)
best_metrics = {'threshold': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0}

for threshold in thresholds:
    acc_list, prec_list, rec_list, f1_list = [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train, y_train)
        y_proba = pipeline.predict_proba(X_val)[:, 1]
        y_pred = (y_proba >= threshold).astype(int)

        acc_list.append(accuracy_score(y_val, y_pred))
        prec_list.append(precision_score(y_val, y_pred, zero_division=0))
        rec_list.append(recall_score(y_val, y_pred))
        f1_list.append(f1_score(y_val, y_pred))

    mean_acc = np.mean(acc_list)
    mean_prec = np.mean(prec_list)
    mean_rec = np.mean(rec_list)
    mean_f1 = np.mean(f1_list)

    print(f"Threshold={threshold:.2f} | Accuracy={mean_acc:.4f}, Precision={mean_prec:.4f}, Recall={mean_rec:.4f}, F1={mean_f1:.4f}")

    if (
        mean_rec > best_metrics['recall'] and
        mean_prec > 0.6 and
        mean_f1 > 0.7 and
        mean_acc > 0.8
    ):
        best_metrics.update({
            'threshold': threshold,
            'accuracy': mean_acc,
            'precision': mean_prec,
            'recall': mean_rec,
            'f1': mean_f1
        })

# --- Fallback to best F1 if no strict threshold matched ---
if best_metrics['f1'] == 0:
    print("\nNo threshold met strict criteria. Falling back to best F1 score.")
    best_f1 = 0
    for threshold in thresholds:
        acc_list, prec_list, rec_list, f1_list = [], [], [], []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            pipeline.fit(X_train, y_train)
            y_proba = pipeline.predict_proba(X_val)[:, 1]
            y_pred = (y_proba >= threshold).astype(int)

            acc_list.append(accuracy_score(y_val, y_pred))
            prec_list.append(precision_score(y_val, y_pred, zero_division=0))
            rec_list.append(recall_score(y_val, y_pred))
            f1_list.append(f1_score(y_val, y_pred))

        mean_f1 = np.mean(f1_list)
        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_metrics.update({
                'threshold': threshold,
                'accuracy': np.mean(acc_list),
                'precision': np.mean(prec_list),
                'recall': np.mean(rec_list),
                'f1': mean_f1
            })

# --- Reporting ---
model_name = 'BalancedBagging-LGBM'
model_desc = f'5Fold-BBC+LGBM-ThresholdTuned-{best_metrics["threshold"]:.2f}'

print("\n--- Best Threshold Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {best_metrics['accuracy']:.4f}")
print(f"{'Precision':<30}: {best_metrics['precision']:.4f}")
print(f"{'Recall':<30}: {best_metrics['recall']:.4f}")
print(f"{'F1 Score':<30}: {best_metrics['f1']:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{best_metrics['accuracy']:.4f},{best_metrics['precision']:.4f},{best_metrics['recall']:.4f},{best_metrics['f1']:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(best_metrics['accuracy'], 4),
    'Precision': round(best_metrics['precision'], 4),
    'Recall': round(best_metrics['recall'], 4),
    'F1 Score': round(best_metrics['f1'], 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Threshold=0.10 | Accuracy=0.6146, Precision=0.3422, Recall=0.9724, F1=0.5057
Threshold=0.15 | Accuracy=0.6767, Precision=0.3815, Recall=0.9448, F1=0.5426
Threshold=0.20 | Accuracy=0.7072, Precision=0.4049, Recall=0.9313, F1=0.5633
Threshold=0.25 | Accuracy=0.7362, Precision=0.4296, Recall=0.8970, F1=0.5791
Threshold=0.30 | Accuracy=0.7679, Precision=0.4648, Recall=0.8628, F1=0.6019
Threshold=0.35 | Accuracy=0.7901, Precision=0.4945, Recall=0.8287, F1=0.6167
Threshold=0.40 | Accuracy=0.8025, Precision=0.5141, Recall=0.8085, F1=0.6252
Threshold=0.45 | Accuracy=0.8232, Precision=0.5511, Recall=0.7809, F1=0.6433
Threshold=0.50 | Accuracy=0.8329, Precision=0.5719, Recall=0.7674, F1=0.6523

No threshold met strict criteria. Falling back to best F1 score.

--- Best Threshold Tuned Model Summary ---
Name                          : BalancedBagging-LGBM
Description                   : 5Fold-BBC+LGBM-ThresholdTuned-0.50
Accuracy                      : 0.8329
Precision                     : 0.5719

# StackingClassifier with Threshold Tuning

In [13]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier

import warnings
warnings.filterwarnings('ignore')

# --- Data prep ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# --- Preprocessing ---
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Base models ---
cat = CatBoostClassifier(verbose=0, random_state=42)
lgbm = LGBMClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
svc = SVC(kernel='rbf', C=1, probability=True, random_state=42)
bbc = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=6, random_state=42),
    n_estimators=10,
    sampling_strategy='auto',
    replacement=False,
    random_state=42,
    n_jobs=-1
)

# --- Meta model ---
meta_model = LogisticRegression(max_iter=1000)

# --- Stacking ---
stacking_model = StackingClassifier(
    estimators=[
        ('cat', cat),
        ('lgbm', lgbm),
        ('svc', svc),
        ('bbc', bbc)
    ],
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=True
)

# --- Full pipeline ---
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', stacking_model)
])

# --- CV + Threshold tuning ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = np.arange(0.1, 0.51, 0.05)
best_metrics = {'threshold': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0}

for threshold in thresholds:
    acc_list, prec_list, rec_list, f1_list = [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train, y_train)
        y_proba = pipeline.predict_proba(X_val)[:, 1]
        y_pred = (y_proba >= threshold).astype(int)

        acc_list.append(accuracy_score(y_val, y_pred))
        prec_list.append(precision_score(y_val, y_pred, zero_division=0))
        rec_list.append(recall_score(y_val, y_pred))
        f1_list.append(f1_score(y_val, y_pred))

    mean_acc = np.mean(acc_list)
    mean_prec = np.mean(prec_list)
    mean_rec = np.mean(rec_list)
    mean_f1 = np.mean(f1_list)

    print(f"Threshold={threshold:.2f} | Accuracy={mean_acc:.4f}, Precision={mean_prec:.4f}, Recall={mean_rec:.4f}, F1={mean_f1:.4f}")

    if (
        mean_rec > best_metrics['recall'] and
        mean_prec > 0.6 and
        mean_f1 > 0.7 and
        mean_acc > 0.8
    ):
        best_metrics.update({
            'threshold': threshold,
            'accuracy': mean_acc,
            'precision': mean_prec,
            'recall': mean_rec,
            'f1': mean_f1
        })

# --- Fallback: best F1 if strict criteria fail ---
if best_metrics['f1'] == 0:
    print("\nNo threshold met strict criteria. Falling back to best F1 score.")
    best_f1 = 0
    for threshold in thresholds:
        acc_list, prec_list, rec_list, f1_list = [], [], [], []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            pipeline.fit(X_train, y_train)
            y_proba = pipeline.predict_proba(X_val)[:, 1]
            y_pred = (y_proba >= threshold).astype(int)

            acc_list.append(accuracy_score(y_val, y_pred))
            prec_list.append(precision_score(y_val, y_pred, zero_division=0))
            rec_list.append(recall_score(y_val, y_pred))
            f1_list.append(f1_score(y_val, y_pred))

        mean_f1 = np.mean(f1_list)
        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_metrics.update({
                'threshold': threshold,
                'accuracy': np.mean(acc_list),
                'precision': np.mean(prec_list),
                'recall': np.mean(rec_list),
                'f1': mean_f1
            })

# --- Reporting ---
model_name = 'Stacking-CatLGBMSVCBBC'
model_desc = f'StackingCatLGBMSVCBBC+LogRegMeta+ThreshTuned-{best_metrics["threshold"]:.2f}'

print("\n--- Best Threshold Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {best_metrics['accuracy']:.4f}")
print(f"{'Precision':<30}: {best_metrics['precision']:.4f}")
print(f"{'Recall':<30}: {best_metrics['recall']:.4f}")
print(f"{'F1 Score':<30}: {best_metrics['f1']:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{best_metrics['accuracy']:.4f},{best_metrics['precision']:.4f},{best_metrics['recall']:.4f},{best_metrics['f1']:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(best_metrics['accuracy'], 4),
    'Precision': round(best_metrics['precision'], 4),
    'Recall': round(best_metrics['recall'], 4),
    'F1 Score': round(best_metrics['f1'], 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Threshold=0.10 | Accuracy=0.7362, Precision=0.4282, Recall=0.8492, F1=0.5677
Threshold=0.15 | Accuracy=0.7901, Precision=0.4943, Recall=0.8147, F1=0.6126
Threshold=0.20 | Accuracy=0.8246, Precision=0.5511, Recall=0.7669, F1=0.6399
Threshold=0.25 | Accuracy=0.8426, Precision=0.6018, Recall=0.7186, F1=0.6506
Threshold=0.30 | Accuracy=0.8591, Precision=0.6572, Recall=0.6910, F1=0.6670
Threshold=0.35 | Accuracy=0.8660, Precision=0.6772, Recall=0.6775, F1=0.6716
Threshold=0.40 | Accuracy=0.8660, Precision=0.6964, Recall=0.6294, F1=0.6547
Threshold=0.45 | Accuracy=0.8674, Precision=0.7313, Recall=0.5814, F1=0.6382
Threshold=0.50 | Accuracy=0.8674, Precision=0.7489, Recall=0.5474, F1=0.6255

No threshold met strict criteria. Falling back to best F1 score.

--- Best Threshold Tuned Model Summary ---
Name                          : Stacking-CatLGBMSVCBBC
Description                   : StackingCatLGBMSVCBBC+LogRegMeta+ThreshTuned-0.35
Accuracy                      : 0.8660
Precision            

In [9]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# --- Prepare Data ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# --- Preprocessing ---
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Models ---
svc = Pipeline([
    ('pre', preprocessor),
    ('model', SVC(
        kernel='rbf',
        C=10, gamma=0.01,
        probability=True,
        random_state=42
    ))
])

adaboost = Pipeline([
    ('pre', preprocessor),
    ('model', AdaBoostClassifier(
        n_estimators=150,
        learning_rate=0.6,
        random_state=42
    ))
])

catboost = Pipeline([
    ('pre', preprocessor),
    ('model', CatBoostClassifier(
        iterations=250,
        learning_rate=0.04,
        depth=6,
        l2_leaf_reg=3,
        verbose=0,
        random_seed=42
    ))
])

# --- CV Setup ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# --- Weights & Threshold ---
weights = [1, 2, 2]  # svc, adaboost, catboost
threshold = 0.45

# --- 5-Fold Evaluation ---
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    svc.fit(X_train, y_train)
    adaboost.fit(X_train, y_train)
    catboost.fit(X_train, y_train)

    svc_proba = svc.predict_proba(X_val)[:, 1]
    ada_proba = adaboost.predict_proba(X_val)[:, 1]
    cat_proba = catboost.predict_proba(X_val)[:, 1]

    # Manual soft voting
    blended_proba = (
        weights[0] * svc_proba +
        weights[1] * ada_proba +
        weights[2] * cat_proba
    ) / sum(weights)

    y_pred = (blended_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# --- Final Metrics ---
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

model_name = 'ManualSoftVoting-Cat+Ada+SVC'
model_desc = 'ManualSoftVoting-Weights[1,2,2]-Thresh0.45'

print("\n--- Ensemble Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8690, Precision=0.8125, Recall=0.4483, F1=0.5778
Fold 2: Accuracy=0.8759, Precision=0.9231, Recall=0.4138, F1=0.5714
Fold 3: Accuracy=0.8552, Precision=0.6429, Recall=0.6207, F1=0.6316
Fold 4: Accuracy=0.8621, Precision=0.6923, Recall=0.6000, F1=0.6429
Fold 5: Accuracy=0.8750, Precision=0.7619, Recall=0.5517, F1=0.6400

--- Ensemble Model Summary ---
Name                          : ManualSoftVoting-Cat+Ada+SVC
Description                   : ManualSoftVoting-Weights[1,2,2]-Thresh0.45
Accuracy                      : 0.8674
Precision                     : 0.7665
Recall                        : 0.5269
F1 Score                      : 0.6127

CSV Row Format:
ManualSoftVoting-Cat+Ada+SVC,ManualSoftVoting-Weights[1,2,2]-Thresh0.45,0.8674,0.7665,0.5269,0.6127


In [11]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import StackingClassifier
from imblearn.ensemble import EasyEnsembleClassifier, BalancedBaggingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# --- Data ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# --- Preprocessing ---
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Base Estimators ---
eec = Pipeline([
    ('pre', preprocessor),
    ('model', EasyEnsembleClassifier(n_estimators=10, random_state=42))
])

bbc = Pipeline([
    ('pre', preprocessor),
    ('model', BalancedBaggingClassifier(
        estimator=LGBMClassifier(random_state=42),
        n_estimators=10,
        random_state=42
    ))
])

catboost = Pipeline([
    ('pre', preprocessor),
    ('model', CatBoostClassifier(
        iterations=250,
        learning_rate=0.04,
        depth=6,
        l2_leaf_reg=3,
        verbose=0,
        random_seed=42
    ))
])

# --- Meta Estimator ---
meta_model = RidgeClassifierCV()

# --- StackingClassifier ---
stacking_clf = StackingClassifier(
    estimators=[
        ('eec', eec),
        ('bbc', bbc),
        ('cat', catboost)
    ],
    final_estimator=meta_model,
    stack_method='predict_proba',
    passthrough=False,
    cv=5,
    n_jobs=-1
)

# --- Evaluation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []
threshold = 0.4  # Tune between 0.3 - 0.5

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    stacking_clf.fit(X_train, y_train)

    # RidgeClassifierCV doesn’t support predict_proba
    try:
        decision_scores = stacking_clf.final_estimator_.decision_function(
            stacking_clf.transform(X_val)
        )
    except Exception:
        # fallback if transform not available
        decision_scores = stacking_clf.decision_function(X_val)

    y_pred = (decision_scores >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Acc={acc:.4f}, Prec={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# --- Results Summary ---
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'Stacking-EEC+BBC-LGBM+CatBoost'
model_desc = f'Stack[RidgeCV]-Thresh{threshold}'

print("\n--- Final Model Metrics ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Acc=0.8483, Prec=0.8889, Recall=0.2759, F1=0.4211
Fold 2: Acc=0.8690, Prec=1.0000, Recall=0.3448, F1=0.5128
Fold 3: Acc=0.8621, Prec=0.9091, Recall=0.3448, F1=0.5000
Fold 4: Acc=0.8552, Prec=0.7368, Recall=0.4667, F1=0.5714
Fold 5: Acc=0.8611, Prec=0.8462, Recall=0.3793, F1=0.5238

--- Final Model Metrics ---
Name                          : Stacking-EEC+BBC-LGBM+CatBoost
Description                   : Stack[RidgeCV]-Thresh0.4
Accuracy                      : 0.8591
Precision                     : 0.8762
Recall                        : 0.3623
F1 Score                      : 0.5058
