In [2]:
import pandas as pd
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Ensure the path to the DEModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/de_model"))
from de_handler import DEModelHandler  

# Ensure the path to the FSDModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/fsd_model"))
from fsd_handler import FSDModelHandler  

# Ensure the path to the Math3ModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/math3_model"))
from math3_handler import Math3ModelHandler  

# Ensure the path to the PythonModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/python_model"))
from python_handler import PythonModelHandler  

df = pd.read_csv("../dataset/train_dataset.csv")

# Drop the irrelevant, data leak columns
df_clean = df.drop(
    columns=[
        "Student ID",
        "Mentor-1",
        "Mentor-2",
        "Mentor-3",
        "Roll-2",
        "Roll-3",
        "Math-3 Theory",
        "DE Practical",
        "FSD Theory",
        "FSD Practical",
        "Python Theory",
        "Python Practical",
        "Communication Theory",
        "Law Theory",
    ]
)

# columns for Semester 1 core subjects
sem1_columns = [
    "Math-1 Theory",
    "Physics Theory",
    "Java-1 Theory",
    "Software Engineering Theory",
]

# Calculate Semester 1 Percentage as the average of core subject scores
# scores are numerical and out of 100
df_clean["Sem 1 Percentage"] = df_clean[sem1_columns].mean(axis=1).round(2)

# columns for Semester 2 core subjects
sem2_columns = [
    "Math-2 Theory",
    "Data Structures using Java Theory",
    "DBMS Theory",
    "Fundamental of Electronics and Electrical Theory",
    "Java-2 Theory",
]

# Calculate Semester 2 Percentage as the average of core subject scores
# scores are numerical and out of 100
df_clean["Sem 2 Percentage"] = df_clean[sem2_columns].mean(axis=1).round(2)

# Rename columns Div-1, Div-2, Div-3 to Section-1, Section-2, Section-3
df_clean = df_clean.rename(
    columns={"Div-1": "Section-1", "Div-2": "Section-2", "Div-3": "Section-3"}
)

# Transform values in Section-1, Section-2, Section-3 to keep only the first character
# Thus we get Only Department
for section in ["Section-1", "Section-2", "Section-3"]:
    df_clean[section] = df_clean[section].str[0]

# adding DE predicted column
preprocessor = DEModelHandler()
fe_de = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/de_model/de_model.joblib",
    return_type="df"
)

# Add the predicted DE Theory marks to df_clean
df_clean["Predicted DE Theory"] = fe_de["Predicted DE Theory"]


# adding FSD predicted column
preprocessor = FSDModelHandler()
fe_fsd = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/fsd_model/fsd_model.joblib",
    return_type="df"
)

# Add the predicted FSD Theory marks to df_clean
df_clean["Predicted FSD Theory"] = fe_fsd["Predicted FSD Theory"]


# adding Math3 predicted column
preprocessor = Math3ModelHandler()
fe_math3 = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/math3_model/math3_model.joblib",
    return_type="df"
)

# Add the predicted Math3 Theory marks to df_clean
df_clean["Predicted Math-3 Theory"] = fe_math3["Predicted Math-3 Theory"]


# adding Python predicted column
preprocessor = PythonModelHandler()
fe_python = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/python_model/python_model.joblib",
    return_type="df"
)

# Add the predicted Python Theory marks to df_clean
df_clean["Predicted Python Theory"] = fe_python["Predicted Python Theory"]

#  Calculate predicted Semester 3 percentage (mean of 4 predicted subject marks)
sem3_subjects = [
    "Predicted Math-3 Theory",
    "Predicted DE Theory",
    "Predicted FSD Theory",
    "Predicted Python Theory",
]

df_clean["Predicted Sem 3 Percentage"] = df_clean[sem3_subjects].mean(axis=1).round(2)

df_clean["Sem 1 Percentile"] = df_clean["Sem 1 Percentage"].rank(pct=True) * 100
df_clean["Sem 2 Percentile"] = df_clean["Sem 2 Percentage"].rank(pct=True) * 100
df_clean["Predicted Sem 3 Percentile"] = df_clean["Predicted Sem 3 Percentage"].rank(pct=True) * 100

# Round for consistency
df_clean[["Sem 1 Percentile", "Sem 2 Percentile", "Predicted Sem 3 Percentile"]] = df_clean[
    ["Sem 1 Percentile", "Sem 2 Percentile", "Predicted Sem 3 Percentile"]
].round(2)

df_clean["Predicted Percentile Drop"] = (
    df_clean["Sem 2 Percentile"] - df_clean["Predicted Sem 3 Percentile"]
).round(2)

df_clean["Predicted Risk Flag"] = df_clean["Predicted Percentile Drop"] > 10

# Columns for Semester 3 core theory subjects
sem3_columns = [
    "Math-3 Theory",
    "DE Theory",
    "FSD Theory",
    "Python Theory",
]

# Calculate Semester 3 Total as the sum of core subject scores
df["Sem 3 Percentage"] = (df[sem3_columns].sum(axis=1) / 4).round(2)

df_clean["Sem 3 Percentile"] = df["Sem 3 Percentage"].rank(pct=True) * 100

df_clean["Percentile Drop"] = (
    df_clean["Sem 2 Percentile"] - df_clean["Sem 3 Percentile"]
).round(2)

df_clean["Risk Flag"] = df_clean["Percentile Drop"] > 10

columns_to_drop = [
    "Sem 3 Percentile",
    "Percentile Drop"
]

df_clean.drop(columns=columns_to_drop, inplace=True)

# After all operations on df_clean are complete, drop other DataFrames
df = None
fe_de = None
fe_fsd = None
fe_math3 = None
fe_python = None

print(df_clean.head())

  Gender Religion Branch Section-1 Section-2 Section-3  Roll-1  Math-1 Theory  \
0      M    Hindu     CE         D         D         A     350             47   
1      F    Hindu    CST         B         B         D      18             84   
2      F    Hindu   AIML         A         A         C      23             74   
3      M    Hindu    CST         B         B         D     212             55   
4      M    Hindu    CST         B         B         D     208             38   

   Physics Theory  Physics Practical  ...  Predicted FSD Theory  \
0              48                 75  ...             72.266535   
1              83                 81  ...             87.523458   
2              85                 86  ...             89.409752   
3              69                 82  ...             79.807055   
4              59                 74  ...             56.474296   

   Predicted Math-3 Theory  Predicted Python Theory  \
0                56.352210                71.642156   


# Dummy

In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

# Target and features
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# DummyClassifier – always predicts the most frequent class
dummy = DummyClassifier(strategy='most_frequent')

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dummy.fit(X_train, y_train)
    y_pred = dummy.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'DummyClassifier-MostFreq'
model_desc = 'Baseline-MostFrequent-5Fold'

# Print formatted summary
print("\n--- Baseline Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV (append row, create file if not exists)
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to file (header only if file doesn't exist)
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 2: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 3: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 4: Accuracy=0.7931, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 5: Accuracy=0.7986, Precision=0.0000, Recall=0.0000, F1=0.0000

--- Baseline Model Summary ---
Name                          : DummyClassifier-MostFreq
Description                   : Baseline-MostFrequent-5Fold
Accuracy                      : 0.7983
Precision                     : 0.0000
Recall                        : 0.0000
F1 Score                      : 0.0000

CSV Row Format:
DummyClassifier-MostFreq,Baseline-MostFrequent-5Fold,0.7983,0.0000,0.0000,0.0000


# Logistic Regression

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)  # convert bool to 0/1

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Full pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LogisticRegression(class_weight='balanced', max_iter=1000))
])

# Step 5: 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Step 6: Loop through CV
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'LogisticRegression-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Print: CSV-style with labels and formatting
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])


Fold 1: Accuracy=0.8207, Precision=0.5385, Recall=0.7241, F1=0.6176
Fold 2: Accuracy=0.8552, Precision=0.6176, Recall=0.7241, F1=0.6667
Fold 3: Accuracy=0.7931, Precision=0.4889, Recall=0.7586, F1=0.5946
Fold 4: Accuracy=0.8069, Precision=0.5217, Recall=0.8000, F1=0.6316
Fold 5: Accuracy=0.8125, Precision=0.5250, Recall=0.7241, F1=0.6087

--- Average Metrics Summary ---
Name                          : LogisticRegression-Balanced
Description                   : OneHot+Scaler+5Fold-Stratified
Accuracy                      : 0.8177
Precision                     : 0.5383
Recall                        : 0.7462
F1 Score                      : 0.6238

CSV Row Format:
LogisticRegression-Balanced,OneHot+Scaler+5Fold-Stratified,0.8177,0.5383,0.7462,0.6238


# DecisionTreeClassifier

In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'DecisionTreeClassifier-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- DecisionTreeClassifier Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8000, Precision=0.5000,  Recall=0.4138, F1=0.4528
Fold 2: Accuracy=0.8414, Precision=0.7143,  Recall=0.3448, F1=0.4651
Fold 3: Accuracy=0.7517, Precision=0.4103,  Recall=0.5517, F1=0.4706
Fold 4: Accuracy=0.7862, Precision=0.4848,  Recall=0.5333, F1=0.5079
Fold 5: Accuracy=0.8403, Precision=0.6250,  Recall=0.5172, F1=0.5660

--- DecisionTreeClassifier Summary ---
Mean Accuracy : 0.8039
Mean Precision: 0.5469
Mean Recall   : 0.4722
Mean F1 Score : 0.4925

CSV Row Format:
DecisionTreeClassifier-Balanced,OneHot+Scaler+5Fold-Stratified,0.8039,0.5469,0.4722,0.4925


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Recall-tuned DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    ))
])

# Step 5: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 6: Custom threshold
threshold = 0.35

# Step 7: Cross-validation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)

    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 8: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 9: Model info
model_name = 'DecisionTree-RecallTuned'
model_desc = f'Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold'

# Console output
print("\n--- DecisionTree_Recall_Tuned Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.6828, Precision=0.3455,  Recall=0.6552, F1=0.4524
Fold 2: Accuracy=0.8138, Precision=0.5357,  Recall=0.5172, F1=0.5263
Fold 3: Accuracy=0.7034, Precision=0.3750,  Recall=0.7241, F1=0.4941
Fold 4: Accuracy=0.7517, Precision=0.4348,  Recall=0.6667, F1=0.5263
Fold 5: Accuracy=0.7222, Precision=0.3878,  Recall=0.6552, F1=0.4872

--- DecisionTree_Recall_Tuned Summary ---
Mean Accuracy : 0.7348
Mean Precision: 0.4157
Mean Recall   : 0.6437
Mean F1 Score : 0.4973

CSV Row Format:
DecisionTree-RecallTuned,Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold,0.7348,0.4157,0.6437,0.4973


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Flexible (deep) DecisionTreeClassifier
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=None,             # no limit
        min_samples_split=2,        # fine splits
        min_samples_leaf=1,         # small leaves allowed
        random_state=42
    ))
])

# Step 4: Cross-validation config
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 5: Threshold
threshold = 0.25  # aggressive threshold to maximize recall

# Step 6: CV loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Aggregate results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 8: Metadata
model_name = 'DecisionTree-MaxRecall'
model_desc = 'Depth=None|Split=2|Leaf=1|Thresh=0.25|5Fold'

print("\n--- DecisionTree_MaxRecall Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 9: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8000, Precision=0.5000,  Recall=0.4138, F1=0.4528
Fold 2: Accuracy=0.8414, Precision=0.7143,  Recall=0.3448, F1=0.4651
Fold 3: Accuracy=0.7517, Precision=0.4103,  Recall=0.5517, F1=0.4706
Fold 4: Accuracy=0.7862, Precision=0.4848,  Recall=0.5333, F1=0.5079
Fold 5: Accuracy=0.8403, Precision=0.6250,  Recall=0.5172, F1=0.5660

--- DecisionTree_MaxRecall Summary ---
Mean Accuracy : 0.8039
Mean Precision: 0.5469
Mean Recall   : 0.4722
Mean F1 Score : 0.4925

CSV Row Format:
DecisionTree-MaxRecall,Depth=None|Split=2|Leaf=1|Thresh=0.25|5Fold,0.8039,0.5469,0.4722,0.4925


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SMOTE setup
smote = SMOTE(random_state=42)

# Step 5: Recall-tuned DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    ))
])

# Step 6: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 7: Custom threshold
threshold = 0.35

# Step 8: Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Apply SMOTE to training data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed, y_train)

    # Fit the model on SMOTE-resampled data
    pipeline.named_steps['model'].fit(X_train_smote, y_train_smote)

    # Transform validation data
    X_val_preprocessed = preprocessor.transform(X_val)
    y_proba = pipeline.named_steps['model'].predict_proba(X_val_preprocessed)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 10: Model info
model_name = 'DecisionTree-SMOTE'
model_desc = f'Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold|SMOTE'

# Console output
print("\n--- DecisionTree_SMOTE Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 11: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7586, Precision=0.4400, Recall=0.7586, F1=0.5570
Fold 2: Accuracy=0.7517, Precision=0.4255, Recall=0.6897, F1=0.5263
Fold 3: Accuracy=0.7241, Precision=0.3922, Recall=0.6897, F1=0.5000
Fold 4: Accuracy=0.8069, Precision=0.5200, Recall=0.8667, F1=0.6500
Fold 5: Accuracy=0.7500, Precision=0.4340, Recall=0.7931, F1=0.5610

--- DecisionTree_SMOTE Summary ---
Mean Accuracy : 0.7583
Mean Precision: 0.4423
Mean Recall   : 0.7595
Mean F1 Score : 0.5589

CSV Row Format:
DecisionTree-SMOTE,Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold|SMOTE,0.7583,0.4423,0.7595,0.5589


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SMOTE setup with adjusted sampling strategy
smote = SMOTE(sampling_strategy=0.8, random_state=42)

# Step 5: Recall-optimized DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=3,
        random_state=42
    ))
])

# Step 6: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 7: Lowered threshold for higher recall
threshold = 0.25

# Step 8: Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Apply SMOTE to training data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed, y_train)

    # Fit the model on SMOTE-resampled data
    pipeline.named_steps['model'].fit(X_train_smote, y_train_smote)

    # Transform validation data
    X_val_preprocessed = preprocessor.transform(X_val)
    y_proba = pipeline.named_steps['model'].predict_proba(X_val_preprocessed)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 10: Model info
model_name = 'DecisionTree-SMOTE-RecallOptimized'
model_desc = f'Thresh=0.25|Depth=6|Split=10|Leaf=3|5Fold|SMOTE_0.8'

# Console output
print("\n--- DecisionTree_SMOTE_RecallOptimized Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 11: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7931, Precision=0.4878, Recall=0.6897, F1=0.5714
Fold 2: Accuracy=0.7586, Precision=0.4167, Recall=0.5172, F1=0.4615
Fold 3: Accuracy=0.7103, Precision=0.3818, Recall=0.7241, F1=0.5000
Fold 4: Accuracy=0.7241, Precision=0.4107, Recall=0.7667, F1=0.5349
Fold 5: Accuracy=0.8125, Precision=0.5238, Recall=0.7586, F1=0.6197

--- DecisionTree_SMOTE_RecallOptimized Summary ---
Mean Accuracy : 0.7597
Mean Precision: 0.4442
Mean Recall   : 0.6913
Mean F1 Score : 0.5375

CSV Row Format:
DecisionTree-SMOTE-RecallOptimized,Thresh=0.25|Depth=6|Split=10|Leaf=3|5Fold|SMOTE_0.8,0.7597,0.4442,0.6913,0.5375


# RandomForestClassifier 

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline with Random Forest
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'RandomForestClassifier-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- RandomForestClassifier Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8138, Precision=0.7500,  Recall=0.1034, F1=0.1818
Fold 2: Accuracy=0.8276, Precision=0.8333,  Recall=0.1724, F1=0.2857
Fold 3: Accuracy=0.8345, Precision=0.8571,  Recall=0.2069, F1=0.3333
Fold 4: Accuracy=0.8345, Precision=0.8000,  Recall=0.2667, F1=0.4000
Fold 5: Accuracy=0.8403, Precision=0.8750,  Recall=0.2414, F1=0.3784

--- RandomForestClassifier Summary ---
Mean Accuracy : 0.8301
Mean Precision: 0.8231
Mean Recall   : 0.1982
Mean F1 Score : 0.3158

CSV Row Format:
RandomForestClassifier-Balanced,OneHot+Scaler+5Fold-Stratified,0.8301,0.8231,0.1982,0.3158


In [12]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)  # convert bool to 0/1

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Full pipeline with SMOTE + RandomForest
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight='balanced',
        random_state=42
    ))
])

# Step 5: 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

threshold = 0.3  # Custom threshold to maximize recall

# Step 6: Loop through CV
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]  # Get probability for class 1

    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'RandomForest-SMOTE-Threshold0.3'
model_desc = 'OneHot+Scaler+SMOTE+RF+Threshold=0.3'

# Print: CSV-style with labels and formatting
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])


Fold 1: Accuracy=0.7586, Precision=0.4348, Recall=0.6897, F1=0.5333
Fold 2: Accuracy=0.7724, Precision=0.4545, Recall=0.6897, F1=0.5479
Fold 3: Accuracy=0.6690, Precision=0.3582, Recall=0.8276, F1=0.5000
Fold 4: Accuracy=0.6552, Precision=0.3649, Recall=0.9000, F1=0.5192
Fold 5: Accuracy=0.6875, Precision=0.3621, Recall=0.7241, F1=0.4828

--- Average Metrics Summary ---
Name                          : RandomForest-SMOTE-Threshold0.3
Description                   : OneHot+Scaler+SMOTE+RF+Threshold=0.3
Accuracy                      : 0.7085
Precision                     : 0.3949
Recall                        : 0.7662
F1 Score                      : 0.5167

CSV Row Format:
RandomForest-SMOTE-Threshold0.3,OneHot+Scaler+SMOTE+RF+Threshold=0.3,0.7085,0.3949,0.7662,0.5167


# XGBoost

In [None]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: XGBoost Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        scale_pos_weight=np.sum(y == 0) / np.sum(y == 1),  # Handles class imbalance
        use_label_encoder=False,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'XGBoost-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8414, Precision=0.6500,  Recall=0.4483, F1=0.5306
Fold 2: Accuracy=0.8759, Precision=0.7619,  Recall=0.5517, F1=0.6400
Fold 3: Accuracy=0.8138, Precision=0.5278,  Recall=0.6552, F1=0.5846
Fold 4: Accuracy=0.8414, Precision=0.6061,  Recall=0.6667, F1=0.6349
Fold 5: Accuracy=0.8958, Precision=0.7917,  Recall=0.6552, F1=0.7170

--- XGBoost Summary ---
Mean Accuracy : 0.8536
Mean Precision: 0.6675
Mean Recall   : 0.5954
Mean F1 Score : 0.6214

CSV Row Format:
XGBoost-Balanced,OneHot+Scaler+5Fold-Stratified,0.8536,0.6675,0.5954,0.6214


In [15]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline with XGBoost + SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=4,  # 80:20 class balance
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    ))
])

# Step 5: Stratified 5-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 6: Metrics storage
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Threshold for classification
threshold = 0.25

# Step 7: CV loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 8: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'XGBoost-SMOTE-Threshold0.25'
model_desc = 'OneHot+Scaler+SMOTE+XGB+Threshold=0.25'

# Step 9: Print metrics
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])


Fold 1: Accuracy=0.8690, Precision=0.6250, Recall=0.8621, F1=0.7246
Fold 2: Accuracy=0.8621, Precision=0.6452, Recall=0.6897, F1=0.6667
Fold 3: Accuracy=0.7586, Precision=0.4400, Recall=0.7586, F1=0.5570
Fold 4: Accuracy=0.8345, Precision=0.5750, Recall=0.7667, F1=0.6571
Fold 5: Accuracy=0.8125, Precision=0.5238, Recall=0.7586, F1=0.6197

--- Average Metrics Summary ---
Name                          : XGBoost-SMOTE-Threshold0.25
Description                   : OneHot+Scaler+SMOTE+XGB+Threshold=0.25
Accuracy                      : 0.8273
Precision                     : 0.5618
Recall                        : 0.7671
F1 Score                      : 0.6450

CSV Row Format:
XGBoost-SMOTE-Threshold0.25,OneHot+Scaler+SMOTE+XGB+Threshold=0.25,0.8273,0.5618,0.7671,0.6450


In [None]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Pipeline with SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        random_state=42
    ))
])

# Step 4: Hyperparameter Optimization
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [4, 6, 8],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X, y)

# Step 5: Dynamic Threshold Tuning
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
thresholds = np.linspace(0.1, 0.9, 9)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    # Dynamic threshold tuning for recall
    recalls = [recall_score(y_val, (y_proba >= t).astype(int)) for t in thresholds]
    best_threshold = thresholds[np.argmax(recalls)]

    y_pred = (y_proba >= best_threshold).astype(int)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Best Threshold={best_threshold:.2f}")

# Step 6: Aggregate Results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Summary
model_name = 'XGBoost-SMOTE-FineTuned'
model_desc = f'OptimizedParams|DynamicThresh={best_threshold:.2f}|OneHot+Scaler'

print("\n--- Fine-Tuned XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.01, 'model__max_depth': 6, 'model__min_child_weight': 3, 'model__n_estimators': 300, 'model__subsample': 0.8}
Fold 1: Accuracy=0.6207, Precision=0.3415, Recall=0.9655, F1=0.5045, Best Threshold=0.10
Fold 2: Accuracy=0.6138, Precision=0.3333, Recall=0.9310, F1=0.4909, Best Threshold=0.10
Fold 3: Accuracy=0.5586, Precision=0.3034, Recall=0.9310, F1=0.4576, Best Threshold=0.10
Fold 4: Accuracy=0.5448, Precision=0.3125, Recall=1.0000, F1=0.4762, Best Threshold=0.10
Fold 5: Accuracy=0.5000, Precision=0.2828, Recall=0.9655, F1=0.4375, Best Threshold=0.10

--- Fine-Tuned XGBoost Summary ---
Mean Accuracy : 0.5676
Mean Precision: 0.3147
Mean Recall   : 0.9586
Mean F1 Score : 0.4733


In [None]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Pipeline with SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        random_state=42
    ))
])

# Step 4: Hyperparameter Optimization
param_grid = {
    'model__n_estimators': [200, 300, 400],
    'model__max_depth': [4, 6, 8],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
    'model__scale_pos_weight': [4, 5, 6]  # Adjusted for imbalance
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='balanced_accuracy',  # Balances true positive/negative rates
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X, y)

# Step 5: Dynamic Threshold Tuning
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
thresholds = np.linspace(0.1, 0.9, 9)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    # Dynamic threshold tuning for F1
    f1_scores = [f1_score(y_val, (y_proba >= t).astype(int)) for t in thresholds]
    best_threshold = thresholds[np.argmax(f1_scores)]

    y_pred = (y_proba >= best_threshold).astype(int)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Best Threshold={best_threshold:.2f}")

# Step 6: Aggregate Results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Summary
model_name = 'XGBoost-SMOTE-FineTuned-Balanced'
model_desc = f'OptimizedParams|DynamicThresh={best_threshold:.2f}|OneHot+Scaler'

print("\n--- Fine-Tuned XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.05, 'model__max_depth': 8, 'model__min_child_weight': 5, 'model__n_estimators': 300, 'model__scale_pos_weight': 4, 'model__subsample': 0.8}
Fold 1: Accuracy=0.9241, Precision=0.8462, Recall=0.7586, F1=0.8000, Best Threshold=0.60
Fold 2: Accuracy=0.8759, Precision=0.6897, Recall=0.6897, F1=0.6897, Best Threshold=0.40
Fold 3: Accuracy=0.8207, Precision=0.5405, Recall=0.6897, F1=0.6061, Best Threshold=0.60
Fold 4: Accuracy=0.8552, Precision=0.6452, Recall=0.6667, F1=0.6557, Best Threshold=0.70
Fold 5: Accuracy=0.8681, Precision=0.6786, Recall=0.6552, F1=0.6667, Best Threshold=0.70

--- Fine-Tuned XGBoost Summary ---
Mean Accuracy : 0.8688
Mean Precision: 0.6800
Mean Recall   : 0.6920
Mean F1 Score : 0.6836


# LightGBM

In [12]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: LightGBM Pipeline (optimized to suppress warnings)
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(
        objective='binary',
        class_weight='balanced',
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        min_split_gain=0.01,
        min_child_samples=20,
        min_data_in_leaf=20,
        subsample=0.8,
        colsample_bytree=0.8,
        verbose=-1,              # suppress LightGBM internal logs
        random_state=42
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'LightGBM-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified+VerboseOff'

# Console summary
print("\n--- LightGBM Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8828, Precision=0.8000,  Recall=0.5517, F1=0.6531
Fold 2: Accuracy=0.8690, Precision=0.7273,  Recall=0.5517, F1=0.6275
Fold 3: Accuracy=0.8138, Precision=0.5294,  Recall=0.6207, F1=0.5714
Fold 4: Accuracy=0.8483, Precision=0.6333,  Recall=0.6333, F1=0.6333
Fold 5: Accuracy=0.8681, Precision=0.6786,  Recall=0.6552, F1=0.6667

--- LightGBM Summary ---
Mean Accuracy : 0.8564
Mean Precision: 0.6737
Mean Recall   : 0.6025
Mean F1 Score : 0.6304

CSV Row Format:
LightGBM-Balanced,OneHot+Scaler+5Fold-Stratified+VerboseOff,0.8564,0.6737,0.6025,0.6304


In [13]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Integer, Real

# Load your real df_clean before this step
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(objective='binary', class_weight='balanced', verbose=-1, random_state=42))
])

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

param_space = {
    'model__n_estimators': Integer(100, 500),
    'model__max_depth': Integer(3, 12),
    'model__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'model__min_child_samples': Integer(10, 100),
    'model__min_split_gain': Real(0.0, 0.2),
    'model__subsample': Real(0.6, 1.0),
    'model__colsample_bytree': Real(0.6, 1.0)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring=scoring,
    refit='recall',
    n_iter=40,
    cv=cv,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

opt.fit(X, y)

best_model = opt.best_estimator_
cv_results = cross_validate(best_model, X, y, cv=cv, scoring=scoring)

print("Final Tuned LightGBM Model Scores:")
print(f"Accuracy : {np.mean(cv_results['test_accuracy']):.4f}")
print(f"Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"Recall   : {np.mean(cv_results['test_recall']):.4f}")
print(f"F1 Score : {np.mean(cv_results['test_f1']):.4f}")
print("Best Parameters:", opt.best_params_)


Final Tuned LightGBM Model Scores:
Accuracy : 0.7459
Precision: 0.4328
Recall   : 0.8283
F1 Score : 0.5676
Best Parameters: OrderedDict({'model__colsample_bytree': 1.0, 'model__learning_rate': 0.01, 'model__max_depth': 12, 'model__min_child_samples': 100, 'model__min_split_gain': 0.2, 'model__n_estimators': 100, 'model__subsample': 0.8664183933116096})


In [16]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    precision_recall_curve
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: LightGBM Pipeline with best tuned params
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(
        objective='binary',
        class_weight='balanced',
        learning_rate=0.01,
        max_depth=12,
        min_child_samples=100,
        min_split_gain=0.2,
        n_estimators=100,
        subsample=0.8664183933116096,
        colsample_bytree=1.0,
        verbose=-1,
        random_state=42
    ))
])

# Step 5: Cross-val predicted probabilities
# Step 5 (Modified): Threshold tuning with all constraints
precisions, recalls, thresholds = precision_recall_curve(y_val, y_prob)
best_thresh, best_f1 = 0.5, 0
final_pred = (y_prob >= 0.5).astype(int)

for p, r, t in zip(precisions, recalls, thresholds):
    pred = (y_prob >= t).astype(int)
    a = accuracy_score(y_val, pred)
    f1 = f1_score(y_val, pred)
    if r >= 0.85 and p > 0.50 and f1 > 0.60 and a > 0.70 and f1 > best_f1:
        best_thresh = t
        final_pred = pred
        best_f1 = f1

# Final metrics
acc = accuracy_score(y_val, final_pred)
prec = precision_score(y_val, final_pred, zero_division=0)
rec = recall_score(y_val, final_pred)
f1 = f1_score(y_val, final_pred)

accuracy_list.append(acc)
precision_list.append(prec)
recall_list.append(rec)
f1_list.append(f1)

print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Threshold={best_thresh:.4f}")


# Step 6: Find optimal threshold
precisions, recalls, thresholds = precision_recall_curve(y, y_probs)
valid = [(p, r, t) for p, r, t in zip(precisions, recalls, thresholds) if r >= 0.85 and p > 0.50]

if valid:
    best_prec, best_rec, best_thresh = max(valid, key=lambda x: 2*x[0]*x[1]/(x[0]+x[1]))
else:
    best_thresh = 0.5  # fallback
    best_prec = precision_score(y, y_probs >= best_thresh, zero_division=0)
    best_rec = recall_score(y, y_probs >= best_thresh)
    best_f1 = f1_score(y, y_probs >= best_thresh)
    print("No threshold met all conditions. Using default 0.5.")

# Step 7: Final metrics at optimal threshold
y_pred_final = (y_probs >= best_thresh).astype(int)
final_acc = accuracy_score(y, y_pred_final)
final_prec = precision_score(y, y_pred_final, zero_division=0)
final_rec = recall_score(y, y_pred_final)
final_f1 = f1_score(y, y_pred_final)

# Step 8: Print results
print("\n--- Threshold-Tuned LightGBM Results ---")
print(f"Threshold   : {best_thresh:.4f}")
print(f"Accuracy    : {final_acc:.4f}")
print(f"Precision   : {final_prec:.4f}")
print(f"Recall      : {final_rec:.4f}")
print(f"F1 Score    : {final_f1:.4f}")

# Step 9: Save to CSV
model_name = 'LightGBM-Tuned-Threshold'
model_desc = 'BayesOpt+Threshold@{:.4f}'.format(best_thresh)

csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(final_acc, 4),
    'Precision': round(final_prec, 4),
    'Recall': round(final_rec, 4),
    'F1 Score': round(final_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 5: Accuracy=0.8819, Precision=0.7308, Recall=0.6552, F1=0.6909, Threshold=0.5000
No threshold met all conditions. Using default 0.5.

--- Threshold-Tuned LightGBM Results ---
Threshold   : 0.5000
Accuracy    : 0.7459
Precision   : 0.4321
Recall      : 0.8288
F1 Score    : 0.5681


In [14]:
import os
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: LightGBM Pipeline with tuned parameters
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(
        objective='binary',
        class_weight='balanced',
        n_estimators=200,           # Increased to allow more learning
        max_depth=8,                # Slightly deeper trees
        learning_rate=0.05,         # Lower for better convergence
        min_split_gain=0.01,
        min_child_samples=10,       # Lowered to capture smaller patterns
        min_data_in_leaf=10,        # Lowered to reduce overfitting
        subsample=0.8,
        colsample_bytree=0.7,      # Slightly reduced to increase diversity
        scale_pos_weight=3,         # Increase to prioritize positive class (tune based on imbalance)
        verbose=-1,
        random_state=42
    ))
])

# Step 5: Cross-validation with threshold tuning
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Get probability scores for threshold tuning
    y_prob = pipeline.predict_proba(X_val)[:, 1]

    # Find optimal threshold for recall >= 0.85
    precisions, recalls, thresholds = precision_recall_curve(y_val, y_prob)
    threshold = thresholds[np.argmax(recalls >= 0.85)] if np.any(recalls >= 0.85) else 0.5

    # Apply threshold to predictions
    y_pred = (y_prob >= threshold).astype(int)

    # Calculate metrics
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Threshold={threshold:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'LightGBM-Tuned-HighRecall'
model_desc = 'OneHot+Scaler+5Fold-Stratified+ThresholdTuned+VerboseOff'

# Console summary
print("\n--- LightGBM Tuned Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Threshold=0.0000
Fold 2: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Threshold=0.0000
Fold 3: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333, Threshold=0.0000
Fold 4: Accuracy=0.2069, Precision=0.2069, Recall=1.0000, F1=0.3429, Threshold=0.0001
Fold 5: Accuracy=0.2014, Precision=0.2014, Recall=1.0000, F1=0.3353, Threshold=0.0000

--- LightGBM Tuned Summary ---
Mean Accuracy : 0.2017
Mean Precision: 0.2017
Mean Recall   : 1.0000
Mean F1 Score : 0.3356

CSV Row Format:
LightGBM-Tuned-HighRecall,OneHot+Scaler+5Fold-Stratified+ThresholdTuned+VerboseOff,0.2017,0.2017,1.0000,0.3356


In [18]:
import os
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Classifier wrapper for threshold tuning
class ThresholdLGBMClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, **params):
        self.model = LGBMClassifier(**params)
        self.threshold = 0.5

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        probas = self.model.predict_proba(X)[:, 1]
        return (probas >= self.threshold).astype(int)

    def predict_proba(self, X):
        return self.model.predict_proba(X)

# Step 5: Objective function for Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 80),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "class_weight": "balanced",
        "random_state": 42,
        "verbose": -1
    }

    model = ThresholdLGBMClassifier(**params)
    pipeline = Pipeline([('prep', preprocessor), ('clf', model)])
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    recalls = []

    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline.fit(X_train, y_train)
        probas = pipeline.predict_proba(X_val)[:, 1]

        # Find best threshold to maximize recall >= 0.85
        best_recall, best_thresh = 0, 0.5
        for thresh in np.arange(0.3, 0.8, 0.02):
            preds = (probas >= thresh).astype(int)
            rec = recall_score(y_val, preds)
            if rec > best_recall:
                best_recall, best_thresh = rec, thresh

        model.threshold = best_thresh
        preds = (probas >= best_thresh).astype(int)

        recalls.append(recall_score(y_val, preds))

    return np.mean(recalls)

# Step 6: Tune with Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)
best_params = study.best_trial.params

# Step 7: Final Evaluation
model = ThresholdLGBMClassifier(**best_params)
pipeline = Pipeline([('prep', preprocessor), ('clf', model)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    probas = pipeline.predict_proba(X_val)[:, 1]

    # Best threshold for this fold
    best_thresh, best_f1 = 0.5, 0
    for thresh in np.arange(0.3, 0.8, 0.01):
        preds = (probas >= thresh).astype(int)
        rec = recall_score(y_val, preds)
        prec = precision_score(y_val, preds, zero_division=0)
        f1_val = f1_score(y_val, preds)
        if rec >= 0.85 and prec > 0.5 and f1_val > best_f1:
            best_f1, best_thresh = f1_val, thresh

    model.threshold = best_thresh
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Threshold={best_thresh:.2f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

print("\n--- Final LightGBM Optimized ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# CSV logging
model_name = 'LightGBM-Optuna-Threshold'
model_desc = 'Optuna+ThresholdTuning+5Fold'
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


[I 2025-06-30 05:09:00,066] A new study created in memory with name: no-name-c48d41ea-eb14-4108-ae75-11e17b819aac
Best trial: 0. Best value: 0.629195:   2%|▏         | 1/50 [00:02<02:19,  2.85s/it]

[I 2025-06-30 05:09:02,917] Trial 0 finished with value: 0.6291954022988506 and parameters: {'n_estimators': 128, 'learning_rate': 0.03776545153747731, 'max_depth': 12, 'num_leaves': 56, 'min_child_samples': 10, 'subsample': 0.6797151887864643, 'colsample_bytree': 0.9726666766437326}. Best is trial 0 with value: 0.6291954022988506.


Best trial: 1. Best value: 0.855632:   4%|▍         | 2/50 [00:04<01:38,  2.05s/it]

[I 2025-06-30 05:09:04,407] Trial 1 finished with value: 0.8556321839080461 and parameters: {'n_estimators': 220, 'learning_rate': 0.01515917719006464, 'max_depth': 8, 'num_leaves': 72, 'min_child_samples': 38, 'subsample': 0.8869344835853608, 'colsample_bytree': 0.6894328510243152}. Best is trial 1 with value: 0.8556321839080461.


Best trial: 1. Best value: 0.855632:   6%|▌         | 3/50 [00:06<01:37,  2.08s/it]

[I 2025-06-30 05:09:06,519] Trial 2 finished with value: 0.5956321839080461 and parameters: {'n_estimators': 258, 'learning_rate': 0.07828428468323015, 'max_depth': 9, 'num_leaves': 53, 'min_child_samples': 18, 'subsample': 0.66243541441678, 'colsample_bytree': 0.7890728599838094}. Best is trial 1 with value: 0.8556321839080461.


Best trial: 1. Best value: 0.855632:   8%|▊         | 4/50 [00:08<01:33,  2.04s/it]

[I 2025-06-30 05:09:08,506] Trial 3 finished with value: 0.5744827586206895 and parameters: {'n_estimators': 153, 'learning_rate': 0.07961652575559582, 'max_depth': 12, 'num_leaves': 57, 'min_child_samples': 11, 'subsample': 0.6899152521677238, 'colsample_bytree': 0.7060759649880425}. Best is trial 1 with value: 0.8556321839080461.


Best trial: 1. Best value: 0.855632:  10%|█         | 5/50 [00:09<01:19,  1.77s/it]

[I 2025-06-30 05:09:09,785] Trial 4 finished with value: 0.6912643678160919 and parameters: {'n_estimators': 216, 'learning_rate': 0.035066117641882125, 'max_depth': 12, 'num_leaves': 45, 'min_child_samples': 34, 'subsample': 0.9581957080437951, 'colsample_bytree': 0.8581737811615229}. Best is trial 1 with value: 0.8556321839080461.


Best trial: 1. Best value: 0.855632:  12%|█▏        | 6/50 [00:10<01:10,  1.59s/it]

[I 2025-06-30 05:09:11,041] Trial 5 finished with value: 0.7117241379310345 and parameters: {'n_estimators': 288, 'learning_rate': 0.058908920513606064, 'max_depth': 3, 'num_leaves': 59, 'min_child_samples': 41, 'subsample': 0.8276717555398595, 'colsample_bytree': 0.6119590093341611}. Best is trial 1 with value: 0.8556321839080461.


Best trial: 1. Best value: 0.855632:  14%|█▍        | 7/50 [00:12<01:02,  1.44s/it]

[I 2025-06-30 05:09:12,174] Trial 6 finished with value: 0.8422988505747127 and parameters: {'n_estimators': 145, 'learning_rate': 0.020027728843667776, 'max_depth': 9, 'num_leaves': 38, 'min_child_samples': 32, 'subsample': 0.8161878452787625, 'colsample_bytree': 0.7647461096847645}. Best is trial 1 with value: 0.8556321839080461.


Best trial: 1. Best value: 0.855632:  16%|█▌        | 8/50 [00:12<00:52,  1.24s/it]

[I 2025-06-30 05:09:12,980] Trial 7 finished with value: 0.6503448275862068 and parameters: {'n_estimators': 187, 'learning_rate': 0.12409588708599914, 'max_depth': 3, 'num_leaves': 30, 'min_child_samples': 29, 'subsample': 0.9466140942009941, 'colsample_bytree': 0.6691447296779649}. Best is trial 1 with value: 0.8556321839080461.


Best trial: 1. Best value: 0.855632:  18%|█▊        | 9/50 [00:14<00:50,  1.23s/it]

[I 2025-06-30 05:09:14,176] Trial 8 finished with value: 0.6022988505747126 and parameters: {'n_estimators': 177, 'learning_rate': 0.12655146444971932, 'max_depth': 12, 'num_leaves': 75, 'min_child_samples': 31, 'subsample': 0.8463003927517372, 'colsample_bytree': 0.7327360768221782}. Best is trial 1 with value: 0.8556321839080461.


Best trial: 1. Best value: 0.855632:  20%|██        | 10/50 [00:15<00:50,  1.25s/it]

[I 2025-06-30 05:09:15,495] Trial 9 finished with value: 0.6367816091954023 and parameters: {'n_estimators': 268, 'learning_rate': 0.1454777266352102, 'max_depth': 8, 'num_leaves': 43, 'min_child_samples': 48, 'subsample': 0.9610068594373843, 'colsample_bytree': 0.9758773445151506}. Best is trial 1 with value: 0.8556321839080461.


Best trial: 10. Best value: 0.951724:  22%|██▏       | 11/50 [00:16<00:44,  1.14s/it]

[I 2025-06-30 05:09:16,367] Trial 10 finished with value: 0.9517241379310345 and parameters: {'n_estimators': 100, 'learning_rate': 0.010846831452193811, 'max_depth': 6, 'num_leaves': 75, 'min_child_samples': 50, 'subsample': 0.7498808268079575, 'colsample_bytree': 0.8889084088320774}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  24%|██▍       | 12/50 [00:17<00:43,  1.14s/it]

[I 2025-06-30 05:09:17,526] Trial 11 finished with value: 0.8898850574712645 and parameters: {'n_estimators': 225, 'learning_rate': 0.011287629080143627, 'max_depth': 6, 'num_leaves': 79, 'min_child_samples': 50, 'subsample': 0.7414237504989598, 'colsample_bytree': 0.8730079552686579}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  26%|██▌       | 13/50 [00:18<00:38,  1.05s/it]

[I 2025-06-30 05:09:18,372] Trial 12 finished with value: 0.9517241379310345 and parameters: {'n_estimators': 100, 'learning_rate': 0.010224727401565112, 'max_depth': 5, 'num_leaves': 79, 'min_child_samples': 50, 'subsample': 0.7344573466105763, 'colsample_bytree': 0.8815827007641857}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  28%|██▊       | 14/50 [00:19<00:36,  1.00s/it]

[I 2025-06-30 05:09:19,265] Trial 13 finished with value: 0.8282758620689655 and parameters: {'n_estimators': 103, 'learning_rate': 0.04704378358158536, 'max_depth': 5, 'num_leaves': 67, 'min_child_samples': 45, 'subsample': 0.6009461906519354, 'colsample_bytree': 0.8975010479548778}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  30%|███       | 15/50 [00:20<00:34,  1.02it/s]

[I 2025-06-30 05:09:20,192] Trial 14 finished with value: 0.7733333333333332 and parameters: {'n_estimators': 101, 'learning_rate': 0.059678619405680665, 'max_depth': 5, 'num_leaves': 68, 'min_child_samples': 44, 'subsample': 0.7562732295873451, 'colsample_bytree': 0.9195805392233071}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  32%|███▏      | 16/50 [00:21<00:33,  1.01it/s]

[I 2025-06-30 05:09:21,204] Trial 15 finished with value: 0.6639080459770115 and parameters: {'n_estimators': 123, 'learning_rate': 0.10015836216042864, 'max_depth': 6, 'num_leaves': 22, 'min_child_samples': 23, 'subsample': 0.7551142669459798, 'colsample_bytree': 0.8296311911973553}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  34%|███▍      | 17/50 [00:22<00:34,  1.04s/it]

[I 2025-06-30 05:09:22,349] Trial 16 finished with value: 0.8489655172413793 and parameters: {'n_estimators': 149, 'learning_rate': 0.029263119056792394, 'max_depth': 5, 'num_leaves': 80, 'min_child_samples': 37, 'subsample': 0.7187782045209624, 'colsample_bytree': 0.9323223059546191}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  36%|███▌      | 18/50 [00:23<00:32,  1.00s/it]

[I 2025-06-30 05:09:23,269] Trial 17 finished with value: 0.7255172413793103 and parameters: {'n_estimators': 168, 'learning_rate': 0.05579666979857561, 'max_depth': 7, 'num_leaves': 64, 'min_child_samples': 50, 'subsample': 0.6352440536060159, 'colsample_bytree': 0.8378563661451135}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  38%|███▊      | 19/50 [00:24<00:29,  1.05it/s]

[I 2025-06-30 05:09:24,114] Trial 18 finished with value: 0.7255172413793105 and parameters: {'n_estimators': 122, 'learning_rate': 0.10007784035383992, 'max_depth': 4, 'num_leaves': 72, 'min_child_samples': 44, 'subsample': 0.7806943849897323, 'colsample_bytree': 0.9943507334366697}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  40%|████      | 20/50 [00:25<00:29,  1.00it/s]

[I 2025-06-30 05:09:25,202] Trial 19 finished with value: 0.8282758620689655 and parameters: {'n_estimators': 103, 'learning_rate': 0.02405228790605149, 'max_depth': 10, 'num_leaves': 62, 'min_child_samples': 26, 'subsample': 0.8827458733608705, 'colsample_bytree': 0.9380399539121217}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  42%|████▏     | 21/50 [00:26<00:29,  1.01s/it]

[I 2025-06-30 05:09:26,254] Trial 20 finished with value: 0.7871264367816092 and parameters: {'n_estimators': 131, 'learning_rate': 0.0439603969531188, 'max_depth': 7, 'num_leaves': 80, 'min_child_samples': 41, 'subsample': 0.7195268080632596, 'colsample_bytree': 0.7972989192455759}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  44%|████▍     | 22/50 [00:27<00:29,  1.06s/it]

[I 2025-06-30 05:09:27,441] Trial 21 finished with value: 0.8967816091954024 and parameters: {'n_estimators': 226, 'learning_rate': 0.01037528849905573, 'max_depth': 6, 'num_leaves': 76, 'min_child_samples': 49, 'subsample': 0.7390810822435588, 'colsample_bytree': 0.8768983604715099}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  46%|████▌     | 23/50 [00:28<00:30,  1.13s/it]

[I 2025-06-30 05:09:28,716] Trial 22 finished with value: 0.8967816091954024 and parameters: {'n_estimators': 236, 'learning_rate': 0.011007248615682306, 'max_depth': 6, 'num_leaves': 74, 'min_child_samples': 47, 'subsample': 0.7869216700868891, 'colsample_bytree': 0.8865150673793363}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  48%|████▊     | 24/50 [00:29<00:28,  1.10s/it]

[I 2025-06-30 05:09:29,748] Trial 23 finished with value: 0.8147126436781609 and parameters: {'n_estimators': 199, 'learning_rate': 0.030906387944571075, 'max_depth': 4, 'num_leaves': 68, 'min_child_samples': 41, 'subsample': 0.7060381837782608, 'colsample_bytree': 0.8210247653181326}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  50%|█████     | 25/50 [00:30<00:27,  1.09s/it]

[I 2025-06-30 05:09:30,823] Trial 24 finished with value: 0.8285057471264368 and parameters: {'n_estimators': 245, 'learning_rate': 0.023045639398731263, 'max_depth': 4, 'num_leaves': 75, 'min_child_samples': 46, 'subsample': 0.6536449940697424, 'colsample_bytree': 0.9041894404710098}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  52%|█████▏    | 26/50 [00:32<00:28,  1.18s/it]

[I 2025-06-30 05:09:32,196] Trial 25 finished with value: 0.8556321839080461 and parameters: {'n_estimators': 297, 'learning_rate': 0.011474715808528919, 'max_depth': 6, 'num_leaves': 70, 'min_child_samples': 50, 'subsample': 0.7726291860917229, 'colsample_bytree': 0.8587917231507504}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  54%|█████▍    | 27/50 [00:33<00:26,  1.17s/it]

[I 2025-06-30 05:09:33,361] Trial 26 finished with value: 0.6641379310344828 and parameters: {'n_estimators': 197, 'learning_rate': 0.06820888918498764, 'max_depth': 7, 'num_leaves': 49, 'min_child_samples': 36, 'subsample': 0.7308240028346595, 'colsample_bytree': 0.9531499455692264}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  56%|█████▌    | 28/50 [00:34<00:24,  1.11s/it]

[I 2025-06-30 05:09:34,335] Trial 27 finished with value: 0.7457471264367815 and parameters: {'n_estimators': 164, 'learning_rate': 0.04620710487340524, 'max_depth': 5, 'num_leaves': 65, 'min_child_samples': 42, 'subsample': 0.8044646989020551, 'colsample_bytree': 0.7711526860115611}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  58%|█████▊    | 29/50 [00:35<00:21,  1.04s/it]

[I 2025-06-30 05:09:35,210] Trial 28 finished with value: 0.8967816091954024 and parameters: {'n_estimators': 117, 'learning_rate': 0.02432855997973176, 'max_depth': 4, 'num_leaves': 76, 'min_child_samples': 47, 'subsample': 0.8519064068973364, 'colsample_bytree': 0.8522363175598463}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  60%|██████    | 30/50 [00:36<00:20,  1.03s/it]

[I 2025-06-30 05:09:36,227] Trial 29 finished with value: 0.8078160919540232 and parameters: {'n_estimators': 135, 'learning_rate': 0.029869354427433298, 'max_depth': 6, 'num_leaves': 60, 'min_child_samples': 39, 'subsample': 0.6837340022116123, 'colsample_bytree': 0.962105385255174}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  62%|██████▏   | 31/50 [00:37<00:20,  1.10s/it]

[I 2025-06-30 05:09:37,466] Trial 30 finished with value: 0.684367816091954 and parameters: {'n_estimators': 114, 'learning_rate': 0.04115242987748787, 'max_depth': 8, 'num_leaves': 77, 'min_child_samples': 18, 'subsample': 0.6250207732260842, 'colsample_bytree': 0.8897642913381976}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  64%|██████▍   | 32/50 [00:38<00:20,  1.12s/it]

[I 2025-06-30 05:09:38,635] Trial 31 finished with value: 0.8967816091954024 and parameters: {'n_estimators': 239, 'learning_rate': 0.011106695601456587, 'max_depth': 6, 'num_leaves': 73, 'min_child_samples': 48, 'subsample': 0.7867866157603952, 'colsample_bytree': 0.8899075035349885}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  66%|██████▌   | 33/50 [00:39<00:19,  1.14s/it]

[I 2025-06-30 05:09:39,824] Trial 32 finished with value: 0.8489655172413793 and parameters: {'n_estimators': 211, 'learning_rate': 0.018862451704580917, 'max_depth': 7, 'num_leaves': 71, 'min_child_samples': 46, 'subsample': 0.7624874045342221, 'colsample_bytree': 0.9153053705024481}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  68%|██████▊   | 34/50 [00:40<00:17,  1.12s/it]

[I 2025-06-30 05:09:40,900] Trial 33 finished with value: 0.8556321839080461 and parameters: {'n_estimators': 233, 'learning_rate': 0.018220733426403637, 'max_depth': 5, 'num_leaves': 75, 'min_child_samples': 50, 'subsample': 0.7029017813552011, 'colsample_bytree': 0.8788076264736664}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  70%|███████   | 35/50 [00:42<00:17,  1.14s/it]

[I 2025-06-30 05:09:42,082] Trial 34 finished with value: 0.7050574712643678 and parameters: {'n_estimators': 250, 'learning_rate': 0.035149376106872646, 'max_depth': 6, 'num_leaves': 80, 'min_child_samples': 43, 'subsample': 0.7987377969076876, 'colsample_bytree': 0.8117581655719849}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  72%|███████▏  | 36/50 [00:43<00:16,  1.17s/it]

[I 2025-06-30 05:09:43,321] Trial 35 finished with value: 0.8213793103448275 and parameters: {'n_estimators': 263, 'learning_rate': 0.016237062072846423, 'max_depth': 7, 'num_leaves': 71, 'min_child_samples': 47, 'subsample': 0.6716335774585689, 'colsample_bytree': 0.8477009821405823}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  74%|███████▍  | 37/50 [00:44<00:14,  1.13s/it]

[I 2025-06-30 05:09:44,353] Trial 36 finished with value: 0.8967816091954024 and parameters: {'n_estimators': 209, 'learning_rate': 0.010209488495447649, 'max_depth': 9, 'num_leaves': 65, 'min_child_samples': 47, 'subsample': 0.735802892225231, 'colsample_bytree': 0.8653466017617061}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  76%|███████▌  | 38/50 [00:45<00:14,  1.17s/it]

[I 2025-06-30 05:09:45,600] Trial 37 finished with value: 0.6298850574712643 and parameters: {'n_estimators': 277, 'learning_rate': 0.09468699595133678, 'max_depth': 5, 'num_leaves': 57, 'min_child_samples': 39, 'subsample': 0.8318048649502692, 'colsample_bytree': 0.9397086554872351}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  78%|███████▊  | 39/50 [00:46<00:12,  1.13s/it]

[I 2025-06-30 05:09:46,662] Trial 38 finished with value: 0.8149425287356322 and parameters: {'n_estimators': 223, 'learning_rate': 0.037374936322662144, 'max_depth': 3, 'num_leaves': 73, 'min_child_samples': 12, 'subsample': 0.8768792251534108, 'colsample_bytree': 0.776078461610752}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  80%|████████  | 40/50 [00:47<00:11,  1.15s/it]

[I 2025-06-30 05:09:47,868] Trial 39 finished with value: 0.8285057471264368 and parameters: {'n_estimators': 185, 'learning_rate': 0.02491091476835732, 'max_depth': 8, 'num_leaves': 54, 'min_child_samples': 48, 'subsample': 0.9221694618987195, 'colsample_bytree': 0.9139007478444008}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  82%|████████▏ | 41/50 [00:48<00:10,  1.12s/it]

[I 2025-06-30 05:09:48,922] Trial 40 finished with value: 0.6641379310344827 and parameters: {'n_estimators': 142, 'learning_rate': 0.08897428618663465, 'max_depth': 10, 'num_leaves': 77, 'min_child_samples': 34, 'subsample': 0.6962596419270926, 'colsample_bytree': 0.7488255532473076}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  84%|████████▍ | 42/50 [00:49<00:08,  1.05s/it]

[I 2025-06-30 05:09:49,799] Trial 41 finished with value: 0.9174712643678161 and parameters: {'n_estimators': 114, 'learning_rate': 0.019164896408901823, 'max_depth': 4, 'num_leaves': 77, 'min_child_samples': 45, 'subsample': 0.8251403034166988, 'colsample_bytree': 0.8466864679702043}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  86%|████████▌ | 43/50 [00:50<00:06,  1.01it/s]

[I 2025-06-30 05:09:50,657] Trial 42 finished with value: 0.9174712643678161 and parameters: {'n_estimators': 113, 'learning_rate': 0.018583469151776725, 'max_depth': 4, 'num_leaves': 78, 'min_child_samples': 45, 'subsample': 0.8156153745216101, 'colsample_bytree': 0.8789255404048535}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  88%|████████▊ | 44/50 [00:51<00:05,  1.07it/s]

[I 2025-06-30 05:09:51,469] Trial 43 finished with value: 0.9448275862068964 and parameters: {'n_estimators': 112, 'learning_rate': 0.018618279300258726, 'max_depth': 3, 'num_leaves': 78, 'min_child_samples': 44, 'subsample': 0.8598547098729525, 'colsample_bytree': 0.8119986338395975}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  90%|█████████ | 45/50 [00:52<00:04,  1.06it/s]

[I 2025-06-30 05:09:52,429] Trial 44 finished with value: 0.8967816091954024 and parameters: {'n_estimators': 107, 'learning_rate': 0.03071062565970814, 'max_depth': 3, 'num_leaves': 36, 'min_child_samples': 44, 'subsample': 0.8619744456042926, 'colsample_bytree': 0.8113438752303485}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  92%|█████████▏| 46/50 [00:53<00:03,  1.06it/s]

[I 2025-06-30 05:09:53,356] Trial 45 finished with value: 0.9448275862068964 and parameters: {'n_estimators': 115, 'learning_rate': 0.017312648754869767, 'max_depth': 3, 'num_leaves': 77, 'min_child_samples': 45, 'subsample': 0.9138153608664417, 'colsample_bytree': 0.795237556732132}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  94%|█████████▍| 47/50 [00:54<00:02,  1.11it/s]

[I 2025-06-30 05:09:54,160] Trial 46 finished with value: 0.8489655172413793 and parameters: {'n_estimators': 128, 'learning_rate': 0.05341266900058069, 'max_depth': 3, 'num_leaves': 68, 'min_child_samples': 39, 'subsample': 0.9257856314291424, 'colsample_bytree': 0.7365534124782015}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  96%|█████████▌| 48/50 [00:54<00:01,  1.12it/s]

[I 2025-06-30 05:09:55,042] Trial 47 finished with value: 0.8970114942528736 and parameters: {'n_estimators': 141, 'learning_rate': 0.037661595052264574, 'max_depth': 3, 'num_leaves': 70, 'min_child_samples': 42, 'subsample': 0.9834707804100716, 'colsample_bytree': 0.708487696816511}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724:  98%|█████████▊| 49/50 [00:55<00:00,  1.10it/s]

[I 2025-06-30 05:09:55,980] Trial 48 finished with value: 0.7324137931034483 and parameters: {'n_estimators': 155, 'learning_rate': 0.0697816054263257, 'max_depth': 4, 'num_leaves': 78, 'min_child_samples': 35, 'subsample': 0.9122810385839624, 'colsample_bytree': 0.7884019519564875}. Best is trial 10 with value: 0.9517241379310345.


Best trial: 10. Best value: 0.951724: 100%|██████████| 50/50 [00:56<00:00,  1.13s/it]


[I 2025-06-30 05:09:56,711] Trial 49 finished with value: 0.7457471264367815 and parameters: {'n_estimators': 113, 'learning_rate': 0.12523752096680826, 'max_depth': 3, 'num_leaves': 23, 'min_child_samples': 45, 'subsample': 0.9055562812309266, 'colsample_bytree': 0.8340747245825457}. Best is trial 10 with value: 0.9517241379310345.
Fold 1: Accuracy=0.8345, Precision=1.0000, Recall=0.1724, F1=0.2941, Threshold=0.50
Fold 2: Accuracy=0.8552, Precision=1.0000, Recall=0.2759, F1=0.4324, Threshold=0.50
Fold 3: Accuracy=0.8621, Precision=0.9091, Recall=0.3448, F1=0.5000, Threshold=0.50
Fold 4: Accuracy=0.8621, Precision=0.8571, Recall=0.4000, F1=0.5455, Threshold=0.50
Fold 5: Accuracy=0.8333, Precision=1.0000, Recall=0.1724, F1=0.2941, Threshold=0.50

--- Final LightGBM Optimized ---
Mean Accuracy : 0.8494
Mean Precision: 0.9532
Mean Recall   : 0.2731
Mean F1 Score : 0.4132


# CatBoost

In [20]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: CatBoost Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', CatBoostClassifier(
        iterations=100,
        learning_rate=0.1,
        depth=6,
        auto_class_weights='Balanced',
        verbose=0,  # suppress CatBoost internal logs
        random_seed=42
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'CatBoost-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified+VerboseOff'

# Console summary
print("\n--- CatBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8690, Precision=0.7083,  Recall=0.5862, F1=0.6415
Fold 2: Accuracy=0.8483, Precision=0.7059,  Recall=0.4138, F1=0.5217
Fold 3: Accuracy=0.8414, Precision=0.5938,  Recall=0.6552, F1=0.6230
Fold 4: Accuracy=0.8483, Precision=0.6250,  Recall=0.6667, F1=0.6452
Fold 5: Accuracy=0.8750, Precision=0.6774,  Recall=0.7241, F1=0.7000

--- CatBoost Summary ---
Mean Accuracy : 0.8564
Mean Precision: 0.6621
Mean Recall   : 0.6092
Mean F1 Score : 0.6263

CSV Row Format:
CatBoost-Balanced,OneHot+Scaler+5Fold-Stratified+VerboseOff,0.8564,0.6621,0.6092,0.6263


In [21]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Tuned CatBoost Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', CatBoostClassifier(
        iterations=300,
        learning_rate=0.05,
        depth=8,
        l2_leaf_reg=5,
        border_count=128,
        bagging_temperature=1.0,
        auto_class_weights='Balanced',
        random_seed=42,
        verbose=0
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'CatBoost-Tuned'
model_desc = 'OneHot+Scaler+5Fold+Depth8+LR0.05+BagTemp1.0'

# Console summary
print("\n--- CatBoost Tuned Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8759, Precision=0.7895,  Recall=0.5172, F1=0.6250
Fold 2: Accuracy=0.8483, Precision=0.7059,  Recall=0.4138, F1=0.5217
Fold 3: Accuracy=0.8345, Precision=0.5806,  Recall=0.6207, F1=0.6000
Fold 4: Accuracy=0.8414, Precision=0.6207,  Recall=0.6000, F1=0.6102
Fold 5: Accuracy=0.8681, Precision=0.7273,  Recall=0.5517, F1=0.6275

--- CatBoost Tuned Summary ---
Mean Accuracy : 0.8536
Mean Precision: 0.6848
Mean Recall   : 0.5407
Mean F1 Score : 0.5969

CSV Row Format:
CatBoost-Tuned,OneHot+Scaler+5Fold+Depth8+LR0.05+BagTemp1.0,0.8536,0.6848,0.5407,0.5969


In [22]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Optional: Check constant columns
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Define column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Aggressively Tuned CatBoost
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', CatBoostClassifier(
        iterations=500,
        learning_rate=0.03,
        depth=10,
        l2_leaf_reg=3,
        border_count=128,
        bagging_temperature=0.25,
        auto_class_weights='Balanced',
        random_seed=42,
        verbose=0
    ))
])

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Description
model_name = 'CatBoost-Aggressive'
model_desc = 'OneHot+Scaler+500Iter+LR0.03+Depth10+Bag0.25'

print("\n--- CatBoost Aggressive Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8828, Precision=0.8750,  Recall=0.4828, F1=0.6222
Fold 2: Accuracy=0.8414, Precision=0.6875,  Recall=0.3793, F1=0.4889
Fold 3: Accuracy=0.8483, Precision=0.6296,  Recall=0.5862, F1=0.6071
Fold 4: Accuracy=0.8552, Precision=0.6667,  Recall=0.6000, F1=0.6316
Fold 5: Accuracy=0.8472, Precision=0.6667,  Recall=0.4828, F1=0.5600

--- CatBoost Aggressive Summary ---
Mean Accuracy : 0.8550
Mean Precision: 0.7051
Mean Recall   : 0.5062
Mean F1 Score : 0.5820

CSV Row Format:
CatBoost-Aggressive,OneHot+Scaler+500Iter+LR0.03+Depth10+Bag0.25,0.8550,0.7051,0.5062,0.5820


In [4]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Constant column check
constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
if constant_cols:
    print("Warning: Constant columns detected:", constant_cols)

# Categorical and numerical columns
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Define CatBoost with default settings (will be tuned)
cat_model = CatBoostClassifier(
    auto_class_weights='Balanced',
    random_seed=42,
    verbose=0
)

# Create pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', cat_model)
])

# Define parameter search space
param_space = {
    'model__iterations': Integer(300, 800),
    'model__learning_rate': Real(0.01, 0.2, prior='log-uniform'),
    'model__depth': Integer(4, 10),
    'model__l2_leaf_reg': Real(1, 10),
    'model__bagging_temperature': Real(0, 1.0),
    'model__border_count': Integer(32, 254)
}

# Setup Bayesian optimization with 5-fold stratified CV
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Fit the search
opt.fit(X, y)

# Extract best pipeline and evaluate manually
best_pipeline = opt.best_estimator_

# Manual 5-Fold Eval
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Averages
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'CatBoost-BayesTuned'
model_desc = 'OneHot+Scaler+BayesSearch+5Fold'

print("\n--- CatBoost Bayesian Tuning Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8897, Precision=0.7826, Recall=0.6207, F1=0.6923
Fold 2: Accuracy=0.8621, Precision=0.7143, Recall=0.5172, F1=0.6000
Fold 3: Accuracy=0.8138, Precision=0.5263, Recall=0.6897, F1=0.5970
Fold 4: Accuracy=0.8621, Precision=0.6471, Recall=0.7333, F1=0.6875
Fold 5: Accuracy=0.8819, Precision=0.7000, Recall=0.7241, F1=0.7119

--- CatBoost Bayesian Tuning Summary ---
Mean Accuracy : 0.8619
Mean Precision: 0.6741
Mean Recall   : 0.6570
Mean F1 Score : 0.6577


# SVM

In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# SVM model inside pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42))
])

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Mean scores
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'SVC-RBF-Pipeline'
model_desc = 'OneHot+Scaler+5Fold+Balanced'

print("\n--- SVM (RBF) Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8069, Precision=0.5143, Recall=0.6207, F1=0.5625
Fold 2: Accuracy=0.8207, Precision=0.5556, Recall=0.5172, F1=0.5357
Fold 3: Accuracy=0.7655, Precision=0.4444, Recall=0.6897, F1=0.5405
Fold 4: Accuracy=0.7793, Precision=0.4783, Recall=0.7333, F1=0.5789
Fold 5: Accuracy=0.7222, Precision=0.3878, Recall=0.6552, F1=0.4872

--- SVM (RBF) Summary ---
Name                          : SVC-RBF-Pipeline
Description                   : OneHot+Scaler+5Fold+Balanced
Accuracy                      : 0.7789
Precision                     : 0.4761
Recall                        : 0.6432
F1 Score                      : 0.5410

CSV Row Format:
SVC-RBF-Pipeline,OneHot+Scaler+5Fold+Balanced,0.7789,0.4761,0.6432,0.5410


In [7]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV
from skopt.space import Real
import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SVC pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(probability=True, random_state=42))
])

# Step 5: Define search space
param_space = {
    'model__C': Real(0.1, 100, prior='log-uniform'),
    'model__gamma': Real(1e-4, 1.0, prior='log-uniform'),
    'model__kernel': ['rbf'],
    'model__class_weight': ['balanced']
}

# Step 6: Bayesian optimization
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Step 7: Fit optimizer
opt.fit(X, y)
best_pipeline = opt.best_estimator_

# Step 8: Manual evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Aggregate metrics
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'SVC-BayesTuned'
model_desc = 'OneHot+Scaler+BayesSearch+RBF+Balanced'

print("\n--- Tuned SVM (RBF) Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8414, Precision=0.6000, Recall=0.6207, F1=0.6102
Fold 2: Accuracy=0.8414, Precision=0.6364, Recall=0.4828, F1=0.5490
Fold 3: Accuracy=0.8552, Precision=0.6429, Recall=0.6207, F1=0.6316
Fold 4: Accuracy=0.8069, Precision=0.5357, Recall=0.5000, F1=0.5172
Fold 5: Accuracy=0.8333, Precision=0.5862, Recall=0.5862, F1=0.5862

--- Tuned SVM (RBF) Summary ---
Mean Accuracy : 0.8356
Mean Precision: 0.6002
Mean Recall   : 0.5621
Mean F1 Score : 0.5788

CSV Row Format:
SVC-BayesTuned,OneHot+Scaler+BayesSearch+RBF+Balanced,0.8356,0.6002,0.5621,0.5788


In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV
from skopt.space import Real
import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SVC pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(probability=True, random_state=42))
])

# Step 5: Define search space
param_space = {
    'model__C': Real(0.1, 100, prior='log-uniform'),
    'model__gamma': Real(1e-4, 1.0, prior='log-uniform'),
    'model__kernel': ['rbf'],
    'model__class_weight': ['balanced']
}

# Step 6: Bayesian optimization
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Step 7: Fit optimizer
opt.fit(X, y)
best_pipeline = opt.best_estimator_

# Step 8: Manual evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Aggregate metrics
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'SVC-BayesTuned'
model_desc = 'OneHot+Scaler+BayesSearch+RBF+Balanced'

print("\n--- Tuned SVM (RBF) Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8414, Precision=0.6000, Recall=0.6207, F1=0.6102
Fold 2: Accuracy=0.8414, Precision=0.6364, Recall=0.4828, F1=0.5490
Fold 3: Accuracy=0.8552, Precision=0.6429, Recall=0.6207, F1=0.6316
Fold 4: Accuracy=0.8069, Precision=0.5357, Recall=0.5000, F1=0.5172
Fold 5: Accuracy=0.8333, Precision=0.5862, Recall=0.5862, F1=0.5862

--- Tuned SVM (RBF) Summary ---
Mean Accuracy : 0.8356
Mean Precision: 0.6002
Mean Recall   : 0.5621
Mean F1 Score : 0.5788

CSV Row Format:
SVC-BayesTuned,OneHot+Scaler+BayesSearch+RBF+Balanced,0.8356,0.6002,0.5621,0.5788


In [9]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV
from skopt.space import Real
import warnings
warnings.filterwarnings('ignore')

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SVC pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVC(probability=True, random_state=42))
])

# Step 5: Define search space for BayesSearchCV
param_space = {
    'model__C': Real(0.1, 100, prior='log-uniform'),
    'model__gamma': Real(1e-4, 1.0, prior='log-uniform'),
    'model__kernel': ['rbf'],
    'model__class_weight': ['balanced']
}

# Step 6: Bayesian optimization focused on RECALL
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    scoring='recall',  # prioritize recall
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=32,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Step 7: Fit optimizer
opt.fit(X, y)
best_pipeline = opt.best_estimator_

# Step 8: Manual evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list, prec_list, rec_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    acc_list.append(acc)
    prec_list.append(prec)
    rec_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Aggregate metrics
mean_acc = np.mean(acc_list)
mean_prec = np.mean(prec_list)
mean_rec = np.mean(rec_list)
mean_f1 = np.mean(f1_list)

model_name = 'SVC-BayesTuned-Recall'
model_desc = 'OneHot+Scaler+BayesSearch+RBF+Balanced+RecallOpt'

print("\n--- Tuned SVM (RBF) Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333
Fold 2: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333
Fold 3: Accuracy=0.2000, Precision=0.2000, Recall=1.0000, F1=0.3333
Fold 4: Accuracy=0.2069, Precision=0.2069, Recall=1.0000, F1=0.3429
Fold 5: Accuracy=0.2014, Precision=0.2014, Recall=1.0000, F1=0.3353

--- Tuned SVM (RBF) Summary ---
Mean Accuracy : 0.2017
Mean Precision: 0.2017
Mean Recall   : 1.0000
Mean F1 Score : 0.3356

CSV Row Format:
SVC-BayesTuned-Recall,OneHot+Scaler+BayesSearch+RBF+Balanced+RecallOpt,0.2017,0.2017,1.0000,0.3356


# Bagging

In [12]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Bagging Classifier pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=50,
        max_samples=0.8,
        max_features=1.0,
        bootstrap=True,
        random_state=42,
        n_jobs=-1
    ))
])

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'Bagging-DecisionTree'
model_desc = 'Bagging-with-Preprocessing-5Fold'

print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8828, Precision=0.8750, Recall=0.4828, F1=0.6222
Fold 2: Accuracy=0.8552, Precision=0.7857, Recall=0.3793, F1=0.5116
Fold 3: Accuracy=0.8759, Precision=0.7391, Recall=0.5862, F1=0.6538
Fold 4: Accuracy=0.8759, Precision=0.8000, Recall=0.5333, F1=0.6400
Fold 5: Accuracy=0.8472, Precision=0.7333, Recall=0.3793, F1=0.5000

--- Model Summary ---
Name                          : Bagging-DecisionTree
Description                   : Bagging-with-Preprocessing-5Fold
Accuracy                      : 0.8674
Precision                     : 0.7866
Recall                        : 0.4722
F1 Score                      : 0.5855

CSV Row Format:
Bagging-DecisionTree,Bagging-with-Preprocessing-5Fold,0.8674,0.7866,0.4722,0.5855


In [13]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')

# Data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        random_state=42,
        n_jobs=-1
    ))
])

# Parameter search space for Bagging + Decision Tree
search_space = {
    'model__n_estimators': Integer(10, 100),
    'model__max_samples': Real(0.5, 1.0),
    'model__max_features': Real(0.5, 1.0),
    'model__estimator__max_depth': Integer(2, 20),
    'model__estimator__min_samples_split': Integer(2, 10),
    'model__estimator__min_samples_leaf': Integer(1, 10)
}

# BayesSearchCV setup (recall as scoring metric)
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    scoring=make_scorer(recall_score),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=30,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# Fit
bayes_search.fit(X, y)

# Best model
best_model = bayes_search.best_estimator_

# 5-Fold Evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Metadata
model_name = 'Bagging+DT-Tuned'
model_desc = 'BayesCV-Tuned-Recall-Max-5Fold'

print("\n--- Final Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8690, Precision=0.7778, Recall=0.4828, F1=0.5957
Fold 2: Accuracy=0.8552, Precision=0.7857, Recall=0.3793, F1=0.5116
Fold 3: Accuracy=0.8621, Precision=0.6800, Recall=0.5862, F1=0.6296
Fold 4: Accuracy=0.8828, Precision=0.7600, Recall=0.6333, F1=0.6909
Fold 5: Accuracy=0.8750, Precision=0.8235, Recall=0.4828, F1=0.6087

--- Final Tuned Model Summary ---
Name                          : Bagging+DT-Tuned
Description                   : BayesCV-Tuned-Recall-Max-5Fold
Accuracy                      : 0.8688
Precision                     : 0.7654
Recall                        : 0.5129
F1 Score                      : 0.6073

CSV Row Format:
Bagging+DT-Tuned,BayesCV-Tuned-Recall-Max-5Fold,0.8688,0.7654,0.5129,0.6073


In [14]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')

# Data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        random_state=42,
        n_jobs=-1
    ))
])

# Parameter search space for Bagging + Decision Tree
search_space = {
    'model__n_estimators': Integer(10, 100),
    'model__max_samples': Real(0.5, 1.0),
    'model__max_features': Real(0.5, 1.0),
    'model__estimator__max_depth': Integer(2, 20),
    'model__estimator__min_samples_split': Integer(2, 10),
    'model__estimator__min_samples_leaf': Integer(1, 10)
}

# BayesSearchCV setup (recall as scoring metric)
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    scoring=make_scorer(recall_score),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=30,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# Fit
bayes_search.fit(X, y)

# Best model
best_model = bayes_search.best_estimator_

# 5-Fold Evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Metadata
model_name = 'Bagging+DT-Tuned'
model_desc = 'BayesCV-Tuned-Recall-Max-5Fold'

print("\n--- Final Tuned Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8690, Precision=0.7778, Recall=0.4828, F1=0.5957
Fold 2: Accuracy=0.8552, Precision=0.7857, Recall=0.3793, F1=0.5116
Fold 3: Accuracy=0.8621, Precision=0.6800, Recall=0.5862, F1=0.6296
Fold 4: Accuracy=0.8828, Precision=0.7600, Recall=0.6333, F1=0.6909
Fold 5: Accuracy=0.8750, Precision=0.8235, Recall=0.4828, F1=0.6087

--- Final Tuned Model Summary ---
Name                          : Bagging+DT-Tuned
Description                   : BayesCV-Tuned-Recall-Max-5Fold
Accuracy                      : 0.8688
Precision                     : 0.7654
Recall                        : 0.5129
F1 Score                      : 0.6073

CSV Row Format:
Bagging+DT-Tuned,BayesCV-Tuned-Recall-Max-5Fold,0.8688,0.7654,0.5129,0.6073


In [15]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')

# --- Data Setup ---
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# --- Preprocessing ---
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# --- Pipeline ---
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(class_weight='balanced'),
        n_estimators=50,
        max_samples=0.8,
        max_features=1.0,
        bootstrap=True,
        random_state=42,
        n_jobs=-1
    ))
])

# --- Search Space ---
search_space = {
    'model__n_estimators': Integer(20, 100),
    'model__max_samples': Real(0.4, 1.0),
    'model__max_features': Real(0.4, 1.0),
    'model__estimator__max_depth': Integer(3, 20),
    'model__estimator__min_samples_split': Integer(2, 15),
    'model__estimator__min_samples_leaf': Integer(1, 10)
}

# --- Tuning ---
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    scoring=make_scorer(recall_score),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=30,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# --- Fit ---
bayes_search.fit(X, y)
best_model = bayes_search.best_estimator_

# --- Cross-Validation Evaluation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# --- Final Metrics ---
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# --- Output ---
model_name = 'Bagging+DT-Balanced-Tuned'
model_desc = 'BaggingDT+Balanced+BayesCV-Recall'

print("\n--- Final Tuned Bagging Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# --- Save to CSV ---
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8345, Precision=0.5610, Recall=0.7931, F1=0.6571
Fold 2: Accuracy=0.8000, Precision=0.5000, Recall=0.7241, F1=0.5915
Fold 3: Accuracy=0.7379, Precision=0.4211, Recall=0.8276, F1=0.5581
Fold 4: Accuracy=0.7586, Precision=0.4576, Recall=0.9000, F1=0.6067
Fold 5: Accuracy=0.7431, Precision=0.4231, Recall=0.7586, F1=0.5432

--- Final Tuned Bagging Model Summary ---
Name                          : Bagging+DT-Balanced-Tuned
Description                   : BaggingDT+Balanced+BayesCV-Recall
Accuracy                      : 0.7748
Precision                     : 0.4725
Recall                        : 0.8007
F1 Score                      : 0.5914

CSV Row Format:
Bagging+DT-Balanced-Tuned,BaggingDT+Balanced+BayesCV-Recall,0.7748,0.4725,0.8007,0.5914


# AdaBoostClassifier

In [17]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ]
)

# AdaBoost model pipeline
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', AdaBoostClassifier(random_state=42))
])

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'AdaBoostClassifier'
model_desc = 'AdaBoost-5Fold-Preprocessed'

# Print formatted summary
print("\n--- Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV (append row, create file if not exists)
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8690, Precision=0.8125, Recall=0.4483, F1=0.5778
Fold 2: Accuracy=0.8690, Precision=0.9167, Recall=0.3793, F1=0.5366
Fold 3: Accuracy=0.8690, Precision=0.7273, Recall=0.5517, F1=0.6275
Fold 4: Accuracy=0.8552, Precision=0.6957, Recall=0.5333, F1=0.6038
Fold 5: Accuracy=0.8819, Precision=0.7500, Recall=0.6207, F1=0.6792

--- Model Summary ---
Name                          : AdaBoostClassifier
Description                   : AdaBoost-5Fold-Preprocessed
Accuracy                      : 0.8688
Precision                     : 0.7804
Recall                        : 0.5067
F1 Score                      : 0.6050

CSV Row Format:
AdaBoostClassifier,AdaBoost-5Fold-Preprocessed,0.8688,0.7804,0.5067,0.6050


In [20]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from skopt import BayesSearchCV
from skopt.space import Integer, Real

import warnings
warnings.filterwarnings('ignore')

# Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# AdaBoost pipeline
pipe = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', AdaBoostClassifier(algorithm='SAMME', random_state=42))
])

# Hyperparameter search space (no 'SAMME.R')
search_space = {
    'classifier__n_estimators': Integer(50, 300),
    'classifier__learning_rate': Real(0.01, 1.0, prior='log-uniform')
}

# CV and tuner
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
opt = BayesSearchCV(
    pipe,
    search_spaces=search_space,
    scoring='recall',
    n_iter=25,
    cv=cv,
    n_jobs=-1,
    verbose=0,
    random_state=42
)

# Fit the tuner
opt.fit(X, y)

# Final best model
best_model = opt.best_estimator_

# CV metric evaluation using best model
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'AdaBoostClassifier-Tuned'
model_desc = f"AdaBoost-Tuned-SAMME-{opt.best_params_}"

# Print summary
print("\n--- Tuned AdaBoost Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# CSV write
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8828, Precision=0.8000, Recall=0.5517, F1=0.6531
Fold 2: Accuracy=0.8621, Precision=0.8000, Recall=0.4138, F1=0.5455
Fold 3: Accuracy=0.8621, Precision=0.7143, Recall=0.5172, F1=0.6000
Fold 4: Accuracy=0.8483, Precision=0.6538, Recall=0.5667, F1=0.6071
Fold 5: Accuracy=0.8542, Precision=0.6538, Recall=0.5862, F1=0.6182

--- Tuned AdaBoost Summary ---
Name                          : AdaBoostClassifier-Tuned
Description                   : AdaBoost-Tuned-SAMME-OrderedDict({'classifier__learning_rate': 1.0, 'classifier__n_estimators': 180})
Accuracy                      : 0.8619
Precision                     : 0.7244
Recall                        : 0.5271
F1 Score                      : 0.6048
