In [1]:
import pandas as pd
import sys
import os

# Ensure the path to the DEModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/de_model"))
from de_handler import DEModelHandler  

# Ensure the path to the FSDModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/fsd_model"))
from fsd_handler import FSDModelHandler  

# Ensure the path to the Math3ModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/math3_model"))
from math3_handler import Math3ModelHandler  

# Ensure the path to the PythonModelHandler is correct
sys.path.append(os.path.abspath("../SubjectModels/python_model"))
from python_handler import PythonModelHandler  

df = pd.read_csv("../dataset/train_dataset.csv")

# Drop the irrelevant, data leak columns
df_clean = df.drop(
    columns=[
        "Student ID",
        "Mentor-1",
        "Mentor-2",
        "Mentor-3",
        "Roll-2",
        "Roll-3",
        "Math-3 Theory",
        "DE Practical",
        "FSD Theory",
        "FSD Practical",
        "Python Theory",
        "Python Practical",
        "Communication Theory",
        "Law Theory",
    ]
)

# columns for Semester 1 core subjects
sem1_columns = [
    "Math-1 Theory",
    "Physics Theory",
    "Java-1 Theory",
    "Software Engineering Theory",
]

# Calculate Semester 1 Percentage as the average of core subject scores
# scores are numerical and out of 100
df_clean["Sem 1 Percentage"] = df_clean[sem1_columns].mean(axis=1).round(2)

# columns for Semester 2 core subjects
sem2_columns = [
    "Math-2 Theory",
    "Data Structures using Java Theory",
    "DBMS Theory",
    "Fundamental of Electronics and Electrical Theory",
    "Java-2 Theory",
]

# Calculate Semester 2 Percentage as the average of core subject scores
# scores are numerical and out of 100
df_clean["Sem 2 Percentage"] = df_clean[sem2_columns].mean(axis=1).round(2)

# Rename columns Div-1, Div-2, Div-3 to Section-1, Section-2, Section-3
df_clean = df_clean.rename(
    columns={"Div-1": "Section-1", "Div-2": "Section-2", "Div-3": "Section-3"}
)

# Transform values in Section-1, Section-2, Section-3 to keep only the first character
# Thus we get Only Department
for section in ["Section-1", "Section-2", "Section-3"]:
    df_clean[section] = df_clean[section].str[0]

# adding DE predicted column
preprocessor = DEModelHandler()
fe_de = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/de_model/de_model.joblib",
    return_type="df"
)

# Add the predicted DE Theory marks to df_clean
df_clean["Predicted DE Theory"] = fe_de["Predicted DE Theory"]


# adding FSD predicted column
preprocessor = FSDModelHandler()
fe_fsd = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/fsd_model/fsd_model.joblib",
    return_type="df"
)

# Add the predicted FSD Theory marks to df_clean
df_clean["Predicted FSD Theory"] = fe_fsd["Predicted FSD Theory"]


# adding Math3 predicted column
preprocessor = Math3ModelHandler()
fe_math3 = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/math3_model/math3_model.joblib",
    return_type="df"
)

# Add the predicted Math3 Theory marks to df_clean
df_clean["Predicted Math-3 Theory"] = fe_math3["Predicted Math-3 Theory"]


# adding Python predicted column
preprocessor = PythonModelHandler()
fe_python = preprocessor.predict_from_model(
    df,
    model_path="../SubjectModels/python_model/python_model.joblib",
    return_type="df"
)

# Add the predicted Python Theory marks to df_clean
df_clean["Predicted Python Theory"] = fe_python["Predicted Python Theory"]

#  Calculate predicted Semester 3 percentage (mean of 4 predicted subject marks)
sem3_subjects = [
    "Predicted Math-3 Theory",
    "Predicted DE Theory",
    "Predicted FSD Theory",
    "Predicted Python Theory",
]

df_clean["Predicted Sem 3 Percentage"] = df_clean[sem3_subjects].mean(axis=1).round(2)

df_clean["Sem 1 Percentile"] = df_clean["Sem 1 Percentage"].rank(pct=True) * 100
df_clean["Sem 2 Percentile"] = df_clean["Sem 2 Percentage"].rank(pct=True) * 100
df_clean["Predicted Sem 3 Percentile"] = df_clean["Predicted Sem 3 Percentage"].rank(pct=True) * 100

# Round for consistency
df_clean[["Sem 1 Percentile", "Sem 2 Percentile", "Predicted Sem 3 Percentile"]] = df_clean[
    ["Sem 1 Percentile", "Sem 2 Percentile", "Predicted Sem 3 Percentile"]
].round(2)

df_clean["Predicted Percentile Drop"] = (
    df_clean["Sem 2 Percentile"] - df_clean["Predicted Sem 3 Percentile"]
).round(2)

df_clean["Predicted Risk Flag"] = df_clean["Predicted Percentile Drop"] > 10

# Columns for Semester 3 core theory subjects
sem3_columns = [
    "Math-3 Theory",
    "DE Theory",
    "FSD Theory",
    "Python Theory",
]

# Calculate Semester 3 Total as the sum of core subject scores
df["Sem 3 Percentage"] = (df[sem3_columns].sum(axis=1) / 4).round(2)

df_clean["Sem 3 Percentile"] = df["Sem 3 Percentage"].rank(pct=True) * 100

df_clean["Percentile Drop"] = (
    df_clean["Sem 2 Percentile"] - df_clean["Sem 3 Percentile"]
).round(2)

df_clean["Risk Flag"] = df_clean["Percentile Drop"] > 10

columns_to_drop = [
    "Sem 3 Percentile",
    "Percentile Drop"
]

df_clean.drop(columns=columns_to_drop, inplace=True)

print(df_clean.head())

  Gender Religion Branch Section-1 Section-2 Section-3  Roll-1  Math-1 Theory  \
0      M    Hindu     CE         D         D         A     350             47   
1      F    Hindu    CST         B         B         D      18             84   
2      F    Hindu   AIML         A         A         C      23             74   
3      M    Hindu    CST         B         B         D     212             55   
4      M    Hindu    CST         B         B         D     208             38   

   Physics Theory  Physics Practical  ...  Predicted FSD Theory  \
0              48                 75  ...             72.266535   
1              83                 81  ...             87.523458   
2              85                 86  ...             89.409752   
3              69                 82  ...             79.807055   
4              59                 74  ...             56.474296   

   Predicted Math-3 Theory  Predicted Python Theory  \
0                56.352210                71.642156   


# Dummy

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Target and features
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# DummyClassifier – always predicts the most frequent class
dummy = DummyClassifier(strategy='most_frequent')

# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 5-Fold Evaluation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dummy.fit(X_train, y_train)
    y_pred = dummy.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Final metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'DummyClassifier-MostFreq'
model_desc = 'Baseline-MostFrequent-5Fold'

# Print formatted summary
print("\n--- Baseline Model Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Save to CSV (append row, create file if not exists)
csv_file = "risk_model_metrics.csv"
new_row = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to file (header only if file doesn't exist)
new_row.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 2: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 3: Accuracy=0.8000, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 4: Accuracy=0.7931, Precision=0.0000, Recall=0.0000, F1=0.0000
Fold 5: Accuracy=0.7986, Precision=0.0000, Recall=0.0000, F1=0.0000

--- Baseline Model Summary ---
Name                          : DummyClassifier-MostFreq
Description                   : Baseline-MostFrequent-5Fold
Accuracy                      : 0.7983
Precision                     : 0.0000
Recall                        : 0.0000
F1 Score                      : 0.0000

CSV Row Format:
DummyClassifier-MostFreq,Baseline-MostFrequent-5Fold,0.7983,0.0000,0.0000,0.0000


# Logistic Regression

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)  # convert bool to 0/1

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Full pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', LogisticRegression(class_weight='balanced', max_iter=1000))
])

# Step 5: 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Step 6: Loop through CV
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'LogisticRegression-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Print: CSV-style with labels and formatting
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])


Fold 1: Accuracy=0.8207, Precision=0.5385, Recall=0.7241, F1=0.6176
Fold 2: Accuracy=0.8552, Precision=0.6176, Recall=0.7241, F1=0.6667
Fold 3: Accuracy=0.7931, Precision=0.4889, Recall=0.7586, F1=0.5946
Fold 4: Accuracy=0.8069, Precision=0.5217, Recall=0.8000, F1=0.6316
Fold 5: Accuracy=0.8125, Precision=0.5250, Recall=0.7241, F1=0.6087

--- Average Metrics Summary ---
Name                          : LogisticRegression-Balanced
Description                   : OneHot+Scaler+5Fold-Stratified
Accuracy                      : 0.8177
Precision                     : 0.5383
Recall                        : 0.7462
F1 Score                      : 0.6238

CSV Row Format:
LogisticRegression-Balanced,OneHot+Scaler+5Fold-Stratified,0.8177,0.5383,0.7462,0.6238


# DecisionTreeClassifier

In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'DecisionTreeClassifier-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- DecisionTreeClassifier Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8000, Precision=0.5000,  Recall=0.4138, F1=0.4528
Fold 2: Accuracy=0.8414, Precision=0.7143,  Recall=0.3448, F1=0.4651
Fold 3: Accuracy=0.7517, Precision=0.4103,  Recall=0.5517, F1=0.4706
Fold 4: Accuracy=0.7862, Precision=0.4848,  Recall=0.5333, F1=0.5079
Fold 5: Accuracy=0.8403, Precision=0.6250,  Recall=0.5172, F1=0.5660

--- DecisionTreeClassifier Summary ---
Mean Accuracy : 0.8039
Mean Precision: 0.5469
Mean Recall   : 0.4722
Mean F1 Score : 0.4925

CSV Row Format:
DecisionTreeClassifier-Balanced,OneHot+Scaler+5Fold-Stratified,0.8039,0.5469,0.4722,0.4925


In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Recall-tuned DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    ))
])

# Step 5: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 6: Custom threshold
threshold = 0.35

# Step 7: Cross-validation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)

    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 8: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 9: Model info
model_name = 'DecisionTree-RecallTuned'
model_desc = f'Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold'

# Console output
print("\n--- DecisionTree_Recall_Tuned Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.6828, Precision=0.3455,  Recall=0.6552, F1=0.4524
Fold 2: Accuracy=0.8138, Precision=0.5357,  Recall=0.5172, F1=0.5263
Fold 3: Accuracy=0.7034, Precision=0.3750,  Recall=0.7241, F1=0.4941
Fold 4: Accuracy=0.7517, Precision=0.4348,  Recall=0.6667, F1=0.5263
Fold 5: Accuracy=0.7222, Precision=0.3878,  Recall=0.6552, F1=0.4872

--- DecisionTree_Recall_Tuned Summary ---
Mean Accuracy : 0.7348
Mean Precision: 0.4157
Mean Recall   : 0.6437
Mean F1 Score : 0.4973

CSV Row Format:
DecisionTree-RecallTuned,Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold,0.7348,0.4157,0.6437,0.4973


In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Flexible (deep) DecisionTreeClassifier
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=None,             # no limit
        min_samples_split=2,        # fine splits
        min_samples_leaf=1,         # small leaves allowed
        random_state=42
    ))
])

# Step 4: Cross-validation config
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 5: Threshold
threshold = 0.25  # aggressive threshold to maximize recall

# Step 6: CV loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Aggregate results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 8: Metadata
model_name = 'DecisionTree-MaxRecall'
model_desc = 'Depth=None|Split=2|Leaf=1|Thresh=0.25|5Fold'

print("\n--- DecisionTree_MaxRecall Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 9: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8000, Precision=0.5000,  Recall=0.4138, F1=0.4528
Fold 2: Accuracy=0.8414, Precision=0.7143,  Recall=0.3448, F1=0.4651
Fold 3: Accuracy=0.7517, Precision=0.4103,  Recall=0.5517, F1=0.4706
Fold 4: Accuracy=0.7862, Precision=0.4848,  Recall=0.5333, F1=0.5079
Fold 5: Accuracy=0.8403, Precision=0.6250,  Recall=0.5172, F1=0.5660

--- DecisionTree_MaxRecall Summary ---
Mean Accuracy : 0.8039
Mean Precision: 0.5469
Mean Recall   : 0.4722
Mean F1 Score : 0.4925

CSV Row Format:
DecisionTree-MaxRecall,Depth=None|Split=2|Leaf=1|Thresh=0.25|5Fold,0.8039,0.5469,0.4722,0.4925


In [7]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SMOTE setup
smote = SMOTE(random_state=42)

# Step 5: Recall-tuned DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    ))
])

# Step 6: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 7: Custom threshold
threshold = 0.35

# Step 8: Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Apply SMOTE to training data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed, y_train)

    # Fit the model on SMOTE-resampled data
    pipeline.named_steps['model'].fit(X_train_smote, y_train_smote)

    # Transform validation data
    X_val_preprocessed = preprocessor.transform(X_val)
    y_proba = pipeline.named_steps['model'].predict_proba(X_val_preprocessed)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 10: Model info
model_name = 'DecisionTree-SMOTE'
model_desc = f'Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold|SMOTE'

# Console output
print("\n--- DecisionTree_SMOTE Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 11: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7586, Precision=0.4400, Recall=0.7586, F1=0.5570
Fold 2: Accuracy=0.7517, Precision=0.4255, Recall=0.6897, F1=0.5263
Fold 3: Accuracy=0.7241, Precision=0.3922, Recall=0.6897, F1=0.5000
Fold 4: Accuracy=0.8069, Precision=0.5200, Recall=0.8667, F1=0.6500
Fold 5: Accuracy=0.7500, Precision=0.4340, Recall=0.7931, F1=0.5610

--- DecisionTree_SMOTE Summary ---
Mean Accuracy : 0.7583
Mean Precision: 0.4423
Mean Recall   : 0.7595
Mean F1 Score : 0.5589

CSV Row Format:
DecisionTree-SMOTE,Thresh=0.35|Depth=6|Split=10|Leaf=5|5Fold|SMOTE,0.7583,0.4423,0.7595,0.5589


In [10]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Step 1: Data setup
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column types
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: SMOTE setup with adjusted sampling strategy
smote = SMOTE(sampling_strategy=0.8, random_state=42)

# Step 5: Recall-optimized DecisionTree
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeClassifier(
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=3,
        random_state=42
    ))
])

# Step 6: CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Step 7: Lowered threshold for higher recall
threshold = 0.25

# Step 8: Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Apply SMOTE to training data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed, y_train)

    # Fit the model on SMOTE-resampled data
    pipeline.named_steps['model'].fit(X_train_smote, y_train_smote)

    # Transform validation data
    X_val_preprocessed = preprocessor.transform(X_val)
    y_proba = pipeline.named_steps['model'].predict_proba(X_val_preprocessed)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 9: Mean metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 10: Model info
model_name = 'DecisionTree-SMOTE-RecallOptimized'
model_desc = f'Thresh=0.25|Depth=6|Split=10|Leaf=3|5Fold|SMOTE_0.8'

# Console output
print("\n--- DecisionTree_SMOTE_RecallOptimized Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 11: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

Fold 1: Accuracy=0.7931, Precision=0.4878, Recall=0.6897, F1=0.5714
Fold 2: Accuracy=0.7586, Precision=0.4167, Recall=0.5172, F1=0.4615
Fold 3: Accuracy=0.7103, Precision=0.3818, Recall=0.7241, F1=0.5000
Fold 4: Accuracy=0.7241, Precision=0.4107, Recall=0.7667, F1=0.5349
Fold 5: Accuracy=0.8125, Precision=0.5238, Recall=0.7586, F1=0.6197

--- DecisionTree_SMOTE_RecallOptimized Summary ---
Mean Accuracy : 0.7597
Mean Precision: 0.4442
Mean Recall   : 0.6913
Mean F1 Score : 0.5375

CSV Row Format:
DecisionTree-SMOTE-RecallOptimized,Thresh=0.25|Depth=6|Split=10|Leaf=3|5Fold|SMOTE_0.8,0.7597,0.4442,0.6913,0.5375


# RandomForestClassifier 

In [11]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline with Random Forest
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'RandomForestClassifier-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- RandomForestClassifier Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8138, Precision=0.7500,  Recall=0.1034, F1=0.1818
Fold 2: Accuracy=0.8276, Precision=0.8333,  Recall=0.1724, F1=0.2857
Fold 3: Accuracy=0.8345, Precision=0.8571,  Recall=0.2069, F1=0.3333
Fold 4: Accuracy=0.8345, Precision=0.8000,  Recall=0.2667, F1=0.4000
Fold 5: Accuracy=0.8403, Precision=0.8750,  Recall=0.2414, F1=0.3784

--- RandomForestClassifier Summary ---
Mean Accuracy : 0.8301
Mean Precision: 0.8231
Mean Recall   : 0.1982
Mean F1 Score : 0.3158

CSV Row Format:
RandomForestClassifier-Balanced,OneHot+Scaler+5Fold-Stratified,0.8301,0.8231,0.1982,0.3158


In [12]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)  # convert bool to 0/1

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Full pipeline with SMOTE + RandomForest
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight='balanced',
        random_state=42
    ))
])

# Step 5: 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics storage
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

threshold = 0.3  # Custom threshold to maximize recall

# Step 6: Loop through CV
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]  # Get probability for class 1

    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 7: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'RandomForest-SMOTE-Threshold0.3'
model_desc = 'OneHot+Scaler+SMOTE+RF+Threshold=0.3'

# Print: CSV-style with labels and formatting
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])


Fold 1: Accuracy=0.7586, Precision=0.4348, Recall=0.6897, F1=0.5333
Fold 2: Accuracy=0.7724, Precision=0.4545, Recall=0.6897, F1=0.5479
Fold 3: Accuracy=0.6690, Precision=0.3582, Recall=0.8276, F1=0.5000
Fold 4: Accuracy=0.6552, Precision=0.3649, Recall=0.9000, F1=0.5192
Fold 5: Accuracy=0.6875, Precision=0.3621, Recall=0.7241, F1=0.4828

--- Average Metrics Summary ---
Name                          : RandomForest-SMOTE-Threshold0.3
Description                   : OneHot+Scaler+SMOTE+RF+Threshold=0.3
Accuracy                      : 0.7085
Precision                     : 0.3949
Recall                        : 0.7662
F1 Score                      : 0.5167

CSV Row Format:
RandomForest-SMOTE-Threshold0.3,OneHot+Scaler+SMOTE+RF+Threshold=0.3,0.7085,0.3949,0.7662,0.5167


# XGB

In [14]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categories
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: XGBoost Pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        scale_pos_weight=np.sum(y == 0) / np.sum(y == 1),  # Handles class imbalance
        use_label_encoder=False,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    ))
])

# Step 5: Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f},  Recall={rec:.4f}, F1={f1:.4f}")

# Step 6: Averages
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Model description
model_name = 'XGBoost-Balanced'
model_desc = 'OneHot+Scaler+5Fold-Stratified'

# Console summary
print("\n--- XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])

# Append to CSV, add header only if file doesn't exist
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fold 1: Accuracy=0.8414, Precision=0.6500,  Recall=0.4483, F1=0.5306
Fold 2: Accuracy=0.8759, Precision=0.7619,  Recall=0.5517, F1=0.6400
Fold 3: Accuracy=0.8138, Precision=0.5278,  Recall=0.6552, F1=0.5846
Fold 4: Accuracy=0.8414, Precision=0.6061,  Recall=0.6667, F1=0.6349
Fold 5: Accuracy=0.8958, Precision=0.7917,  Recall=0.6552, F1=0.7170

--- XGBoost Summary ---
Mean Accuracy : 0.8536
Mean Precision: 0.6675
Mean Recall   : 0.5954
Mean F1 Score : 0.6214

CSV Row Format:
XGBoost-Balanced,OneHot+Scaler+5Fold-Stratified,0.8536,0.6675,0.5954,0.6214


In [15]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import warnings
import csv

# Suppress warnings
warnings.filterwarnings("ignore")

# Step 1: Copy and split data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

# Step 2: Column categorization
categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 3: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 4: Pipeline with XGBoost + SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=4,  # 80:20 class balance
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    ))
])

# Step 5: Stratified 5-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 6: Metrics storage
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Threshold for classification
threshold = 0.25

# Step 7: CV loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")

# Step 8: Average metrics
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Model info
model_name = 'XGBoost-SMOTE-Threshold0.25'
model_desc = 'OneHot+Scaler+SMOTE+XGB+Threshold=0.25'

# Step 9: Print metrics
print("\n--- Average Metrics Summary ---")
print(f"{'Name':<30}: {model_name}")
print(f"{'Description':<30}: {model_desc}")
print(f"{'Accuracy':<30}: {mean_acc:.4f}")
print(f"{'Precision':<30}: {mean_prec:.4f}")
print(f"{'Recall':<30}: {mean_rec:.4f}")
print(f"{'F1 Score':<30}: {mean_f1:.4f}")

# Also print as CSV row
print("\nCSV Row Format:")
print(f"{model_name},{model_desc},{mean_acc:.4f},{mean_prec:.4f},{mean_rec:.4f},{mean_f1:.4f}")

# Step 10: Write to file if not already present
output_path = 'risk_model_metrics.csv'
write_header = not os.path.exists(output_path)

with open(output_path, mode='a', newline='') as file:
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['Name', 'Desc', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    writer.writerow([model_name, model_desc, f"{mean_acc:.4f}", f"{mean_prec:.4f}", f"{mean_rec:.4f}", f"{mean_f1:.4f}"])


Fold 1: Accuracy=0.8690, Precision=0.6250, Recall=0.8621, F1=0.7246
Fold 2: Accuracy=0.8621, Precision=0.6452, Recall=0.6897, F1=0.6667
Fold 3: Accuracy=0.7586, Precision=0.4400, Recall=0.7586, F1=0.5570
Fold 4: Accuracy=0.8345, Precision=0.5750, Recall=0.7667, F1=0.6571
Fold 5: Accuracy=0.8125, Precision=0.5238, Recall=0.7586, F1=0.6197

--- Average Metrics Summary ---
Name                          : XGBoost-SMOTE-Threshold0.25
Description                   : OneHot+Scaler+SMOTE+XGB+Threshold=0.25
Accuracy                      : 0.8273
Precision                     : 0.5618
Recall                        : 0.7671
F1 Score                      : 0.6450

CSV Row Format:
XGBoost-SMOTE-Threshold0.25,OneHot+Scaler+SMOTE+XGB+Threshold=0.25,0.8273,0.5618,0.7671,0.6450


In [16]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Pipeline with SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        random_state=42
    ))
])

# Step 4: Hyperparameter Optimization
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [4, 6, 8],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X, y)

# Step 5: Dynamic Threshold Tuning
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
thresholds = np.linspace(0.1, 0.9, 9)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    # Dynamic threshold tuning for recall
    recalls = [recall_score(y_val, (y_proba >= t).astype(int)) for t in thresholds]
    best_threshold = thresholds[np.argmax(recalls)]

    y_pred = (y_proba >= best_threshold).astype(int)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Best Threshold={best_threshold:.2f}")

# Step 6: Aggregate Results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Summary
model_name = 'XGBoost-SMOTE-FineTuned'
model_desc = f'OptimizedParams|DynamicThresh={best_threshold:.2f}|OneHot+Scaler'

print("\n--- Fine-Tuned XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.01, 'model__max_depth': 6, 'model__min_child_weight': 3, 'model__n_estimators': 300, 'model__subsample': 0.8}
Fold 1: Accuracy=0.6207, Precision=0.3415, Recall=0.9655, F1=0.5045, Best Threshold=0.10
Fold 2: Accuracy=0.6138, Precision=0.3333, Recall=0.9310, F1=0.4909, Best Threshold=0.10
Fold 3: Accuracy=0.5586, Precision=0.3034, Recall=0.9310, F1=0.4576, Best Threshold=0.10
Fold 4: Accuracy=0.5448, Precision=0.3125, Recall=1.0000, F1=0.4762, Best Threshold=0.10
Fold 5: Accuracy=0.5000, Precision=0.2828, Recall=0.9655, F1=0.4375, Best Threshold=0.10

--- Fine-Tuned XGBoost Summary ---
Mean Accuracy : 0.5676
Mean Precision: 0.3147
Mean Recall   : 0.9586
Mean F1 Score : 0.4733


In [18]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Step 1: Prepare data
current_df = df_clean.copy()
X = current_df.drop(columns=['Risk Flag', 'Predicted Risk Flag'])
y = current_df['Risk Flag'].astype(int)

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Step 2: Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# Step 3: Pipeline with SMOTE
pipeline = ImbPipeline([
    ('prep', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        random_state=42
    ))
])

# Step 4: Hyperparameter Optimization
param_grid = {
    'model__n_estimators': [200, 300, 400],
    'model__max_depth': [4, 6, 8],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
    'model__scale_pos_weight': [4, 5, 6]  # Adjusted for imbalance
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='balanced_accuracy',  # Balances true positive/negative rates
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X, y)

# Step 5: Dynamic Threshold Tuning
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
thresholds = np.linspace(0.1, 0.9, 9)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_train, y_train)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    # Dynamic threshold tuning for F1
    f1_scores = [f1_score(y_val, (y_proba >= t).astype(int)) for t in thresholds]
    best_threshold = thresholds[np.argmax(f1_scores)]

    y_pred = (y_proba >= best_threshold).astype(int)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

    print(f"Fold {fold}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}, Best Threshold={best_threshold:.2f}")

# Step 6: Aggregate Results
mean_acc = np.mean(accuracy_list)
mean_prec = np.mean(precision_list)
mean_rec = np.mean(recall_list)
mean_f1 = np.mean(f1_list)

# Step 7: Summary
model_name = 'XGBoost-SMOTE-FineTuned-Balanced'
model_desc = f'OptimizedParams|DynamicThresh={best_threshold:.2f}|OneHot+Scaler'

print("\n--- Fine-Tuned XGBoost Summary ---")
print(f"Mean Accuracy : {mean_acc:.4f}")
print(f"Mean Precision: {mean_prec:.4f}")
print(f"Mean Recall   : {mean_rec:.4f}")
print(f"Mean F1 Score : {mean_f1:.4f}")

# Step 8: Save to CSV
csv_file = "risk_model_metrics.csv"
result_df = pd.DataFrame([{
    'Name': model_name,
    'Desc': model_desc,
    'Accuracy': round(mean_acc, 4),
    'Precision': round(mean_prec, 4),
    'Recall': round(mean_rec, 4),
    'F1 Score': round(mean_f1, 4)
}])
result_df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))


Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.05, 'model__max_depth': 8, 'model__min_child_weight': 5, 'model__n_estimators': 300, 'model__scale_pos_weight': 4, 'model__subsample': 0.8}
Fold 1: Accuracy=0.9241, Precision=0.8462, Recall=0.7586, F1=0.8000, Best Threshold=0.60
Fold 2: Accuracy=0.8759, Precision=0.6897, Recall=0.6897, F1=0.6897, Best Threshold=0.40
Fold 3: Accuracy=0.8207, Precision=0.5405, Recall=0.6897, F1=0.6061, Best Threshold=0.60
Fold 4: Accuracy=0.8552, Precision=0.6452, Recall=0.6667, F1=0.6557, Best Threshold=0.70
Fold 5: Accuracy=0.8681, Precision=0.6786, Recall=0.6552, F1=0.6667, Best Threshold=0.70

--- Fine-Tuned XGBoost Summary ---
Mean Accuracy : 0.8688
Mean Precision: 0.6800
Mean Recall   : 0.6920
Mean F1 Score : 0.6836
