In [90]:
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

In [91]:

# Load the feature-selected dataset and separate the 'AR' target column
# from the remaining predictor columns.
data = pd.read_csv("../data/processed/data_features_selected.csv")
y = data['AR']
X = data.drop(columns='AR')

## Load Models

In [92]:
# Directory holding the pickled models, relative to this notebook.
models_path = '../models/'

# (registry key, pickle filename) for each model to load.
model_paths = [
    ('RandomForest', 'random_forest_model.pkl'),
    ('XGBoost', 'xgboost_model.pkl'),
    ('Logistic Regression', 'logistic_regression_model.pkl'),
    ('Ensemble', 'ensemble_model.pkl')
]

# Models that were found on disk and unpickled, keyed by registry key.
loaded_models = {}

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at model files produced by a trusted pipeline.
for model_name, filename in model_paths:
    pickle_path = os.path.join(models_path, filename)

    # A missing file is reported but does not stop the remaining loads.
    if not os.path.exists(pickle_path):
        print(f"Model file for {model_name} not found at {pickle_path}.")
        continue

    with open(pickle_path, 'rb') as handle:
        loaded_models[model_name] = pickle.load(handle)
    print(f"{model_name} model loaded successfully.")

# Convenience names, in the same order as model_paths. A model whose file was
# missing resolves to None here (dict.get default).
rf_model, xgb_model, log_model, ensemble_model = (
    loaded_models.get(key) for key, _ in model_paths
)

RandomForest model loaded successfully.
XGBoost model loaded successfully.
Logistic Regression model loaded successfully.
Ensemble model loaded successfully.


In [93]:
# (display name, model object) pairs, in the order they are evaluated below.
models = list(zip(
    ('Random Forest', 'XGBoost', 'Logistic Regression', 'Ensemble'),
    (rf_model, xgb_model, log_model, ensemble_model),
))

## Threshold Optimization and Model Performance Evaluation on Full Dataset

In [94]:

def model_threshold_calculation(model, model_name, thresholds, X, y,
                                false_negative_cost=17, false_positive_cost=14):
    """Return the minimum-cost decision threshold for one model.

    For every candidate threshold, samples whose class-1 probability is
    ``>= threshold`` are predicted as 1 and all others as 0. The total cost is
    ``fn * false_negative_cost + fp * false_positive_cost``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict_proba(X)`` that returns a 2-D array whose
        column 1 holds the class-1 probability.
    model_name : str
        Label written to the 'Model Name' field of the result.
    thresholds : iterable of float
        Candidate probability cut-offs; must not be empty.
    X : array-like
        Features passed unchanged to ``model.predict_proba``.
    y : array-like
        True labels; entries equal to 1 are treated as the positive class and
        everything else as negative (assumes 0/1 labels).
    false_negative_cost : number, default 17
        Cost charged per false negative.
    false_positive_cost : number, default 14
        Cost charged per false positive.

    Returns
    -------
    pandas.Series
        The row with the lowest 'Cost'. Fields: 'Model Name', 'Threshold',
        'Cost', 'False Negative Count', 'False Negative Cost',
        'False Positive Count', 'False Positive Cost', 'Auto Reject'
        (samples with probability >= threshold) and 'Auto Approve' (samples
        with probability < threshold). The Series name is the position of the
        winning threshold within ``thresholds``. When several thresholds share
        the minimum cost, the first one in ``thresholds`` wins.

    Raises
    ------
    ValueError
        If ``thresholds`` is empty.
    """
    thresholds = np.asarray(thresholds, dtype=float).ravel()
    if thresholds.size == 0:
        raise ValueError("thresholds must contain at least one value.")

    y_prob = np.asarray(model.predict_proba(X))[:, 1]
    is_positive = np.asarray(y).ravel() == 1

    records = []
    for threshold in thresholds:
        flagged = y_prob >= threshold

        # Count the errors directly from boolean masks. Unlike unpacking a
        # confusion matrix built without explicit labels, this keeps working
        # when the labels and predictions collapse to a single class.
        fp = int(np.count_nonzero(flagged & ~is_positive))
        fn = int(np.count_nonzero(~flagged & is_positive))

        fn_cost = fn * false_negative_cost
        fp_cost = fp * false_positive_cost

        records.append({
            'Model Name': model_name,
            'Threshold': float(threshold),
            'Cost': fn_cost + fp_cost,
            'False Negative Count': fn,
            'False Negative Cost': fn_cost,
            'False Positive Count': fp,
            'False Positive Cost': fp_cost,
            'Auto Reject': int(np.count_nonzero(flagged)),
            'Auto Approve': int(np.count_nonzero(y_prob < threshold)),
        })

    cost_df = pd.DataFrame(records)

    # idxmin returns the first occurrence of the minimum, so ties are broken
    # deterministically in favour of the earliest threshold.
    return cost_df.loc[cost_df['Cost'].idxmin()]



In [98]:
# Threshold grid: start_threshold to end_threshold inclusive, in step_size increments.
start_threshold = 0.1
end_threshold = 0.9
step_size = 0.01

# np.arange with a fractional step accumulates floating-point error: grid
# values drift off the intended two-decimal cut-offs (e.g. 0.30000000000000004)
# and whether the endpoint is included depends on rounding. Build the grid
# from an explicit point count instead, then snap each value to two decimals
# (the precision of step_size).
n_thresholds = int(round((end_threshold - start_threshold) / step_size)) + 1
thresholds = np.round(np.linspace(start_threshold, end_threshold, n_thresholds), 2)

# One minimum-cost row per model.
best_threshold_per_model = [
    model_threshold_calculation(model, model_name, thresholds, X, y)
    for model_name, model in models
]

model_optimum_thresholds = pd.DataFrame(best_threshold_per_model)

In [102]:
# Display the per-model minimum-cost threshold rows.
model_optimum_thresholds

Unnamed: 0,Model Name,Threshold,Cost,False Negative Count,False Negative Cost,False Positive Count,False Positive Cost,Auto Reject,Auto Approve
40,Random Forest,0.5,24835,933,15861,641,8974,3103,6795
41,XGBoost,0.51,37337,1465,24905,888,12432,2818,7080
41,Logistic Regression,0.51,43367,1711,29087,1020,14280,2704,7194
40,Ensemble,0.5,32892,1304,22168,766,10724,2857,7041


In [101]:
# Write the per-model optimum thresholds to CSV; index=False leaves the
# DataFrame's row index out of the file.
model_optimum_thresholds.to_csv("../data/processed/model_optimum_thresholds.csv",index=False)