In [None]:
from simplified_fraud_detection import SimpleFraudDetector
import shap
import pandas as pd
import numpy as np
import json

# Restore the previously trained detector from disk.
detector = SimpleFraudDetector()
detector.load_model('simple_fraud_detection_model.pkl')

# Raw input tables: claims, patients and providers.
claims = pd.read_csv('enhanced_medicare_data_claims.csv')
patients = pd.read_csv('enhanced_medicare_data_patients.csv')
providers = pd.read_csv('enhanced_medicare_data_providers.csv')

# Run the detector's own preprocessing and feature construction.
# Note the argument order expected by preprocess_data: claims, providers, patients.
claims_p, providers_p, patients_p = detector.preprocess_data(
    claims, providers, patients
)
X, y = detector.prepare_features(claims_p, providers_p, patients_p)

# Positional index of the single claim to explain.
claim_index = 0
# Slice (rather than index) so the sample keeps a leading row axis.
X_sample = X[claim_index:claim_index + 1]

# Identifiers for the selected claim; fall back to synthetic IDs when the
# preprocessed row has no such entry.
claim_row = claims_p.iloc[claim_index]
claim_id = claim_row.get('claim_id', f'CLM{claim_index+1}')
provider_id = claim_row.get('provider_id', f'PRV{claim_index+1}')

# Score the claim with the XGBoost model held by the detector; index 1 of the
# probability row is used as the fraud probability.
xgb_model = detector.models['xgb']
class_probabilities = xgb_model.predict_proba(X_sample)[0]
fraud_score = float(class_probabilities[1])
if fraud_score >= 0.5:
    predicted_label = "FRAUD"
else:
    predicted_label = "NON-FRAUD"

# Per-feature SHAP attributions for the one sampled row.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_sample)[0]

# Human-readable names for the SHAP values, in order.
# NOTE(review): assumed to match the column order produced by
# detector.prepare_features — confirm against that implementation.
feature_names = [
    "claim_amount", "length_of_stay", "patient_age", "claim_duration",
    "procedure_code_encoded", "diagnosis_code_encoded", "gender_encoded",
    "comorbidity_count", "fraud_tendency", "peer_avg_amount",
    "peer_std_amount", "peer_avg_los", "peer_total_claims"
]

# zip() stops at the shorter input, so a length mismatch would silently drop
# or mislabel contributions. Fail loudly instead.
if len(shap_values) != len(feature_names):
    raise ValueError(
        f"Expected {len(feature_names)} SHAP values to match feature_names, "
        f"got {len(shap_values)}"
    )

# Map a display label for each feature to its rounded SHAP contribution.
# NOTE: the "<= ..." thresholds below are fixed label text chosen by substring
# match on the feature name; they are not computed from the claim's values.
feature_contributions = {}
for name, shap_val in zip(feature_names, shap_values):
    if "amount" in name:
        condition = f"{name} <= 1000.00"
    elif "stay" in name:
        condition = f"{name} <= 5.00"
    elif "age" in name:
        condition = f"{name} <= 65"
    else:
        condition = f"{name}"

    feature_contributions[condition] = round(float(shap_val), 4)

# Claim value reported in the metrics. The first column of X_sample is the
# model-input feature and can be negative (the recorded run shows about -0.46),
# so prefer the claim row's own 'claim_amount' when it is present.
# NOTE(review): presumably claims_p['claim_amount'] is the unscaled amount —
# confirm against detector.preprocess_data.
raw_claim_amount = claims_p.iloc[claim_index].get('claim_amount')
if raw_claim_amount is not None and pd.notna(raw_claim_amount):
    avg_claim_value = float(raw_claim_amount)
elif X_sample.shape[1] > 0:
    # Original behaviour, kept as a fallback when the column is unavailable.
    avg_claim_value = float(X_sample[0][0])
else:
    avg_claim_value = 0.0

# NOTE(review): tp/fp/fn and the cost figures are hard-coded constants; they
# are not derived from the prediction or from the labels in y.
metrics = {
    "tp": 0,
    "fp": 0,
    "fn": 1,
    "avg_claim_value": avg_claim_value,
    "personnel_cost": 5000,
    "infra_cost": 1500,
    "compliance_cost": 800
}

# Full explanation payload for the selected claim.
explanation_json = {
    "claim_id": claim_id,
    "provider_id": provider_id,
    "fraud_score": fraud_score,
    "predicted_label": predicted_label,
    "feature_contributions": feature_contributions,
    "metrics": metrics
}

# Persist the payload, then echo it to stdout.
with open("fraud_explanation.json", "w", encoding="utf-8") as f:
    json.dump(explanation_json, f, indent=4)

print("✅ Explanation saved to fraud_explanation.json")
print(json.dumps(explanation_json, indent=4))


Model loaded from simple_fraud_detection_model.pkl
✅ Explanation saved to fraud_explanation.json
{
    "claim_id": "CLM0000000001",
    "provider_id": "PRV001253",
    "fraud_score": 0.1914990246295929,
    "predicted_label": "NON-FRAUD",
    "feature_contributions": {
        "claim_amount <= 1000.00": -1.4445,
        "length_of_stay <= 5.00": 0.3402,
        "patient_age <= 65": 0.1063,
        "claim_duration": 0.0,
        "procedure_code_encoded": 0.4216,
        "diagnosis_code_encoded": 0.3477,
        "gender_encoded": 0.0057,
        "comorbidity_count": 0.0123,
        "fraud_tendency": 0.0561,
        "peer_avg_amount <= 1000.00": -0.0089,
        "peer_std_amount <= 1000.00": 0.0004,
        "peer_avg_los": -0.0801,
        "peer_total_claims": 0.0
    },
    "metrics": {
        "tp": 0,
        "fp": 0,
        "fn": 1,
        "avg_claim_value": -0.46298859948090226,
        "personnel_cost": 5000,
        "infra_cost": 1500,
        "compliance_cost": 800
    }
}
