In [72]:
import joblib
import pandas as pd
import shap
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

In [73]:
model = joblib.load('final_model.pkl')

# Load the scaler
scaler = joblib.load('scaler.pkl')

# Load the SHAP input data
x_shap = pd.read_csv('x_shap.csv')
y_shap = pd.read_csv('y_shap.csv')

In [74]:
imputer = SimpleImputer(strategy='mean')  # You must fit this on *training* data ideally
x_shap_imputed = imputer.fit_transform(x_shap) 

In [75]:
x_shap_imputed

array([[ 1.47787198e-03, -7.18765530e-03, -3.78700375e-03, ...,
        -7.15389188e-04, -8.98904354e-05,  9.80118981e-03],
       [-1.05163557e-02, -8.87020876e-01,  1.15267119e+00, ...,
         3.16350111e-01,  1.39320746e+00, -2.44014145e-01],
       [ 6.44316517e-01,  1.12736918e+00,  1.45530343e+00, ...,
        -4.86592410e-01, -4.78618763e-01, -2.44014145e-01],
       ...,
       [-3.37932792e-01,  1.12736918e+00, -9.65754504e-01, ...,
        -4.86592410e-01, -1.41453187e+00, -2.44014145e-01],
       [-3.37932792e-01,  1.12736918e+00, -9.65754504e-01, ...,
         8.69379621e-02, -4.78618763e-01, -2.44014145e-01],
       [ 1.47787198e-03, -7.18765530e-03, -3.78700375e-03, ...,
        -7.15389188e-04, -8.98904354e-05,  9.80118981e-03]],
      shape=(56698, 14))

In [76]:
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit_transform(x_shap)

x_train_scaled = scaler.fit_transform(x_train_imputed)
background = x_train_scaled[:100]

explainer = shap.KernelExplainer(model.decision_function, background)

In [77]:
def explain_single_prediction(patient_features, feature_names, scaler, imputer, explainer, return_raw=False):
    """
    Explain a single prediction using SHAP for an SVM model with precomputed scaler, imputer, and explainer.
    """
    # Impute and scale patient's input
    patient_imputed = imputer.transform([patient_features])
    patient_scaled = scaler.transform(patient_imputed)

    # Get SHAP values
    shap_values = explainer.shap_values(patient_scaled)
    shap_values = shap_values[0]  # binary classification

    # Pair features with SHAP values
    feature_impacts = list(zip(feature_names, shap_values))
    feature_impacts_sorted = sorted(feature_impacts, key=lambda x: abs(x[1]), reverse=True)

    if return_raw:
        return shap_values

    explanations = []
    for feature, impact in feature_impacts_sorted:
        if impact > 0:
            explanations.append(f"The feature **{feature}** pushes towards the patient being diabetic (+{impact:.2f}).")
        else:
            explanations.append(f"The feature **{feature}** pushes towards the patient NOT being diabetic ({impact:.2f}).")

    
    return explanations



In [78]:
def predict_diabetes(model, scaler, HighBP, HighChol, HeartDiseaseorAttack, Stroke, Smoker, PhysActivity, DiffWalk, BMI, MentHlth, PhysHlth, GenHlth, Sex, Age, HvyAlcoholConsump):
    new_patient = [[HighBP, HighChol, HeartDiseaseorAttack, Stroke, Smoker, PhysActivity, DiffWalk, BMI, MentHlth, PhysHlth, GenHlth, Sex, Age, HvyAlcoholConsump]]
    new_patient_scaled = scaler.transform(new_patient)
    prediction = model.predict(new_patient_scaled)
    
    if prediction[0] == 1:
        print("⚠️  The patient is likely diabetic.")
    else:
        print("✅  The patient is likely not diabetic.")


In [79]:
feature_names = [
    "HighBP", "HighChol", "HeartDiseaseorAttack", "Stroke", "Smoker", "PhysActivity", "DiffWalk", "BMI", "MentHlth", "PhysHlth", "GenHlth", "Sex", "Age", "HvyAlcoholConsump"]


patient_input = [1,1,1,1,1,1,1,34,0,7,4,0,9,0]

explanations = explain_single_prediction(
    scaler=scaler,
    patient_features=patient_input,
    feature_names=feature_names,
    imputer=imputer,
    explainer=explainer
)

for sentence in explanations:
    print("🤖", sentence)


predict_diabetes(model,scaler,1,1,1,1,1,1,1,34,0,7,4,0,9,0)



  0%|          | 0/1 [00:00<?, ?it/s]

🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **MentHlth** pushes towards the 

# Testing Area

In [87]:
def detect_unexpected_shap_behavior(feature_names, shap_values, patient_features):
    expected_directions = {
        "HighBP": "positive",
        "HighChol": "positive",
        "HeartDiseaseorAttack": "positive",
        "Stroke": "positive",
        "Smoker": "positive",
        "PhysActivity": "negative",
        "DiffWalk": "positive",
        "BMI": "positive",
        "MentHlth": "positive",
        "PhysHlth": "positive",
        "GenHlth": "negative",  # 1 = excellent
        "Age": "positive",
        "HvyAlcoholConsump": "positive"
        # "Sex" is omitted as it's neutral/uncertain
    }

    issues = []

    for feature, shap_val, patient_feature in zip(feature_names, shap_values, patient_features):
        expected = expected_directions.get(feature)
        if expected:
            if expected == "positive" and patient_feature == 1  and shap_val < 0:
                issues.append(f"⚠️ Unexpected: {feature} reduced risk (SHAP={shap_val:.3f})")
            elif expected == "negative" and patient_feature==0 and shap_val > 0:
                issues.append(f"⚠️ Unexpected: {feature} increased risk (SHAP={shap_val:.3f})")

    return issues


In [83]:

df = pd.read_csv("cdc_diabetes_health_indicators.csv")
positive_df = df[df["Diabetes_binary"] == 1]


In [94]:
patient_input = [1,1,1,1,1,1,1,34,0,7,4,0,9,0]

raw = explain_single_prediction(
    scaler=scaler,
    patient_features=patient_input,
    feature_names=feature_names,
    imputer=imputer,
    explainer=explainer,
    return_raw=True
)


explanation = explain_single_prediction(
    scaler=scaler,
    patient_features=patient_input,
    feature_names=feature_names,
    imputer=imputer,
    explainer=explainer
)

print(raw)
print(explanation)



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

[ 0.18608228  0.04377382  0.2104382   0.16244736  0.          0.
  0.11524008  0.99783193  0.          0.17881727 -0.07133014  0.
  2.68755998  0.01240137]
['The feature **Age** pushes towards the patient being diabetic (+2.69).', 'The feature **BMI** pushes towards the patient being diabetic (+1.00).', 'The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).', 'The feature **HighBP** pushes towards the patient being diabetic (+0.19).', 'The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).', 'The feature **Stroke** pushes towards the patient being diabetic (+0.16).', 'The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).', 'The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).', 'The feature **HighChol** pushes towards the patient being diabetic (+0.04).', 'The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).', 'The feature **Smoker** pushes towards the 

In [99]:
for feature, shap_val, patient_feature in zip(feature_names, raw, patient_input):
    print(feature, shap_val, patient_feature)

HighBP 0.18608227840055258 1
HighChol 0.04377381966310882 1
HeartDiseaseorAttack 0.2104381959196054 1
Stroke 0.1624473619231359 1
Smoker 0.0 1
PhysActivity 0.0 1
DiffWalk 0.11524008073487943 1
BMI 0.9978319341352838 34
MentHlth 0.0 0
PhysHlth 0.17881726817044707 7
GenHlth -0.07133013641616692 4
Sex 0.0 0
Age 2.6875599814052653 9
HvyAlcoholConsump 0.012401366662901868 0


In [85]:
positive_df.shape

(35346, 22)

In [91]:
first_test=positive_df[:10]

In [92]:
first_test.shape

(10, 22)

In [93]:
# Loop through each positive diabetic sample
for index, row in first_test.iterrows():
    patient_input = row[feature_names].values.tolist()

    # Get explanation and extract raw SHAP values
    explanation = explain_single_prediction(
        scaler=scaler,
        patient_features=patient_input,
        feature_names=feature_names,
        imputer=imputer,
        explainer=explainer
    )
    
    # Extract raw SHAP values from the explanation
    raw_shap_values = explain_single_prediction(
        scaler=scaler,
        patient_features=patient_input,
        feature_names=feature_names,
        imputer=imputer,
        explainer=explainer,
        return_raw=True
    ) # Assuming explanations contain shap_values as a key

    # Now use detect_unexpected_shap_behavior with return_raw=True
    raw_output = detect_unexpected_shap_behavior(
        shap_values=raw_shap_values,  # Pass the raw SHAP values here
        feature_names=feature_names,
        patient_features=patient_input
    )

    print(f"\nPatient Index: {index}")
    print("Explanations:")
    for sentence in explanations:
        print("🤖", sentence)




  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]


Patient Index: 8
Explanations:
🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature 



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]


Patient Index: 10
Explanations:
🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]


Patient Index: 13
Explanations:
🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]


Patient Index: 17
Explanations:
🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]


Patient Index: 23
Explanations:
🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]


Patient Index: 26
Explanations:
🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]


Patient Index: 27
Explanations:
🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]


Patient Index: 28
Explanations:
🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]


Patient Index: 30
Explanations:
🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]


Patient Index: 34
Explanations:
🤖 The feature **Age** pushes towards the patient being diabetic (+2.69).
🤖 The feature **BMI** pushes towards the patient being diabetic (+1.00).
🤖 The feature **HeartDiseaseorAttack** pushes towards the patient being diabetic (+0.21).
🤖 The feature **HighBP** pushes towards the patient being diabetic (+0.19).
🤖 The feature **PhysHlth** pushes towards the patient being diabetic (+0.18).
🤖 The feature **Stroke** pushes towards the patient being diabetic (+0.16).
🤖 The feature **DiffWalk** pushes towards the patient being diabetic (+0.12).
🤖 The feature **GenHlth** pushes towards the patient NOT being diabetic (-0.07).
🤖 The feature **HighChol** pushes towards the patient being diabetic (+0.04).
🤖 The feature **HvyAlcoholConsump** pushes towards the patient being diabetic (+0.01).
🤖 The feature **Smoker** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature **PhysActivity** pushes towards the patient NOT being diabetic (0.00).
🤖 The feature