In [1]:
!pip uninstall tensorflow -y

Found existing installation: tensorflow 2.19.0
Uninstalling tensorflow-2.19.0:
  Successfully uninstalled tensorflow-2.19.0


In [2]:
pip install tensorflow shap lime dice-ml scikit-learn pandas

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dice-ml
  Downloading dice_ml-0.12-py3-none-any.whl.metadata (20 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting raiutils>=0.4.0 (from dice-ml)
  Downloading raiutils-0.4.2-py3-none-any.whl.metadata (1.4 kB)
Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (620.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dice_ml-0.12-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
!pip install tensorflow



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
import tensorflow.keras as tf
import shap
import lime
import lime.lime_tabular
import dice_ml
from dice_ml import Dice

# Load the dataset
file_path = '/content/diabetes.csv'
data = pd.read_csv(file_path)
data.dropna(inplace=True)
# Preprocessing: Encode categorical variables
categorical_columns = ['metformin','repaglinide','nateglinide','chlorpropamide','glimepiride','acetohexamide','glipizide','glyburide','tolbutamide','pioglitazone','rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide','citoglipton','glyburide-metformin','glipizide-metformin','glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone','insulin','District','type', 'group', 'gender', 'smoking_history', 'diabetesMed']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Scaling numerical features
scaler = StandardScaler()
numerical_columns = ['age', 'BMI', 'HbA1c_level', 'blood_glucose_level']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Splitting data into features and target
X = data.drop('diabetes', axis=1)
y = data['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Selection using ANOVA F-test
selector = SelectKBest(score_func=f_classif, k='all')
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Building the Deep Learning model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_selected.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training the model
history = model.fit(X_train_selected, y_train, epochs=20, batch_size=32, validation_data=(X_test_selected, y_test))




KeyError: 'metformin'

In [None]:
!pip cache purge


[0mFiles removed: 0


In [None]:
import numpy as np
# SHAP Explanation
# Select a random sample of the background data for SHAP
background_sample_size = 1000  # Choose a smaller sample size (e.g., 1000)
background_data = X_train_selected[np.random.choice(X_train_selected.shape[0], background_sample_size, replace=False)]

# SHAP Explanation
explainer_shap = shap.DeepExplainer(model, background_data)
shap_values = explainer_shap.shap_values(X_test_selected)

# Visualization
shap.summary_plot(shap_values, X_test_selected)



In [None]:
# LIME Explanation
explainer_lime = lime.lime_tabular.LimeTabularExplainer(X_train_selected, feature_names=X.columns, class_names=['No Diabetes', 'Diabetes'], discretize_continuous=True)

# Since the model output is a probability, you need to use the predict function that returns probabilities.
# Modify the prediction function for LIME to work with binary classification
def predict_proba_fn(x):
    return np.array([1 - model.predict(x).flatten(), model.predict(x).flatten()]).T

# Explain the first instance in the test set
lime_exp = explainer_lime.explain_instance(X_test_selected[0], predict_proba_fn, num_features=10)

# Show the explanation
lime_exp.show_in_notebook(show_all=False)



In [None]:
# Step 1: Identify categorical columns in the training data
categorical_columns = ['metformin','repaglinide','nateglinide','chlorpropamide','glimepiride','acetohexamide','glipizide','glyburide','tolbutamide','pioglitazone','rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide','citoglipton','glyburide-metformin','glipizide-metformin','glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone','insulin','District','type', 'group', 'gender', 'smoking_history', 'diabetesMed']
 # Add any other categorical columns that may cause issues

# Step 2: Validate and adjust each categorical column in query_instance
for col in categorical_columns:
    if col in query_instance.columns:
        valid_values = dice_data.data_df[col].unique()  # Valid values from the training data
        query_value = query_instance[col].iloc[0]  # Value in query_instance

        print(f"Valid '{col}' values from training data:", valid_values)
        print(f"Value in query_instance '{col}':", query_value)

        # Adjust the value if it's not valid
        if query_value not in valid_values:
            print(f"Adjusting '{col}' value in query_instance from {query_value} to {valid_values[0]}")
            query_instance[col] = valid_values[0]  # Replace with a valid value
        else:
            print(f"'{col}' value in query_instance is valid.")
    else:
        print(f"Column '{col}' not found in query_instance.")

# Step 3: Generate counterfactuals with the adjusted query instance
try:
    dice_exp = dice.generate_counterfactuals(query_instance, total_CFs=4, desired_class="opposite")
    dice_exp.visualize_as_dataframe()
except ValueError as e:
    print("Encountered an error with DiCE:", e)


In [None]:
# Results for SHAP, LIME, DiCE
shap.summary_plot(shap_values, X_test_selected)
lime_exp.show_in_notebook(show_all=False)
dice_exp.visualize_as_dataframe()

NameError: name 'shap' is not defined