# Sensitivity Analysis for Alzheimer's

In [None]:
from SensitivityAnalysis import SensitivityAnalysis
import pandas as pd
import pickle
import dice_ml
from dice_ml.utils import helpers
from sklearn.model_selection import train_test_split

# Load the pre-trained model. The file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes.
# NOTE: unpickling can execute arbitrary code; only load model files that come
# from a trusted source.
with open('Alzheimer/Models/random_forest_classifier.sav', 'rb') as model_file:
    model = pickle.load(model_file)
# Load whole Alzheimer dataset
dataset = pd.read_csv("Alzheimer/Dataset/alzheimer_disease.csv")
# Select only the features that are used in the model, plus the 'Diagnosis'
# column that is later used as the outcome.
dataset = dataset[[
    'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
    'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
    'SystolicBP', 'DiastolicBP',
    'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides',
    'MMSE', 'FunctionalAssessment', 'MemoryComplaints', 'BehavioralProblems', 'ADL',
    'Diagnosis',
]]
# Load the affected dataset (possible values: cluster_1.csv, cluster_2.csv, cluster_3.csv, prediceted_target_1_instances.csv)
affected_dataset = pd.read_csv("Alzheimer/Dataset/cluster_2.csv")

# --- RL framework configuration ---
# One row per feature the RL framework is allowed to change:
# (feature name, feature type, minimum value, maximum value).
# The four parallel lists below are derived from this table so that names,
# types and bounds always stay aligned.
_rl_feature_specs = (
    ("FunctionalAssessment", "con", 0, 10),
    ("ADL", "con", 0, 10),
    ("MMSE", "con", 0, 30),
)
features_to_change, features_types, mins, maxs = (
    list(column) for column in zip(*_rl_feature_specs)
)

# Counterfactuals generated by the RL framework; each inner list holds one
# value per entry of features_to_change, in the same order.
counterfactuals = [
    [2.049, 2.846, 8.077],
    [2.734, 1.763, 9.127],
    [0.299, 3.581, 9.525],
    [2.157, 2.511, 21.475],
]
# Class label the counterfactuals are meant to reach.
desired_label = 0


# --- DiCE configuration ---
# Type of every Alzheimer's feature used by DiCE ("con", "cat" or "ord").
# NOTE(review): the pairing with column names assumes the list follows the
# order of the feature columns selected from the dataset (every column except
# the outcome) - confirm against SensitivityAnalysis.
_alzheimer_feature_types = {
    "Age": "con",
    "Gender": "cat",
    "Ethnicity": "cat",
    "EducationLevel": "ord",
    "BMI": "con",
    "AlcoholConsumption": "con",
    "PhysicalActivity": "con",
    "DietQuality": "con",
    "SleepQuality": "con",
    "SystolicBP": "con",
    "DiastolicBP": "con",
    "CholesterolTotal": "con",
    "CholesterolLDL": "con",
    "CholesterolHDL": "con",
    "CholesterolTriglycerides": "con",
    "MMSE": "con",
    "FunctionalAssessment": "con",
    "MemoryComplaints": "cat",
    "BehavioralProblems": "cat",
    "ADL": "con",
}
dataset_features_types = list(_alzheimer_feature_types.values())
# Name of the target column.
outcome = "Diagnosis"

# For Nice
# Separate the feature columns from the 'Diagnosis' target and hold out 20% of
# the rows; the split is made reproducible with a fixed random_state.
X = dataset.drop('Diagnosis', axis=1)
y = dataset['Diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.20, random_state=40)


def predict_fn(x):
    """Return the class probabilities predicted by the loaded model for *x*.

    `model` is looked up when the function is called, not when it is defined.
    """
    return model.predict_proba(x)

# Mode switch for the analysis:
#   True  -> generate CFs using DiCE and Nice and calculate their Gower distance
#   False -> find the Gower distance for the RL generated CFs
dice_nice = True

# Sensitivity Analysis
# NOTE(review): the Adult cell in this file constructs SensitivityAnalysis
# without dataset_features_types and passes it to set_dice_data/set_nice_data
# instead. The two call patterns differ - confirm which one matches the current
# SensitivityAnalysis signature.
sa = SensitivityAnalysis(
    dataset,
    affected_dataset,
    features_to_change,
    features_types,
    dataset_features_types,
    mins,
    maxs,
    model,
    desired_label,
    dice_nice,
    model_type='sklearn',
)
sa.set_dice_data(outcome)
sa.set_nice_data(X_train, y_train, predict_fn)

# Output CSV path handed to find_best_cf_for_individuals.
# To find the best CF (RL Framework) for each individual and its Gower
# distance, use a path such as
#   "SensitivityAnalysis/Alzheimer/FairnessMetric/cluster_0.csv"
# replacing FairnessMetric with one of EF-Micro, EF-Macro, ECR or EF-ECR.
output_csv_path = "SensitivityAnalysis/Alzheimer/DiceNice/cluster_2.csv"

sa.find_best_cf_for_individuals(affected_dataset, counterfactuals, output_csv_path)

# Sensitivity Analysis for Adult

In [None]:
from SensitivityAnalysis import SensitivityAnalysis
import pandas as pd
import numpy as np
import pickle
import dice_ml
from dice_ml.utils import helpers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


# Load the pre-trained model. The file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes.
# NOTE: unpickling can execute arbitrary code; only load model files that come
# from a trusted source.
with open('Adult/Models/random_forest_classifier.sav', 'rb') as model_file:
    model = pickle.load(model_file)
# Load whole Adult dataset
dataset = pd.read_csv("Adult/Dataset/adult.csv")
# Load the affected dataset (possible values: cluster_1.csv, cluster_2.csv, cluster_3.csv, prediceted_target_0_instances.csv)
affected_dataset = pd.read_csv("Adult/Dataset/cluster_2.csv")

# Preprocess the dataset
# '?' marks a missing value: turn it into NaN, then drop every row that has a
# missing value and every duplicated row.
dataset = dataset.replace('?', np.nan)
dataset.dropna(how='any', inplace=True)
dataset.drop_duplicates(inplace=True)

# Binary encodings: income ('<=50K' -> 0, '>50K' -> 1),
# race ('White' -> 0, anything else -> 1), gender ('Male' -> 0, anything else -> 1).
dataset['income'] = dataset['income'].replace({'<=50K': 0, '>50K': 1})
dataset['race'] = dataset['race'].ne('White').astype('int64')
dataset['gender'] = dataset['gender'].ne('Male').astype('int64')
dataset.rename(columns={'gender': 'sex'}, inplace=True)

# Integer-encode the remaining categorical columns. The same encoder object is
# refit for each column in turn, so afterwards it only holds the classes of the
# last column processed.
label_encoder = LabelEncoder()
for categorical_column in (
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'native-country',
):
    dataset[categorical_column] = label_encoder.fit_transform(dataset[categorical_column])

# --- RL framework configuration ---
# One row per feature the RL framework is allowed to change:
# (feature name, feature type, minimum value, maximum value).
# The four parallel lists below are derived from this table so that names,
# types and bounds always stay aligned.
_rl_feature_specs = (
    ("capital-gain", "con", 0, 99999),
    ("hours-per-week", "con", 1, 99),
    ("educational-num", "ord", 1, 16),
)
features_to_change, features_types, mins, maxs = (
    list(column) for column in zip(*_rl_feature_specs)
)

# Counterfactuals generated by the RL framework; each inner list holds one
# value per entry of features_to_change, in the same order.
# NOTE(review): the second counterfactual sets hours-per-week to 0, which is
# below the minimum of 1 declared above - confirm this is intended.
counterfactuals = [
    [9675, 5, 3],
    [10816, 0, 1],
    [8173, 10, 4],
]
# Class label the counterfactuals are meant to reach.
desired_label = 1

# --- DiCE configuration ---
# Type of every Adult feature used by DiCE ("con", "cat" or "ord").
# NOTE(review): presumably listed in the column order of the dataset without
# the outcome column - verify against adult.csv.
dataset_features_types = [
    "con", "cat", "con", "ord", "ord",
    "cat", "cat", "cat", "cat", "cat",
    "con", "con", "con", "cat",
]
# Name of the target column.
outcome = "income"

# For Nice
# Separate the feature columns from the 'income' target and hold out 20% of the
# rows; the split is stratified on the target and made reproducible with a
# fixed random_state.
X = dataset.drop('income', axis=1)
y = dataset['income']
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.20, random_state=40, stratify=y)


def predict_fn(x):
    """Return the class probabilities predicted by the loaded model for *x*.

    `model` is looked up when the function is called, not when it is defined.
    """
    return model.predict_proba(x)

# Mode switch for the analysis:
#   True  -> generate CFs using DiCE and Nice and calculate their Gower distance
#   False -> find the Gower distance for the RL generated CFs
dice_nice = True

# Sensitivity Analysis
# NOTE(review): the Alzheimer's cell in this file passes dataset_features_types
# to the SensitivityAnalysis constructor and calls set_dice_data/set_nice_data
# without it. The two call patterns differ - confirm which one matches the
# current SensitivityAnalysis signature.
sa = SensitivityAnalysis(
    dataset,
    affected_dataset,
    features_to_change,
    features_types,
    mins,
    maxs,
    model,
    desired_label,
    dice_nice,
    model_type='sklearn',
)
sa.set_dice_data(dataset_features_types, outcome)
sa.set_nice_data(X_train, y_train, predict_fn, dataset_features_types)

# Output CSV path handed to find_best_cf_for_individuals.
# To find the best CF (RL Framework) for each individual and its Gower
# distance, use a path such as
#   "SensitivityAnalysis/Adult/FairnessMetric/cluster_0.csv"
# replacing FairnessMetric with one of EF-Micro, EF-Macro, ECR or EF-ECR.
output_csv_path = "SensitivityAnalysis/Adult/DiceNice/cluster_2.csv"

sa.find_best_cf_for_individuals(affected_dataset, counterfactuals, output_csv_path)

# Find Mean Gower Distance

In [None]:
# Find the mean of a column in a dataset across all rows

import pandas as pd
import numpy as np

# Load the dataset from Alzheimer's or Adult
data = pd.read_csv('SensitivityAnalysis/Adult/DiceNice/cluster_2.csv')
# Column to average; one of: "NICE CF Gower" / "DiCE CF Gower" / "GFCF Gower"
tmp = "DiCE CF Gower"

# Only compute the mean when the requested column is present
if tmp in data.columns:
    # Convert the column to numeric, coercing errors to NaN (handles non-numeric values)
    data[tmp] = pd.to_numeric(data[tmp], errors="coerce")

    # Keep only rows with a finite value: this drops NaN (including entries
    # that could not be converted above) as well as +/- infinity
    filtered_data = data[np.isfinite(data[tmp])]

    print(filtered_data[tmp])
    print(filtered_data.shape[0])

    # Calculate the mean of the selected column over the remaining rows
    mean_value = filtered_data[tmp].mean()
    print(f"The mean of '{tmp}' is: {mean_value}")
else:
    print(f"The column '{tmp}' does not exist in the dataset.")
    # List what is available so a misspelled column name is easy to spot
    print(f"Available columns: {list(data.columns)}")