In [1]:
import pandas as pd
import numpy as np

import os
from itertools import combinations

from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from preprocessing_pipeline import PreprocessingPipeline
from explanation_collection import explanation_collection
from differential_privacy import DifferentialPrivacy


In [2]:
# Run configuration, read from environment variables so the notebook can be
# executed in batch (one run per dataset / case / epsilon / repetition).
def _require_env(name):
    """Return environment variable `name`, raising a clear error if it is unset."""
    value = os.environ.get(name)
    if value is None:
        raise RuntimeError(f"Required environment variable {name} is not set")
    return value


dataset_id = int(_require_env('DATASET_ID'))
# Comma-separated column names; surrounding whitespace and empty entries
# (e.g. from a trailing comma) are dropped.
numerical_columns = [
    item.strip()
    for item in _require_env('NUMERICAL_COLUMNS').split(',')
    if item.strip()
]
case_id = int(_require_env('CASE_ID'))
# NOTE(review): int() rejects fractional values such as "0.5"; confirm that
# only integer epsilons are intended.
epsilon = int(_require_env('EPSILON'))
rep = int(_require_env('REPEAT'))

# Directory the result CSVs are written to.
path = "./results_Exp"

# Example configuration for a manual run:
# dataset_id = 891
# numerical_columns = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
# case_id = 0
# epsilon = 0


In [3]:
# Fetch the dataset selected by dataset_id from the UCI repository.
uci_dataset = fetch_ucirepo(id=dataset_id)

# Features and targets (as pandas dataframes).
X = uci_dataset.data.features
y = uci_dataset.data.targets

data = pd.concat([X, y], axis=1)

# The first target column is used as the prediction target.
target_col = y.columns[0]

# Every feature column that is not listed as numerical is treated as categorical.
# The list is built in X.columns order: a set difference of strings has an
# order that can change from one interpreter run to the next (hash
# randomisation), which would make the column order non-reproducible.
all_columns = set(X.columns)
numerical_set = set(numerical_columns)
categorical_columns = [col for col in X.columns if col not in numerical_set]

pipeline = PreprocessingPipeline()

In [4]:
# Steps run once on the full dataset, before it is split into training and
# query parts. The names are interpreted by PreprocessingPipeline.preprocess.
steps_prior = ['drop_missing_values', 'encode_categorical_variables']

X, y = pipeline.preprocess(
    data,
    target_col,
    numerical_columns,
    categorical_columns,
    steps_prior,
)

# Dataset 2 only: collapse the target to two classes, mapping labels 0 and 1
# to 0 and every other label to 1.
if dataset_id == 2:
    is_zero_or_one = (y == 0) | (y == 1)
    y = np.where(is_zero_or_one, 0, 1)

Missing Values Dropped: 0 rows removed
Percentage of rows dropped due to missing values: 0.00%
number of categorical columns '14


In [5]:
# Split data into the training part (suffix _o) and the query part (suffix _q).
X_o, X_q, y_o, y_q = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the targets to named pandas Series with a fresh 0..n-1 index.
# If y is already a Series, pd.Series(y_o) keeps the shuffled index from the
# split; without the reset, the concat below would align on index labels and
# pair features with the wrong targets.
y_o_series = pd.Series(y_o, name=target_col).reset_index(drop=True)
y_q_series = pd.Series(y_q, name=target_col).reset_index(drop=True)

# Give the feature frames the same fresh index so rows line up positionally.
X_o = X_o.reset_index(drop=True)
X_q = X_q.reset_index(drop=True)

# Combine features and target into single DataFrames (target is the last column).
df = pd.concat([X_o, y_o_series], axis=1)

df_q = pd.concat([X_q, y_q_series], axis=1)

In [6]:
dp = DifferentialPrivacy(epsilon)

# Researcher's training frame: an unmodified copy.
df_r = df.copy()

# Verifier's training frame: passed through dp.apply_differential_privacy,
# except when epsilon is 0, where it is a plain copy as well.
df_v = (
    df.copy()
    if epsilon == 0
    else dp.apply_differential_privacy(df, numerical_columns, categorical_columns, round_to_int=True)
)

In [7]:
def get_combination(case_id):
    """Map a case id to one non-empty combination of the numbers 1-4.

    Combinations are enumerated from the largest size to the smallest (all
    four elements, then every triple, pair and single), each size in the
    order produced by ``itertools.combinations``. This gives 15 cases:
    id 0 is ``[1, 2, 3, 4]`` and id 14 is ``[4]``.

    Args:
        case_id: Index into that enumeration, ``0 <= case_id <= 14``.

    Returns:
        list: The selected elements, in ascending order.

    Raises:
        ValueError: If ``case_id`` is outside the valid range. (Previously a
            sentinel string was returned, which callers then iterated over
            character by character and failed on with an unrelated error.)
    """
    elements = [1, 2, 3, 4]

    # All non-empty combinations, largest size first.
    all_combinations = [
        combo
        for size in range(len(elements), 0, -1)
        for combo in combinations(elements, size)
    ]

    if not 0 <= case_id < len(all_combinations):
        raise ValueError(
            f"Invalid case_id {case_id!r}: expected an integer in "
            f"[0, {len(all_combinations) - 1}]"
        )
    return list(all_combinations[case_id])

In [8]:
# Optional pipeline steps; get_combination() refers to them by 1-based position.
steps = ['drop_all_duplicates', 'handle_outliers', 'scale_numerical_features_only', 'resample_data']

# Researcher (r) and verifier (v) use the same combination for a given case_id.
steps_in_r = get_combination(case_id)
steps_in_v = get_combination(case_id)

# Translate the 1-based positions into step names.
steps_r = [steps[position - 1] for position in steps_in_r]
steps_v = [steps[position - 1] for position in steps_in_v]

# The query data only ever gets the scaling step (position 3), and only when
# that step was selected for training; otherwise no query steps are run.
steps_qr = [steps[2]] if 3 in steps_in_r else None
steps_qv = [steps[2]] if 3 in steps_in_v else None

In [9]:
# Apply the selected preprocessing steps to the researcher (df_r) and
# verifier (df_v) training frames. With no steps selected, each side falls
# back to its OWN frame split into features (all but the last column) and
# target (last column). The fallbacks previously read from df, which would
# have discarded the verifier's differentially-private data.
if steps_r:
    X_r, y_r = pipeline.preprocess(df_r, target_col, numerical_columns, categorical_columns, steps_r, train=True)
else:
    X_r, y_r = df_r.iloc[:, 0:-1], df_r.iloc[:, -1]

if steps_v:
    X_v, y_v = pipeline.preprocess(df_v, target_col, numerical_columns, categorical_columns, steps_v, train=True)
else:
    X_v, y_v = df_v.iloc[:, 0:-1], df_v.iloc[:, -1]

# For querying: run the query-time steps (if any) on the held-out frame,
# passing the corresponding training features as X_train.
# NOTE(review): X_r / X_v are the already-preprocessed training features;
# confirm that preprocess(train=False) expects them rather than the raw
# training data when fitting query-time transforms.
if steps_qr:
    X_qr, y_qr = pipeline.preprocess(df_q, target_col, numerical_columns, categorical_columns, steps_qr, train=False, X_train=X_r)
else:
    X_qr, y_qr = df_q.iloc[:, 0:-1], df_q.iloc[:, -1]

if steps_qv:
    X_qv, y_qv = pipeline.preprocess(df_q, target_col, numerical_columns, categorical_columns, steps_qv, train=False, X_train=X_v)
else:
    X_qv, y_qv = df_q.iloc[:, 0:-1], df_q.iloc[:, -1]

Duplicates Dropped: 17359 rows removed
Percentage of rows dropped due to duplicates: 8.55%
Outliers: 15233 rows removed
Percentage of rows dropped due to outliers: 8.21%
number of numerical columns '7
Mean relative change for each feature:
 BMI          0.970887
GenHlth      0.780531
MentHlth     0.513143
PhysHlth     0.599856
Age          0.935646
Education    0.858547
Income       0.911513
dtype: float64
Overall mean relative change: 0.7957321294793795
Percentage change in row count due to resampling: 71.44%
Duplicates Dropped: 17359 rows removed
Percentage of rows dropped due to duplicates: 8.55%
Outliers: 15233 rows removed
Percentage of rows dropped due to outliers: 8.21%
number of numerical columns '7
Mean relative change for each feature:
 BMI          0.970887
GenHlth      0.780531
MentHlth     0.513143
PhysHlth     0.599856
Age          0.935646
Education    0.858547
Income       0.911513
dtype: float64
Overall mean relative change: 0.7957321294793795
Percentage change in row 

In [10]:
# Researcher Side
# Train a Logistic Regression classifier on the researcher's training data
model_r = LogisticRegression(max_iter=1000, random_state=42)
model_r.fit(X_r, y_r)

# Predict on the researcher's query set and evaluate
y_pred_r = model_r.predict(X_qr)

# Label the result by side: X_r has been through the steps_r preprocessing,
# so the former "Without Preprocessing" label was inaccurate.
print("Logistic Regression (Researcher side)")
print("Accuracy:", accuracy_score(y_qr, y_pred_r))

Logistic Regression Without Preprocessing
Accuracy: 0.13790996531062757


In [11]:
# Verifier Side
# Train a Logistic Regression classifier on the verifier's training data
model_v = LogisticRegression(max_iter=1000, random_state=42)
model_v.fit(X_v, y_v)

# Predict on the verifier's query set and evaluate
y_pred_v = model_v.predict(X_qv)

# Label the result by side: researcher and verifier both apply the same
# preprocessing steps, so "With Preprocessing" did not distinguish this run.
print("Logistic Regression (Verifier side)")
print("Accuracy:", accuracy_score(y_qv, y_pred_v))

Logistic Regression With Preprocessing
Accuracy: 0.13790996531062757


In [12]:
# Researcher side: collect explanations for the query features X_qr from
# model_r (with X_r as the training features) and tabulate them.
exp_collect = explanation_collection()
explanations_r = exp_collect.get_explanations(X_r, X_qr, model_r)

df_explanations_r = pd.DataFrame(data=explanations_r)

  1%|▍                                      | 500/50736 [00:10<17:21, 48.24it/s]


In [13]:
# Verifier side: same collection as for the researcher, using the verifier's
# training features, query features and model.
explanations_v = exp_collect.get_explanations(X_v, X_qv, model_v)

df_explanations_v = pd.DataFrame(data=explanations_v)

  1%|▍                                      | 500/50736 [00:10<17:19, 48.33it/s]


In [14]:
# Make sure the output directory exists before writing; to_csv does not
# create missing parent directories.
os.makedirs(path, exist_ok=True)

# Tag each result table with the case id, then write one CSV per side
# (resR = researcher, resV = verifier). File names encode dataset, epsilon,
# case and repetition.
df_explanations_r['case'] = case_id
df_explanations_r.to_csv(f'{path}/dataset{dataset_id}_eps{epsilon}_resR_case{case_id}_rep{rep}.csv', index=False)

df_explanations_v['case'] = case_id
df_explanations_v.to_csv(f'{path}/dataset{dataset_id}_eps{epsilon}_resV_case{case_id}_rep{rep}.csv', index=False)

NameError: name 'path' is not defined