In [1]:
import pandas as pd
import numpy as np

import os
from itertools import combinations

from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_distances


from preprocessing_pipeline import PreprocessingPipeline
from explanation_collection import explanation_collection
from differential_privacy import DifferentialPrivacy


KeyboardInterrupt: 

In [None]:
# Run configuration is read from environment variables so the notebook can be
# executed unattended, one run per (dataset, case, epsilon, repeat).
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        KeyError: if the variable is not set. The message names the variable,
            instead of the opaque ``int(None)`` / ``None.split`` failure that
            a bare ``os.environ.get`` would lead to.
    """
    value = os.environ.get(name)
    if value is None:
        raise KeyError(f"Required environment variable {name!r} is not set")
    return value


dataset_id = int(_require_env('DATASET_ID'))
# Comma-separated column names; surrounding whitespace is stripped and empty
# entries (e.g. produced by a trailing comma) are dropped.
numerical_columns = [
    item.strip()
    for item in _require_env('NUMERICAL_COLUMNS').split(',')
    if item.strip()
]
case_id = int(_require_env('CASE_ID'))
epsilon = int(_require_env('EPSILON'))
rep = int(_require_env('REPEAT'))

# Directory the result CSV files are written to.
path = "./results_CosDist"

# Example values for an interactive run:
# dataset_id = 891
# numerical_columns = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
# case_id = 0
# epsilon = 0


In [None]:
# Fetch the dataset identified by dataset_id from the UCI ML repository.
uci_dataset = fetch_ucirepo(id=dataset_id)

# data (as pandas dataframes)
X = uci_dataset.data.features
y = uci_dataset.data.targets

# Features and target(s) side by side in one frame.
data = pd.concat([X, y], axis=1)

# The first target column is the label used throughout the notebook.
target_col = y.columns[0]

# Every feature column not listed as numerical is treated as categorical.
# The list is built by walking X.columns rather than by a set difference:
# iteration order of a set of strings can differ between interpreter runs
# (hash randomisation), which would make the column order non-reproducible.
all_columns = set(X.columns)
numerical_set = set(numerical_columns)
categorical_columns = list(
    dict.fromkeys(col for col in X.columns if col not in numerical_set)
)

pipeline = PreprocessingPipeline()

In [None]:
# Steps applied once to the whole dataset, before it is split.
steps_prior = ['drop_missing_values', 'encode_categorical_variables']

X, y = pipeline.preprocess(
    data,
    target_col,
    numerical_columns,
    categorical_columns,
    steps_prior,
)

# Dataset 2 only: collapse the target to two classes
# (values 0 and 1 become 0, every other value becomes 1).
if dataset_id == 2:
    is_class_zero = (y == 0) | (y == 1)
    y = np.where(is_class_zero, 0, 1)

In [None]:
# Hold out 20% of the rows as the query set (fixed seed for a repeatable split).
X_o, X_q, y_o, y_q = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the label Series from the raw values so they receive a fresh 0..n-1
# index. If y_o / y_q are pandas objects they still carry the shuffled
# pre-split index, and the axis=1 concat below (which aligns on index) would
# pair labels with the wrong rows and introduce NaNs.
y_o_series = pd.Series(np.asarray(y_o).ravel(), name=target_col)
y_q_series = pd.Series(np.asarray(y_q).ravel(), name=target_col)

# Give the feature frames the same 0..n-1 index as the label Series.
X_o = X_o.reset_index(drop=True)
X_q = X_q.reset_index(drop=True)

# Features followed by the target as the last column.
df = pd.concat([X_o, y_o_series], axis=1)
df_q = pd.concat([X_q, y_q_series], axis=1)

In [None]:
# DP helper constructed with this run's epsilon.
dp = DifferentialPrivacy(epsilon)

# Researcher's frame: an untouched copy of the training data.
df_r = df.copy()

# Verifier's frame: passed through the DP helper, except that epsilon == 0
# means "skip DP" and yields a plain copy instead.
df_v = (
    df.copy()
    if epsilon == 0
    else dp.apply_differential_privacy(df, numerical_columns, categorical_columns, round_to_int=True)
)

# vPrime starts from an independent copy of the verifier's frame.
df_vPrime = df_v.copy()

In [None]:
def get_combination(case_id):
    """Translate a case number into a subset of the step numbers 1-4.

    The 15 non-empty subsets of (1, 2, 3, 4) are enumerated largest first:
    the single 4-element subset, then the 3-element ones, then pairs, then
    singletons, each group in the order produced by itertools.combinations.

    Args:
        case_id: position in that enumeration (0 to 14).

    Returns:
        The selected subset as a list of ints, or the string
        "Invalid case_id" when case_id falls outside the enumeration
        (no exception is raised).
    """
    pool = (1, 2, 3, 4)
    ordered_subsets = [
        subset
        for size in (4, 3, 2, 1)
        for subset in combinations(pool, size)
    ]

    if not (0 <= case_id < len(ordered_subsets)):
        return "Invalid case_id"
    return list(ordered_subsets[case_id])

In [None]:
# Candidate training-time steps; get_combination() refers to them by 1-based position.
steps = ['drop_all_duplicates', 'handle_outliers', 'scale_numerical_features_only', 'resample_data']
scaling_step = steps[2]

# r and vPrime use the subset selected by case_id.
steps_in_r = get_combination(case_id)
steps_in_vPrime = get_combination(case_id)
# v always runs the full list (steps_v below), so its index list says so too.
steps_in_v = list(range(1, len(steps) + 1))

steps_r = [steps[i-1] for i in steps_in_r]
steps_vPrime = [steps[i-1] for i in steps_in_vPrime]
steps_v = steps

# Query data only ever receives the scaling step, and only when the matching
# training data was scaled. Each query list is derived from the step list that
# is actually used for training, so a model is never fitted on scaled features
# and then queried with unscaled ones (previously steps_qv followed case_id
# although steps_v always includes scaling).
steps_qr = [scaling_step] if scaling_step in steps_r else None
steps_qvPrime = [scaling_step] if scaling_step in steps_vPrime else None
steps_qv = [scaling_step] if scaling_step in steps_v else None

In [None]:
def _fit_preprocess(frame, step_list):
    """Run the training-time steps in `step_list` on `frame`.

    With an empty/None step list the frame is only split into features
    (every column but the last) and target (the last column). The split is
    taken from `frame` itself, so a DP-perturbed frame stays perturbed.
    """
    if step_list:
        return pipeline.preprocess(frame, target_col, numerical_columns, categorical_columns, step_list, train=True)
    return frame.iloc[:, 0:-1], frame.iloc[:, -1]


def _query_preprocess(step_list, X_train):
    """Run the query-time steps in `step_list` on df_q, passing `X_train` to the pipeline.

    With an empty/None step list df_q is only split into features and target.
    """
    if step_list:
        return pipeline.preprocess(df_q, target_col, numerical_columns, categorical_columns, step_list, train=False, X_train=X_train)
    return df_q.iloc[:, 0:-1], df_q.iloc[:, -1]


# Training sets. Each variant falls back to its own frame when it has no steps
# (previously every fallback read `df`, ignoring df_vPrime / df_v).
X_r, y_r = _fit_preprocess(df_r, steps_r)
X_vPrime, y_vPrime = _fit_preprocess(df_vPrime, steps_vPrime)
X_v, y_v = _fit_preprocess(df_v, steps_v)

# For querying: same held-out rows, prepared against each variant's training features.
X_qr, y_qr = _query_preprocess(steps_qr, X_r)
X_qvPrime, y_qvPrime = _query_preprocess(steps_qvPrime, X_vPrime)
X_qv, y_qv = _query_preprocess(steps_qv, X_v)

In [None]:
# Researcher side: logistic regression fitted on (X_r, y_r).
model_r = LogisticRegression(random_state=42, max_iter=1000)
model_r.fit(X_r, y_r)

# Predict the query set and report accuracy against its labels.
y_pred_r = model_r.predict(X_qr)
accuracy_r = accuracy_score(y_qr, y_pred_r)

# NOTE(review): the printed label says "Without Preprocessing" even though X_r
# may have gone through preprocessing steps; label kept as-is.
print("Logistic Regression Without Preprocessing")
print("Accuracy:", accuracy_r)

In [None]:
# Verifier side (vPrime variant): logistic regression fitted on (X_vPrime, y_vPrime).
model_vPrime = LogisticRegression(random_state=42, max_iter=1000)
model_vPrime.fit(X_vPrime, y_vPrime)

# Predict the query set and report accuracy against its labels.
y_pred_vPrime = model_vPrime.predict(X_qvPrime)
accuracy_vPrime = accuracy_score(y_qvPrime, y_pred_vPrime)

# NOTE(review): same printed label as the researcher cell; kept as-is.
print("Logistic Regression Without Preprocessing")
print("Accuracy:", accuracy_vPrime)

In [None]:
# Verifier side (v variant): logistic regression fitted on (X_v, y_v).
model_v = LogisticRegression(random_state=42, max_iter=1000)
model_v.fit(X_v, y_v)

# Predict the query set and report accuracy against its labels.
y_pred_v = model_v.predict(X_qv)
accuracy_v = accuracy_score(y_qv, y_pred_v)

print("Logistic Regression With Preprocessing")
print("Accuracy:", accuracy_v)

In [None]:
# Collect explanations for each of the three models (r, vPrime, v).
# Every call receives that variant's training features, its query features and
# its fitted model; the result is wrapped in a DataFrame for later comparison.
# NOTE(review): presumably one explanation row per query row - confirm in
# explanation_collection.get_explanations.
exp_collect = explanation_collection()

# Researcher model.
explanations_r = exp_collect.get_explanations(X_r, X_qr, model_r)
df_explanations_r = pd.DataFrame(explanations_r)

# Verifier model, vPrime variant.
explanations_vPrime = exp_collect.get_explanations(X_vPrime, X_qvPrime, model_vPrime)
df_explanations_vPrime = pd.DataFrame(explanations_vPrime)

# Verifier model, v variant.
explanations_v = exp_collect.get_explanations(X_v, X_qv, model_v)
df_explanations_v = pd.DataFrame(explanations_v)

In [None]:
def get_CosDist(expA, expB, case_id):
    """Cosine distance between positionally matching rows of two frames.

    Row i of `expA` is compared with row i of `expB`, for every row position
    of `expA`.

    Args:
        expA: DataFrame whose rows are the first vectors; its length sets the
            number of comparisons.
        expB: DataFrame whose rows are the second vectors.
        case_id: value stored in the 'case' column of every output row.

    Returns:
        DataFrame with one row per row of `expA` and the columns
        'Cosine Distance' and 'case'.
    """
    def _row_as_matrix(frame, position):
        # cosine_distances is called on 2-D input, so shape the row as (1, n_columns).
        return frame.iloc[position].values.reshape(1, -1)

    distances = [
        cosine_distances(_row_as_matrix(expA, position), _row_as_matrix(expB, position))[0][0]
        for position in range(len(expA))
    ]

    result = pd.DataFrame(distances, columns=['Cosine Distance'])
    result['case'] = case_id
    return result

In [None]:
# Per-row cosine distances between explanation sets: r vs v, and v vs vPrime.
CosDist_rv = get_CosDist(expA=df_explanations_r, expB=df_explanations_v, case_id=case_id)
CosDist_vvPrime = get_CosDist(expA=df_explanations_v, expB=df_explanations_vPrime, case_id=case_id)

In [None]:
# Make sure the results directory exists: DataFrame.to_csv does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs(path, exist_ok=True)

# One file per (dataset, epsilon, case, repeat); r-vs-v goes to the "Verify"
# file and v-vs-vPrime to the "Train" file.
CosDist_rv.to_csv(f'{path}/dataset{dataset_id}_eps{epsilon}_CosDist_Verify_case{case_id}_rep{rep}.csv', index=False)
CosDist_vvPrime.to_csv(f'{path}/dataset{dataset_id}_eps{epsilon}_CosDist_Train_case{case_id}_rep{rep}.csv', index=False)