In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import joblib
import os

# Preprocessing cell: loads the raw overdose CSV, keeps the death-count
# indicator rows, scales/encodes the features and the target, persists the
# fitted transformers under artifacts/, and writes the model-ready table to CSV.
FILE_SUFFIX = "DP1"

print("Step 1: Loading the dataset.")
df = pd.read_csv('../datasets/drug_overdose.csv')

print("Step 2: Cleaning the data and engineering features.")

# Keep only the rows for the death-count indicator; .copy() detaches the
# result from the raw frame so later edits never touch a view.
df = df[df['Indicator'] == 'Number of Drug Overdose Deaths'].copy()

# Rows without a target value cannot be used for regression.
df = df.dropna(subset=['Data Value'])

cols_to_drop = ['State', 'Period', 'Footnote', 'Footnote Symbol',
                'Predicted Value', 'Percent Complete', 'Percent Pending Investigation']
# Reassign instead of mutating in place so each statement has explicit lineage.
df = df.drop(columns=cols_to_drop)

print("Step 3: Separating features (X) and target (y).")
y = df['Data Value']
X = df[['State Name', 'Year']]

print("Step 4: Identifying and preprocessing numerical and categorical columns.")
categorical_cols = ['State Name']
numeric_cols = ['Year']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

# ColumnTransformer only returns a sparse matrix when the stacked output's
# density is below its sparse_threshold; otherwise the result is already a
# dense ndarray (which has no .toarray()). Handle both cases.
X_transformed = preprocessor.fit_transform(X)
if hasattr(X_transformed, "toarray"):
    X_processed = X_transformed.toarray()
else:
    X_processed = np.asarray(X_transformed)

# StandardScaler expects a 2-D input, hence the reshape to a single column.
y_scaler = StandardScaler()
y_processed = y_scaler.fit_transform(y.values.reshape(-1, 1))

os.makedirs("artifacts", exist_ok=True)

# Persist the fitted transformers so the same scaling/encoding can be reloaded later.
joblib.dump(preprocessor.named_transformers_['num'], f"artifacts/feature_scaler{FILE_SUFFIX}.joblib")
print(f"Saved numeric feature scaler to artifacts/feature_scaler{FILE_SUFFIX}.joblib")

joblib.dump(preprocessor.named_transformers_['cat'], f"artifacts/onehot_encoder{FILE_SUFFIX}.joblib")
print(f"Saved one-hot encoder to artifacts/onehot_encoder{FILE_SUFFIX}.joblib")

joblib.dump(y_scaler, f"artifacts/target_scaler{FILE_SUFFIX}.joblib")
print(f"Saved target scaler to artifacts/target_scaler{FILE_SUFFIX}.joblib")

print("\nStep 5: Combining and saving the cleaned data to a CSV file.")
feature_names = preprocessor.get_feature_names_out()

# A fresh frame with a default index: the scaled target is a plain array, so
# rows line up by position rather than by the filtered frame's original index.
cleaned_df = pd.DataFrame(X_processed, columns=feature_names)
# Flatten the (n, 1) scaler output so the column is assigned as a 1-D vector.
cleaned_df['scaled_data_value'] = y_processed.ravel()

cleaned_df.to_csv(f'cleaned_overdose_data{FILE_SUFFIX}.csv', index=False)
print(f"File 'cleaned_overdose_data{FILE_SUFFIX}.csv' has been saved successfully.")

print("\nData preprocessing complete.")

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import shap
import os

class RegressionNN(nn.Module):
    """Feed-forward regressor: input_dim -> 128 -> 64 -> 32 -> 1.

    Every hidden linear layer is followed by a ReLU; the final layer emits a
    single unactivated value per input row.
    """

    def __init__(self, input_dim):
        """Build the layer stack for inputs with `input_dim` features."""
        super().__init__()
        stack = []
        fan_in = input_dim
        # Hidden layers are created in order so the Sequential indices
        # (and therefore the state_dict keys) are 0, 2, 4 for the linears.
        for width in (128, 64, 32):
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.ReLU())
            fan_in = width
        # Output head lands at index 6.
        stack.append(nn.Linear(fan_in, 1))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run `x` through the stacked layers and return the result."""
        return self.net(x)

# Teacher cell setup: choose the device, load the preprocessed table, and
# restore the DP-SGD teacher weights into a RegressionNN in eval mode.
print("\n--- STARTING TEST CASE 5: KernelExplainer on DP-Teacher ---")
FILE_SUFFIX = "DP1"
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Loading DP1 data...")
df = pd.read_csv(f'cleaned_overdose_data{FILE_SUFFIX}.csv')
# Everything except the scaled target column is a model input.
feature_frame = df.drop(columns='scaled_data_value')
X = feature_frame.values.astype(np.float32)
feature_names = feature_frame.columns.tolist()
INPUT_DIM = X.shape[1]

print("Loading DP-SGD Teacher model...")
model = RegressionNN(INPUT_DIM).to(device)
weights_path = f"artifacts/dpsgd_regression_model{FILE_SUFFIX}.pt"
teacher_state = torch.load(weights_path, map_location=device, weights_only=True)
model.load_state_dict(teacher_state)
# Switch to inference mode before the model is queried.
model.eval()

def predict_wrapper(x_numpy):
    """Prediction callback handed to SHAP: NumPy rows in, NumPy predictions out.

    Args:
        x_numpy: array of feature rows to score (presumably samples x features,
            as supplied by the explainer — confirm against the SHAP version in use).

    Returns:
        NumPy array holding the output of the module-level `model` for those
        rows, computed without gradient tracking and moved back to the CPU.
    """
    # Cast explicitly: torch.tensor() would otherwise inherit the NumPy dtype,
    # and a float64 array fed to a float32 model raises a dtype-mismatch error.
    # Creating the tensor directly on `device` also avoids a separate .to() copy.
    x_tensor = torch.tensor(x_numpy, dtype=torch.float32, device=device)
    with torch.no_grad():
        preds = model(x_tensor).cpu().numpy()
    return preds

# Teacher SHAP stage: summarise the data into a k-means background set, run
# KernelExplainer on the first 50 rows, and save mean |SHAP| per feature.
print("Creating K-Means background data for SHAP...")

# 50 k-means centroids stand in for the full dataset as the background set.
X_train_summary = shap.kmeans(X, 50)
print("Running shap.KernelExplainer...")
explainer = shap.KernelExplainer(predict_wrapper, X_train_summary)
shap_values = explainer.shap_values(X[:50]) # Explain the first 50 samples

# Normalise the explainer output: a list (one entry per model output) is
# reduced to its first entry, and a higher-rank array is reduced below.
if isinstance(shap_values, list):
    shap_values = shap_values[0]
if shap_values.ndim > 2:
    if shap_values.shape[-1] == 1:
        # Drop only the trailing single-output axis; a bare np.squeeze would
        # also collapse the sample axis if just one row had been explained.
        shap_values = np.squeeze(shap_values, axis=-1)
    else:
        shap_values = np.squeeze(shap_values)

# Global importance: average magnitude of each feature's attribution.
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)

print("\n**Average absolute SHAP values (KernelExplainer - DP-Teacher):**")
shap_df = pd.DataFrame({
    'feature': feature_names,
    'mean_abs_shap': mean_abs_shap
}).sort_values('mean_abs_shap', ascending=False)

print(shap_df.head())

# Derive the filename from FILE_SUFFIX so it tracks the dataset variant
# instead of a hard-coded tag (identical result for the current "DP1").
SAVE_PATH = f'shap_feature_importance_KERNEL_{FILE_SUFFIX}_Teacher.csv'
shap_df.to_csv(SAVE_PATH, index=False)
print(f"\nSaved KernelExplainer SHAP feature importance to '{SAVE_PATH}'.")
print("\n--- DP-Teacher Model (KernelExplainer) Complete ---")

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import shap
import os

class StudentNN(nn.Module):
    """Small student network: input_dim -> 20 (ReLU) -> 1."""

    def __init__(self, input_dim):
        """Create the single hidden layer and the scalar output head."""
        super().__init__()
        hidden_layer = nn.Linear(input_dim, 20)
        output_layer = nn.Linear(20, 1)
        # Sequential positions 0 and 2 hold the linears, matching the
        # state_dict keys net.0.* and net.2.*.
        self.net = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        """Run `x` through the hidden layer, ReLU, and output head."""
        return self.net(x)

# Student cell setup: choose the device, load the preprocessed table, and
# restore the distilled student weights into a StudentNN in eval mode.
print("\n--- STARTING TEST CASE 5: KernelExplainer on KD-Student ---")
FILE_SUFFIX = "DP1"
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Loading DP1 data...")
df = pd.read_csv(f'cleaned_overdose_data{FILE_SUFFIX}.csv')
# Everything except the scaled target column is a model input.
feature_frame = df.drop(columns='scaled_data_value')
X = feature_frame.values.astype(np.float32)
feature_names = feature_frame.columns.tolist()
INPUT_DIM = X.shape[1]

print("Loading KD Student model...")
model = StudentNN(INPUT_DIM).to(device)
weights_path = f"artifacts/kd_student_model{FILE_SUFFIX}.pt"
student_state = torch.load(weights_path, map_location=device, weights_only=True)
model.load_state_dict(student_state)
# Switch to inference mode before the model is queried.
model.eval()

def predict_wrapper(x_numpy):
    """Prediction callback handed to SHAP: NumPy rows in, NumPy predictions out.

    Args:
        x_numpy: array of feature rows to score (presumably samples x features,
            as supplied by the explainer — confirm against the SHAP version in use).

    Returns:
        NumPy array holding the output of the module-level `model` for those
        rows, computed without gradient tracking and moved back to the CPU.
    """
    # Cast explicitly: torch.tensor() would otherwise inherit the NumPy dtype,
    # and a float64 array fed to a float32 model raises a dtype-mismatch error.
    # Creating the tensor directly on `device` also avoids a separate .to() copy.
    x_tensor = torch.tensor(x_numpy, dtype=torch.float32, device=device)
    with torch.no_grad():
        preds = model(x_tensor).cpu().numpy()
    return preds

# Student SHAP stage: summarise the data into a k-means background set, run
# KernelExplainer on the first 50 rows, and save mean |SHAP| per feature.
print("Creating K-Means background data for SHAP...")
# 50 k-means centroids stand in for the full dataset as the background set.
X_summary = shap.kmeans(X, 50)

print("Running shap.KernelExplainer...")
explainer = shap.KernelExplainer(predict_wrapper, X_summary)
shap_values = explainer.shap_values(X[:50])

# Normalise the explainer output: a list (one entry per model output) is
# reduced to its first entry, and a higher-rank array is reduced below.
if isinstance(shap_values, list):
    shap_values = shap_values[0]
if shap_values.ndim > 2:
    if shap_values.shape[-1] == 1:
        # Drop only the trailing single-output axis; a bare np.squeeze would
        # also collapse the sample axis if just one row had been explained.
        shap_values = np.squeeze(shap_values, axis=-1)
    else:
        shap_values = np.squeeze(shap_values)

# Global importance: average magnitude of each feature's attribution.
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)

print("\n**Average absolute SHAP values (KernelExplainer - KD-Student):**")
shap_df = pd.DataFrame({
    'feature': feature_names,
    'mean_abs_shap': mean_abs_shap
}).sort_values('mean_abs_shap', ascending=False)

print(shap_df.head())

# Derive the filename from FILE_SUFFIX so it tracks the dataset variant
# instead of a hard-coded tag (identical result for the current "DP1").
SAVE_PATH = f'shap_feature_importance_KERNEL_{FILE_SUFFIX}_Student.csv'
shap_df.to_csv(SAVE_PATH, index=False)
print(f"\nSaved KernelExplainer SHAP feature importance to '{SAVE_PATH}'.")
print("\n--- KD-Student Model (KernelExplainer) Complete ---")