In [None]:
from tqdm.auto import tqdm
import os
import numpy as np
import pandas as pd
import time
from datetime import datetime
import pandas as pd
import logging

# Configure the root logger at WARNING level.
logging.basicConfig(level=logging.WARNING)

# Runtime switches for TensorFlow/CUDA, written to the process environment in
# one step, before the project modules are imported further down in this cell.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'CUDA_VISIBLE_DEVICES': '0',
    'TF_ENABLE_ONEDNN_OPTS': '0',
    'TF_DISABLE_MKL': '1',
})


# (Re)load the IPython autoreload extension and select mode 1.
# NOTE(review): per the IPython autoreload docs, mode 1 only reloads modules
# registered with a positive `%aimport <module>`; this cell registers none, so
# as written nothing appears to be auto-reloaded. Confirm whether
# `%autoreload 2` (reload everything except excluded modules) was intended.
%reload_ext autoreload
%autoreload 1
# Intended to exclude tensorflow and keras from auto-reloading.
# NOTE(review): the docs describe comma-separated module names for %aimport;
# verify that this space-separated form is parsed as two separate exclusions.
%aimport -tensorflow -keras

from tools import *
# Import specific classes to avoid circular imports
from methods.preprocessing.training import TrainConfig
from methods.schema import create_attribute_schema
from methods.MDAV import *
from model_evaluation import *
from tools.write import *


# Directory the kernel is running in; passed to load_dirs() in a later cell.
BASE_DIR = os.getcwd()

# Optional local data location, e.g. f"{os.path.expanduser('~')}/data/adult_synthetic.csv".
# Left unset here; whenever it is None or does not exist on disk, BASE_DIR is used instead.
LOCAL_DIR = None
if LOCAL_DIR is None or not os.path.exists(LOCAL_DIR):
    LOCAL_DIR = BASE_DIR

# Disable specific TensorFlow warnings
# (`logging` is already imported at the top of this cell; the repeated import is harmless.)
import logging
# Turn off the logger named 'tensorflow' by setting its `disabled` flag.
logging.getLogger('tensorflow').disabled = True


# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# MDAV-LD-Mixed experiments

This notebook demonstrates how to run the MDAV_LD_Mixed anonymization experiments and visualize results.

Quick guide:
1. Edit the simulation parameters below (methods, k range, dataset settings).
2. Run the main loop to execute experiments and store results.
3. Generate summaries and plots using the final cells in the notebook.

Notes:
- The `methods` dict should contain instances of `MDAV_LD_Mixed` from `methods.MDAV`.
- Use `RESTORE_OUTPUT` to continue a previous run (if present in the outputs folder).

# Results

This section contains the results summary generated by the experiment run.
After executing the main loop the notebook will produce:
- A JSON results file with detailed per-run metrics.
- Aggregated summaries via `store_general_metrics`.
- Plots created by `create_plots` and `plot_mdav_ld_mixed_phase_times_mean`.

Run the notebook from top to bottom to ensure all results and visualizations are generated successfully.

## Simulation parameters explanation

Configure the simulation parameters in the code cell below. Key items to review:

- `METHOD`: short method name; it is used to build the output directory path (`outputs/<METHOD>/`).
- `methods`: mapping of display names to algorithm instances (use `MDAV_LD_Mixed` instances).
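- `repetitions`, `n_iterations`, `k_iterations`, `k_list`: control experiment repeats and ranges. `k_list` holds the `[min, max]` endpoints of the k range; a later cell expands it into `k_iterations` evenly spaced integer values.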
- `RESTORE_OUTPUT`: if provided, the notebook will attempt to restore previous results from the corresponding outputs directory.
- `GENERATE_DATA`, `ucimlrepo_id`: set `GENERATE_DATA = True` to generate synthetic data, or set `ucimlrepo_id` to pull a dataset from the UCI ML repository (`2` is the Adult dataset).

In [None]:
################################################################################
# Simulation parameters
METHOD = 'MDAV'

# The method grid is the cross product of training batch sizes and target
# dtypes. Every variant uses inference_batch_size=2048 and l_diversity=2.
# Keys follow the pattern "MDAV_LD_Mixed_<bits>_train-<batch size>"; for each
# batch size (largest first) the float32 variant precedes the float16 one.
_TRAIN_BATCH_SIZES = (2048, 1024, 512, 256)
_TARGET_DTYPES = {32: np.float32, 16: np.float16}

methods = {
    f"MDAV_LD_Mixed_{bits}_train-{train_batch}": MDAV_LD_Mixed(
        target_dtype=dtype,
        encoder_config=TrainConfig(batch_size=train_batch, inference_batch_size=2048),
        l_diversity=2,
    )
    for train_batch in _TRAIN_BATCH_SIZES
    for bits, dtype in _TARGET_DTYPES.items()
}

# --- General parameters -------------------------------------------------------
repetitions, n_iterations, k_iterations = 3, 1, 15
k_list = [10, 100]      # [min, max] endpoints; a later cell expands them with np.linspace
RESTORE_OUTPUT = None   # None here; the experiment cell fills it in when left unset

# --- Parameters for real data -------------------------------------------------
ucimlrepo_id = 2  # Adult dataset

# --- Parameters for generating the datasets (if not ucimlrepo_id) -------------
GENERATE_DATA = False

# Dataset dimensions are only defined when synthetic data is requested.
n_features, n_instances = (10, 100000) if GENERATE_DATA else (None, None)

# --- Sensitive attributes configuration (for MDAV_MIXED set None) -------------
SA = ['education']


# Generalization technique per dataset column, keyed by column name and kept in
# column order. Every column currently uses 'permutation'; the notes mark the
# numerical columns for which 'range' was the alternative under consideration.
_TECHNIQUE_BY_COLUMN = {
    'age': 'permutation',             # numerical; 'range' would create age ranges like "25-35"
    'workclass': 'permutation',       # categorical
    'fnlwgt': 'permutation',          # numerical
    'education': 'permutation',       # categorical
    'education-num': 'permutation',   # numerical
    'marital-status': 'permutation',  # categorical
    'occupation': 'permutation',      # categorical
    'relationship': 'permutation',    # categorical
    'race': 'permutation',            # categorical
    'sex': 'permutation',             # categorical
    'capital-gain': 'permutation',    # numerical; 'range' would create ranges like "0-5000"
    'capital-loss': 'permutation',    # numerical; 'range' would create ranges like "0-5000"
    'hours-per-week': 'permutation',  # numerical; 'range' would create ranges like "35-45"
    'native-country': 'permutation',  # categorical
}

# Positional list (one entry per column, in the order above).
generalization_technique = list(_TECHNIQUE_BY_COLUMN.values())




## Experiment configuration

This cell defines named experiment presets in `EXPERIMENT_CONFIG`. Each preset maps to:

- `qi`: list of column names that should be treated as quasi-identifiers for the experiment.
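- `remove_sa_mdav`: legacy boolean flag retained for backward compatibility. When `True`, the configuration cell removes an entry named `SA-MDAV` from `methods` if one is present. The `SA-MDAV` variant is not included in this repository (it was provided separately), so with the methods defined in this notebook the flag is informational and has no effect.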

How to add a new experiment preset:
1. Add a new key to `EXPERIMENT_CONFIG` with the desired `qi` and `remove_sa_mdav` values (if needed for compatibility).
2. Set `EXPERIMENT` to the new key or set `RESTORE_OUTPUT` to an existing run folder to re-use outputs.

Notes:
- If `RESTORE_OUTPUT` is set, the notebook will attempt to restore previous results instead of creating a new run directory.
- Experiment presets only modify behavior in the notebook — they do not change saved results files.

In [None]:
# ============================================================================
# Experiment Configuration
# ============================================================================
# Select one preset by name (must be a key of EXPERIMENT_CONFIG below):
EXPERIMENT = "SA_MDAV_vs_MDAV_LD_Mixed_2_QI"
# EXPERIMENT = "SA_MDAV_vs_MDAV_LD_Mixed_7_QI"
# EXPERIMENT = "SA_MDAV_vs_MDAV_LD_Mixed_10_QI"
# ============================================================================


# Preset definitions. Each preset provides:
#   "qi"             - column names treated as quasi-identifiers
#   "remove_sa_mdav" - legacy flag; when truthy, an entry named "SA-MDAV" is
#                      removed from `methods` (if such an entry exists)
EXPERIMENT_CONFIG = {
    "SA_MDAV_vs_MDAV_LD_Mixed_2_QI": {
        "qi": ['occupation', 'native-country'],
        "remove_sa_mdav": False
    },
    "SA_MDAV_vs_MDAV_LD_Mixed_7_QI": {
        "qi": ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'],
        "remove_sa_mdav": False
    },
    "SA_MDAV_vs_MDAV_LD_Mixed_10_QI": {
        "qi": ['age', 'workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'native-country'],
        "remove_sa_mdav": True
    }
}

# Apply experiment configuration.
#
# On the first run RESTORE_OUTPUT is None and is set to the experiment name.
# The extra `RESTORE_OUTPUT == EXPERIMENT` test keeps this cell idempotent:
# re-running it (without re-running the parameters cell) selects the same
# preset again instead of silently dropping into restore mode with QI = None.
# If you switch EXPERIMENT to another preset, re-run the parameters cell first
# so that RESTORE_OUTPUT is reset to None.
QI = None
if RESTORE_OUTPUT is None or (EXPERIMENT and RESTORE_OUTPUT == EXPERIMENT):
    if not EXPERIMENT:
        # No experiment selected and no restore target
        raise ValueError("Please set RESTORE_OUTPUT for the experiment or choose a valid EXPERIMENT.")
    if EXPERIMENT not in EXPERIMENT_CONFIG:
        raise ValueError(f"Invalid EXPERIMENT: {EXPERIMENT}. Please choose a valid experiment or set RESTORE_OUTPUT.")

    config = EXPERIMENT_CONFIG[EXPERIMENT]
    RESTORE_OUTPUT = EXPERIMENT
    QI = config.get("qi")

    # Remove legacy SA-MDAV entry from methods if requested (no-op when absent)
    if config.get("remove_sa_mdav"):
        methods.pop("SA-MDAV", None)
else:
    # An explicit, different restore target was given: keep EXPERIMENT unset to
    # indicate restore mode. QI stays None here and is read back from the
    # stored parameters by the next cell when the results file exists.
    EXPERIMENT = None


## Output directories and restoring previous runs

This section prepares output directories via `load_dirs` and either restores an existing configuration or creates a new JSON configuration file with `create_json_file`.

Key outputs created/used:
- `images_dir`: location for generated images.
- `results_path`: JSON file storing experiment results and metadata.
- `tmp_path`: temporary pickle used for storing intermediate timings.
- `X_path`: dataset path (either local CSV or generated/UCI repo).

If `RESTORE_OUTPUT` is set and a matching run exists in `outputs/<METHOD>/`, the notebook will attempt to reuse that run's results and configuration. If not found and `restore_or_create=True`, a new run folder will be created.

Before running experiments:
- Verify `METHOD` and `LOCAL_DIR` are set correctly.

In [None]:
# ============================================================================
# Load directories and parameters
# ============================================================================
images_dir, results_path, tmp_path, X_path = load_dirs(
    METHOD,
    BASE_DIR,
    local_dir=LOCAL_DIR,
    restore_output=RESTORE_OUTPUT,
    restore_or_create=True,
)

# Reuse the parameters of an existing run when its results file is present;
# otherwise write a new parameters file from the settings configured above.
if RESTORE_OUTPUT and os.path.exists(results_path):
    status, params = "Restored", get_parameters(results_path)
else:
    status, params = "Created", create_json_file(
        results_path, repetitions, n_iterations, k_iterations, k_list,
        ucimlrepo_id=ucimlrepo_id, n_instances=n_instances, n_features=n_features,
        QI_names=QI, SA_names=SA, generalization_technique=generalization_technique,
        verbose=True,
    )

# Both helpers hand back the same 11 values, unpacked here in one place.
# Note that this overwrites the notebook-level settings with the stored ones.
(
    n_features, repetitions, n_iterations, k_iterations, n_list, k_list,
    ucimlrepo_id, columns_names, QI, SA, generalization_technique,
) = params

# Plain-dict description of the attribute schema. The main loop reads these
# entries to build a new schema object for every method it evaluates.
attributeSchema = dict(
    columns_names=columns_names,
    QI=QI,
    generalization_technique=generalization_technique,
    sensitive_attributes_names=SA,
    verbose=False,
)

# Check QI and SA configuration once up front (the returned schema is not kept).
# Expected usage for MDAV_LD_Mixed (also works for MDAV_Mixed, but are set to None in model_evaluation.py)
create_attribute_schema(
    columns_names=attributeSchema["columns_names"],
    QI=attributeSchema["QI"],
    generalization_technique=attributeSchema["generalization_technique"],
    sensitive_attributes_types=attributeSchema["sensitive_attributes_names"],
    verbose=True,
)

# Print a human-readable summary of the effective configuration.
# QI/SA counts are "<entries that are not None>/<total entries>".
_rule = '=' * 80

print(f"\n{_rule}")
print(f"[{status}] Configuration Summary")
print(_rule)
print(f"Features        : {n_features}")
print(f"Repetitions     : {repetitions}")
print(f"Iterations      : N={n_iterations}, K={k_iterations}")
print(f"K range         : {k_list}")
print(f"N list          : {n_list}")
print(f"QI attributes   : {sum(qi is not None for qi in QI)}/{len(QI)} columns")
print(f"SA attributes   : {sum(sa is not None for sa in SA)}/{len(SA)} columns")
print(f"UCI ML repo ID  : {ucimlrepo_id}")
print(f"{_rule}\n")
print("QI attributes    : ", QI)
print("SA attributes    : ", SA)
print(f"{_rule}\n")
# Expand the [start, stop] endpoints into evenly spaced integer grids.
# n_list is only expanded when more than one dataset size is requested;
# k_list is always expanded to k_iterations values.
if n_iterations > 1:
    n_list = np.linspace(start=n_list[0], stop=n_list[1], num=n_iterations, dtype=int)
k_list = np.linspace(start=k_list[0], stop=k_list[1], num=k_iterations, dtype=int)


## Main loop

This is the notebook's main execution loop. For each requested dataset size `n` and for each `k` in `k_list` it will:

1. Optionally skip tests if results already exist (`should_skip_test_iteration`).
2. Load/create the dataset (`X_path_file`).
3. For each method in `methods`, build an attribute schema and call `run_model_evaluation`.
4. Write per-run results to `results_path` and update progress bars.

Common failure modes:
- Missing `columns_names` or incorrect QI/SA mapping will raise schema errors. Ensure `columns_names` matches your dataset.

In [None]:

%%html
<!-- Notebook-wide CSS tweaks for ipywidget output (presumably for the tqdm progress bars used below). -->
<style>
/* Make the background behind ipywidget output transparent. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget colour and font-size variables at the VS Code editor theme variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}
</style>

In [None]:
# Main loop
# ---------
# For every dataset size `n` in `n_list` and every `k` in `k_list`, evaluate each
# configured method that has no stored result yet and write its result to the
# JSON file at `results_path`.
#
# Progress bars:
#   * pbar_X advances exactly once per dataset size `n` (its total is the number
#     of sizes actually iterated).
#   * pbar advances once per method for the current `k`.
# Both bars are context managers, so they are closed even if an evaluation raises.
print(f'N of features: {n_features}')

with tqdm(total=len(n_list), desc="Running Tests", leave=False) as pbar_X:
    for n in n_list:
        print("+" * 100)

        if should_skip_test_iteration(results_path, n, methods, k_list):
            pbar_X.update(1)
            pbar_X.refresh()
            print(f"Skipping X = {n}, all tests completed")
            continue

        # Generated datasets use a per-(n, n_features) pickle under X_path;
        # otherwise LOCAL_DIR is passed through unchanged.
        X_path_file = os.path.join(X_path, f'X_n_{n}_features_{n_features}.pkl') if GENERATE_DATA else LOCAL_DIR

        print(f"Running tests for X = {n}")

        for idx, k in enumerate(k_list):
            method_list = load_existing_results_for_n(results_path, n, k=k) if RESTORE_OUTPUT else None

            # Every method already has a stored result for this (n, k).
            # The outer bar is deliberately not advanced here: it counts
            # dataset sizes, not k values.
            if method_list and all(name in method_list for name in methods):
                continue

            # Skip k values that are too large relative to n (per the original
            # note: they would yield fewer than 2 clusters).
            if k >= n // 2:
                continue

            print("#" * 100, f"\nTesting k = {k} for X = {n}")
            init_time = datetime.now()

            with tqdm(total=len(methods), desc=f"Test {idx + 1}/{k_iterations}", leave=False) as pbar:
                for method, instance in methods.items():
                    # Result for this method already stored: count it and move on.
                    if method_list and method in method_list:
                        pbar.update(1)
                        pbar.refresh()
                        continue

                    # Build a new schema object for each method from the shared
                    # attributeSchema dict, which is only read here.
                    schema_for_method = create_attribute_schema(
                        columns_names=attributeSchema.get("columns_names"),
                        QI=attributeSchema.get("QI"),
                        generalization_technique=attributeSchema.get("generalization_technique"),
                        sensitive_attributes_types=attributeSchema.get("sensitive_attributes_names"),
                        verbose=attributeSchema.get("verbose", False)
                    )

                    # The returned instance rebinds the loop variable only;
                    # the `methods` dict itself is not updated.
                    result, clusters, instance = run_model_evaluation(
                        instance, method, X_path_file, n_features, n, k, tmp_path, repetitions,
                        ucimlrepo_id=ucimlrepo_id, attributeSchema=schema_for_method, show_progress=True,
                    )
                    write_to_json(results_path, result, n, k)

                    # Drop references to the per-run outputs before the next method runs.
                    del result, clusters
                    pbar.update(1)

            end_time = datetime.now()
            print(f"Loop time: {end_time - init_time} [ST: {init_time.strftime('%Y-%m-%d %H:%M:%S')}, ET: {end_time.strftime('%Y-%m-%d %H:%M:%S')}]")

        pbar_X.update(1)


In [None]:
# Build the aggregated summaries from the JSON results file (see the "Results"
# notes near the top of the notebook); the return value is kept in `results`.
results = store_general_metrics(results_path) 

## Plotting

In [None]:
# Create the plots from the stored results, passing the images directory and a
# METHOD-derived name (presumably the output file/folder prefix — confirm in create_plots).
create_plots(results_path, images_dir, f"{METHOD}_plots")

In [None]:
# Plot the MDAV_LD_Mixed phase-time means from the stored results.
# NOTE(review): the name is hard-coded with the "MDAV" prefix, whereas the
# previous cell derives its name from METHOD; confirm this is intentional.
plot_mdav_ld_mixed_phase_times_mean(results_path, images_dir, "MDAV_plots_phases")