Ce code est utilisé pour créer les sous-ensembles d'entraînement des shadow models, basés sur la méthode de l'entropie.

In [None]:
import pandas as pd
import numpy as np
import os
from scipy.stats import entropy
from toDopelganger import normalize, privateToDopel

# Load the input dataset. `snakemake` is not defined in this file; it is
# expected to be injected by Snakemake when this file is run as a script.
# The first declared input is read as a parquet file.
input_file = snakemake.input[0]
dataset = pd.read_parquet(input_file)

# Splitting parameters.
num_subsets = snakemake.params.num_subsets  # Number of shadow subsets to create
if num_subsets < 1:
    # With zero (or a negative number of) subsets the round-robin split below
    # yields nothing, and the script would report success without writing
    # any data. Fail early with an explicit message instead.
    raise ValueError(f"num_subsets must be at least 1, got {num_subsets!r}")
# Root folder under which one "shadow<i>" sub-folder is created per subset.
base_output_directory = "data/shadowData34"

# Function to calculate entropy for a column
def calculate_entropy(column):
    """Return the Shannon entropy, in bits, of the values in ``column``.

    The empirical distribution is the relative frequency of each distinct
    value as reported by ``value_counts(normalize=True)`` (pandas drops
    missing values there by default), and the entropy is computed with a
    base-2 logarithm.
    """
    probabilities = column.value_counts(normalize=True)
    return entropy(probabilities.to_numpy(), base=2)

# Per-column entropy (in bits), computed over the whole dataset.
entropy_scores = dataset.apply(calculate_entropy)

# Helper column used only for ordering the rows: the row-wise sum of the
# values in every column that received an entropy score.
# NOTE(review): the entropy values themselves do not enter this sum -- only
# the index of `entropy_scores` is used to select columns. Confirm that a
# plain row sum is the intended "combined entropy".
scored_columns = entropy_scores.index
dataset["combined_entropy"] = dataset[scored_columns].sum(axis=1)

# Order rows from the highest combined score to the lowest, then renumber them.
ranked = dataset.sort_values(by="combined_entropy", ascending=False)
sorted_dataset = ranked.reset_index(drop=True)

# Deal the ordered rows out round-robin: the subset starting at offset k takes
# rows k, k + num_subsets, k + 2 * num_subsets, ...
subsets = [
    sorted_dataset.iloc[start::num_subsets] for start in range(num_subsets)
]

# Export every subset into its own "shadow<i>" folder (numbered from 1).
for i, subset in enumerate(subsets, start=1):
    shadow_folder = os.path.join(base_output_directory, f"shadow{i}")
    os.makedirs(shadow_folder, exist_ok=True)

    # The ordering helper column must not end up in the exported data.
    subset = subset.drop(columns=["combined_entropy"])
    subset_array = subset.to_numpy()

    # Raw copy of the subset, stored under the "data" key of the archive.
    np.savez(os.path.join(shadow_folder, "data_train.npz"), data=subset_array)

    # Prepare the DoppelGANger input. The flags have to be derived before the
    # NaNs are overwritten, otherwise the missing-value information is lost.
    # `normalize` and `privateToDopel` come from toDopelganger; their exact
    # behavior is not visible in this file.
    gen_flags = ~np.isnan(subset_array)  # True where a value is present
    subset_array = normalize(np.nan_to_num(subset_array, nan=0.0))
    privateToDopel(subset_array, gen_flags, shadow_folder)

    # Progress report for this subset.
    print(f"Processed and saved subset {i} in {shadow_folder}")

print(f"Data subsets saved in {base_output_directory}/shadow*/")
