Ce code est utilisé pour créer les sous-ensembles d'entraînement des shadow models, basés sur le calcul des distances entre les datasets publics (publicDatasetTask1-2, publicDatasetTask3-4) et les targets correspondant à chaque task (targets1, targets2 pour les tasks 1 et 2) et (targets3, targets4 pour les tasks 3 et 4).

In [None]:
import pandas as pd
import numpy as np
import os
from scipy.spatial.distance import cdist
import sys

sys.path.append(os.path.abspath("scripts"))
from toDopelganger import normalize, privateToDopel

# --- Run configuration ---------------------------------------------------
base_output_directory = "data/shadowData44"  # root folder receiving shadow1, shadow2, ...
distance_metric = "cityblock"  # Manhattan (L1) metric name handed to cdist
num_subsets = 40  # how many shadow training subsets are produced
max_distance = 0.15  # public rows farther than this from every target are discarded

# --- Input files -----------------------------------------------------------
# NOTE(review): absolute, machine-specific paths — adjust when running elsewhere.
public_path = "/home/azerty/snake2-beta-insa-main/data/publicData/publicDatasetTask3-4.parquet"
target_path = "/home/azerty/snake2-beta-insa-main/data/dataGen/targetsTask4.csv"

public_dataset = pd.read_parquet(public_path)
target_dataset = pd.read_csv(target_path)

# --- Feature selection -----------------------------------------------------
# Three positional columns from each table, cast to float64, are the features
# compared by the distance step (columns 4-6 of the public table, 6-8 of the
# targets table). Presumably these are the same three attributes in both
# files — verify against the actual schemas.
public_data = public_dataset.iloc[:, 4:7].astype(np.float64)
target_data = target_dataset.iloc[:, 6:9].astype(np.float64)

# Step 1: Calculate distances
# Plain float64 arrays of the selected feature columns, as expected by cdist.
public_array = public_data.to_numpy()
target_array = target_data.to_numpy()

# Pairwise distance matrix: one row per public sample, one column per target.
# NOTE: the full (n_public x n_targets) matrix is materialised in memory.
distances = cdist(public_array, target_array, metric=distance_metric)

# Verify precision
print("Distance array dtype:", distances.dtype)

# Distance from each public row to its nearest target.
# np.fmin skips NaN entries whenever a non-NaN value is available, so a target
# row with a missing feature (which gives NaN distances for its whole column)
# no longer turns every row minimum into NaN — with a plain .min() that would
# make the filter below silently reject the entire public dataset.
# A public row whose distances are all NaN still yields NaN and is therefore
# still excluded by the filter; results on NaN-free data are unchanged.
min_distances = np.fmin.reduce(distances, axis=1)

# Attach the nearest-target distance as a helper column (modifies
# public_dataset in place; the column is removed again before saving).
public_dataset["min_distance"] = min_distances

# Step 2: Keep only rows within max_distance of at least one target.
# Comparisons with NaN are False, so rows without a computable distance drop out.
filtered_dataset = public_dataset[public_dataset["min_distance"] <= max_distance]

# Step 3: Sort by distance, closest first. mergesort is stable, so rows with
# equal distance keep their original relative order and the split below is
# reproducible.
sorted_dataset = filtered_dataset.sort_values(
    by="min_distance", ascending=True, kind="mergesort"
).reset_index(drop=True)

# Step 4: Round-robin split — subset i takes rows i, i + num_subsets, ... of
# the sorted table, so every subset spans the whole range of distances.
# If fewer rows than num_subsets remain, the trailing subsets are empty.
subsets = [sorted_dataset.iloc[i::num_subsets] for i in range(num_subsets)]

# Step 5: Process and save subsets
# Each non-empty subset gets its own folder <base_output_directory>/shadow<i>
# containing the raw rows (data_train.npz) plus whatever privateToDopel writes.
for i, subset in enumerate(subsets, start=1):
    # A subset is empty when fewer rows passed the distance filter than there
    # are subsets. There is nothing to train on, so skip it rather than write
    # empty files and pass a zero-row array to the helpers.
    if subset.empty:
        print(f"Skipping subset {i}: no rows available")
        continue

    # Define folder name for the subset
    shadow_folder = os.path.join(base_output_directory, f"shadow{i}")
    os.makedirs(shadow_folder, exist_ok=True)

    # Drop the helper column before further processing
    subset = subset.drop(columns=["min_distance"])

    # All remaining columns must be convertible to float64; this raises if not.
    subset_array = subset.to_numpy(dtype=np.float64)

    # Save the raw (un-normalised, NaNs preserved) subset as a .npz file
    file_path = os.path.join(shadow_folder, "data_train.npz")
    np.savez(file_path, data=subset_array)

    # Mask of observed values: True where a value is present, False where NaN.
    # It must be computed before the NaNs are overwritten on the next line.
    genFlags = ~np.isnan(subset_array)
    subset_array = np.nan_to_num(subset_array, nan=0.0)  # Replace NaNs with 0s

    # normalize and privateToDopel are project helpers imported from
    # toDopelganger; presumably they rescale the array and write it in the
    # DoppelGANger input layout inside shadow_folder — see that module for
    # the exact format.
    subset_array = normalize(subset_array)
    privateToDopel(subset_array, genFlags, shadow_folder)

    # Progress report for each subset
    print(f"Processed and saved subset {i} in {shadow_folder}")

print(f"Data subsets saved in {base_output_directory}/shadow*/")
