# Deuxième méthode

### Splitting des datasets par distance de Manhattan
Les données les plus proches des targets parmi le dataset public seront découpées en subdatasets de 10000 lignes pour l'entraînement de chaque shadow model (en tout 26).

La création d'un public_dataset_index va nous permettre de retrouver les lignes membres dans le dataset d'entrainement des shadow models à la dernière étape de notre méthode.

Le résultat sera dans `data/shadowDataFIN`

In [None]:
import pandas as pd
import numpy as np
import os
from scipy.spatial.distance import cdist
import sys
sys.path.append(os.path.abspath("scripts"))
from toDopelganger import normalize, privateToDopel

# --- Inputs ------------------------------------------------------------------
# Public pool from which the shadow training sets are drawn, and the target
# rows whose neighbourhood we want to cover.
public_dataset = pd.read_parquet("../data/publicData/publicDatasetTask1-2.parquet")
target_dataset = pd.read_csv("../data/publicData/targetsTask1.csv")

# Tag every public row with its original position so that it can still be
# identified after the filtering and shuffling done below.
public_dataset['public_dataset_index'] = range(len(public_dataset))

# Feature views used for the distance computation: the bookkeeping column is
# left out on the public side, the first two columns on the target side.
is_feature_column = public_dataset.columns != 'public_dataset_index'
public_data = public_dataset.loc[:, is_feature_column]
target_data = target_dataset.iloc[:, 2:]

# --- Parameters --------------------------------------------------------------
base_output_directory = "../data/shadowDataFIN"
distance_metric = "cityblock"  # scipy's name for the Manhattan (L1) distance
max_distance = 0.4  # Rows farther than this from every target are discarded
rows_per_subset = 10000  # Upper bound on the number of rows per subset


# Step 1: distance from each public row to its closest target row.
public_array = public_data.to_numpy()
target_array = target_data.to_numpy()

# Full (n_public x n_targets) pairwise distance matrix.
distances = cdist(public_array, target_array, metric=distance_metric)

# Keep only the distance to the nearest target and store it next to the row.
min_distances = distances.min(axis=1)
public_dataset["min_distance"] = min_distances

# Step 2: keep the public rows that lie close enough to at least one target.
is_close_enough = public_dataset["min_distance"] <= max_distance
filtered_dataset = public_dataset[is_close_enough]

# Step 3: shuffle reproducibly (fixed seed) and renumber the rows from 0.
shuffled_dataset = filtered_dataset.sample(frac=1, random_state=42)
shuffled_dataset = shuffled_dataset.reset_index(drop=True)

# Step 4: cut into consecutive chunks of `rows_per_subset` rows; the last
# chunk holds whatever remains and may therefore be shorter.
chunk_starts = range(0, len(shuffled_dataset), rows_per_subset)
subsets = [shuffled_dataset.iloc[start:start + rows_per_subset] for start in chunk_starts]

# Step 5: for every chunk, write the raw training array, the DoppelGANger
# inputs and a CSV that maps each row back to the public dataset.
for shadow_id, chunk in enumerate(subsets, start=1):
    # One output folder per shadow model: <base>/shadow1, <base>/shadow2, ...
    shadow_folder = os.path.join(base_output_directory, f"shadow{shadow_id}")
    os.makedirs(shadow_folder, exist_ok=True)

    # Separate the traceability column from the features; the helper
    # "min_distance" column belongs to neither.
    public_index_column = chunk[['public_dataset_index']].to_numpy(dtype=np.int64)
    raw_features = chunk.drop(
        columns=["min_distance", "public_dataset_index"]
    ).to_numpy(dtype=np.float64)

    # Raw features (NaNs kept, not normalised, no index column).
    np.savez(os.path.join(shadow_folder, "data_train.npz"), data=raw_features)

    # Flags are True where a value is present. They must be taken from the
    # raw array, before the NaNs are replaced by zeros for normalisation.
    genFlags = ~np.isnan(raw_features)
    normalized_features = normalize(np.nan_to_num(raw_features, nan=0.0))

    # Hand the processed array and its flags to the project's DoppelGANger
    # export helper.
    privateToDopel(normalized_features, genFlags, shadow_folder)

    # CSV layout: row number inside the chunk, original public row number,
    # then the processed feature columns named "0", "1", ...
    row_numbers = np.arange(len(chunk)).reshape(-1, 1)
    csv_data = np.concatenate(
        (row_numbers, public_index_column, normalized_features), axis=1
    )
    csv_columns = [
        'index',
        'public_dataset_index',
        *(str(j) for j in range(normalized_features.shape[1])),
    ]
    csv_file_path = os.path.join(shadow_folder, "data_train.csv")

    pd.DataFrame(csv_data, columns=csv_columns).to_csv(csv_file_path, index=False)
    print(f"Processed and saved subset {shadow_id} in {shadow_folder}")


### Entraînement des shadow models et génération de leurs données synthétiques

Pour entrainer les shadow models sur leurs datasets respectifs il faut lancer le snakefile avec la commande 

`$ conda activate snakemake` et après `$ snakemake -c all -R all --use-conda` pour lancer les rules dans le snakefile 

Le résultat sera dans `data/shadowTrainResultsFIN`

### Entrainement du classifieur et classification des synthétiques privés

#### Classifier SVC Linéaire

In [None]:
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Folder holding one shadow model's generated samples ("{i}" is the shadow
# ID); the train and test archives live side by side in it.
shadow_run_template = (
    "../data/shadowTrainResultsFIN/aux_disc-False,dataset-shadow{i},epoch-1000,epoch_checkpoint_freq-100,"
    "extra_checkpoint_freq-100,run-0,sample_len-7,self_norm-False,shadow_id-{i},"
    "/generated_samples/epoch_id-999/"
)
train_file_template = shadow_run_template + "generated_data_train.npz"
test_file_template = shadow_run_template + "generated_data_test.npz"

# Private synthetic data to be classified
synthetic_task_file = "../data/publicData/syntheticTask1.npz"

# Where the fitted model is stored
model_file = "../saved_model/linear_svc_model_telescop.joblib"

train_data_list, train_labels_list = [], []
test_data_list, test_labels_list = [], []
try:
    # Shadow IDs 1..26, one per split dataset. For each ID the train archive
    # is read first, then the test archive.
    for shadow_id in range(1, 27):
        for template, features_out, labels_out in (
            (train_file_template, train_data_list, train_labels_list),
            (test_file_template, test_data_list, test_labels_list),
        ):
            samples = np.load(template.format(i=shadow_id))["data_feature"]
            # Flatten to (rows, columns) and keep the first 7 columns.
            samples = samples.reshape((samples.shape[0], samples.shape[1]))[:, :7]
            features_out.append(samples)
            # Every sample is labelled with the ID of the shadow model that
            # generated it.
            labels_out.append(np.full((samples.shape[0],), shadow_id))

    # Stack the per-shadow pieces into single arrays.
    train_data_combined = np.concatenate(train_data_list, axis=0)
    train_labels_combined = np.concatenate(train_labels_list)
    print(f"Combined train_data shape: {train_data_combined.shape}")
    print(f"Combined train_labels shape: {train_labels_combined.shape}")

    test_data_combined = np.concatenate(test_data_list, axis=0)
    test_labels_combined = np.concatenate(test_labels_list)
    print(f"Combined test_data shape: {test_data_combined.shape}")
    print(f"Combined test_labels shape: {test_labels_combined.shape}")
except Exception as err:
    # Report the failure, then let it propagate: nothing below can run
    # without the combined arrays.
    print("Error during data loading:", err)
    raise

# Fit a linear SVM that predicts which shadow model produced a sample.
classifier = LinearSVC(random_state=42, max_iter=10000)
classifier.fit(train_data_combined, train_labels_combined)

# Persist the fitted model
joblib.dump(classifier, model_file)

# Evaluate on the samples held out as "test"
y_pred = classifier.predict(test_data_combined)

test_accuracy = accuracy_score(test_labels_combined, y_pred)
print("Accuracy on test data:", test_accuracy)
test_report = classification_report(test_labels_combined, y_pred, zero_division=0)
print("Classification Report on test data:\n", test_report)

def classify_time_series(file_path, model=None):
    """Predict a shadow ID for every time series stored in an ``.npz`` file.

    The archive must contain a ``data_feature`` array. It is reshaped to two
    dimensions and cut to its first 7 columns, the same preprocessing applied
    to the shadow models' generated samples before the classifier is fitted.

    Args:
        file_path (str): Path to the ``.npz`` file containing the time series.
        model: Fitted estimator exposing ``predict``. When omitted, the
            module-level ``classifier`` in effect at call time is used, which
            is the historical behaviour of this function.

    Returns:
        np.ndarray | None: One predicted label per row, or ``None`` when
        loading or prediction failed (the error is printed, not raised).
    """
    try:
        # Resolved inside the try block so that a missing global classifier
        # is reported like any other failure instead of escaping.
        estimator = classifier if model is None else model

        print(f"Loading time series data from: {file_path}")
        # The context manager closes the archive's file handle once the
        # array has been read.
        with np.load(file_path) as archive:
            synthetic_data = archive["data_feature"]

        # Reshape to (rows, columns) and keep the first 7 columns so the
        # layout matches the data the classifier was trained on.
        synthetic_data = synthetic_data.reshape((synthetic_data.shape[0], synthetic_data.shape[1]))[:, :7]
        print(f"synthetic_data shape: {synthetic_data.shape}")

        predictions = estimator.predict(synthetic_data)
        print("Predictions for the synthetic task file:", predictions)
        return predictions
    except Exception as e:
        # Deliberate best effort: callers receive None and the notebook cell
        # keeps running.
        print("Error during time series classification:", e)
        return None

# Classify the private synthetic dataset with the current `classifier`.
# classify_time_series() already prints and swallows its own errors (it then
# returns None), so this outer try/except is only an additional safety net.
try:
    predictions = classify_time_series(synthetic_task_file)
    print("Predictions for synthetic task file:", predictions)
except Exception as e:
    print("Error during classification of synthetic task file:", e)


#### Classifier K-neighbors

In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# File paths ("{i}" is replaced by the shadow model ID)
train_file_template = (
    "../data/shadowTrainResultsFIN/aux_disc-False,dataset-shadow{i},epoch-1000,epoch_checkpoint_freq-100,"
    "extra_checkpoint_freq-100,run-0,sample_len-7,self_norm-False,shadow_id-{i},"
    "/generated_samples/epoch_id-999/generated_data_train.npz"
)
test_file_template = (
    "../data/shadowTrainResultsFIN/aux_disc-False,dataset-shadow{i},epoch-1000,epoch_checkpoint_freq-100,"
    "extra_checkpoint_freq-100,run-0,sample_len-7,self_norm-False,shadow_id-{i},"
    "/generated_samples/epoch_id-999/generated_data_test.npz"
)

# Private synthetic data to be classified
synthetic_task_file = "../data/publicData/syntheticTask1.npz"

# Where the fitted model is stored
model_file = "../saved_model/knn_model_telescop.joblib"


def _load_generated_features(path_template, shadow_id):
    """Load one shadow model's generated samples, cut to the first 7 columns."""
    samples = np.load(path_template.format(i=shadow_id))["data_feature"]
    samples = samples.reshape((samples.shape[0], samples.shape[1]))
    return samples[:, :7]


try:
    train_data_list = []
    train_labels_list = []
    test_data_list = []
    test_labels_list = []

    for shadow_id in range(1, 27):  # Shadow IDs 1 to 26 inclusive
        # Training samples, labelled with the ID of the generating shadow model
        train_features = _load_generated_features(train_file_template, shadow_id)
        train_data_list.append(train_features)
        train_labels_list.append(np.full((train_features.shape[0],), shadow_id))

        # Test samples, labelled the same way
        test_features = _load_generated_features(test_file_template, shadow_id)
        test_data_list.append(test_features)
        test_labels_list.append(np.full((test_features.shape[0],), shadow_id))

    # Merge the per-shadow pieces into single arrays
    train_data_combined = np.vstack(train_data_list)
    train_labels_combined = np.hstack(train_labels_list)
    print(f"Combined train_data shape: {train_data_combined.shape}")
    print(f"Combined train_labels shape: {train_labels_combined.shape}")

    test_data_combined = np.vstack(test_data_list)
    test_labels_combined = np.hstack(test_labels_list)
    print(f"Combined test_data shape: {test_data_combined.shape}")
    print(f"Combined test_labels shape: {test_labels_combined.shape}")
except Exception as err:
    # Report, then propagate: the rest of the cell needs the combined arrays.
    print("Error during data loading:", err)
    raise

# k-nearest-neighbours classifier over the generated samples
n_neighbors = 5
classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
classifier.fit(train_data_combined, train_labels_combined)

# Persist the fitted model
joblib.dump(classifier, model_file)

# Evaluate on the test samples
y_pred = classifier.predict(test_data_combined)

test_accuracy = accuracy_score(test_labels_combined, y_pred)
print("Accuracy on test data:", test_accuracy)
test_report = classification_report(test_labels_combined, y_pred, zero_division=0)
print("Classification Report on test data:\n", test_report)

# Classify the private synthetic dataset; classify_time_series() reads the
# `classifier` bound just above.
try:
    predictions = classify_time_series(synthetic_task_file)
    print("Predictions for synthetic task file:", predictions)
except Exception as err:
    print("Error during classification of synthetic task file:", err)


In [None]:
# Second k-NN variant: same combined train/test arrays as the previous cell,
# with a larger neighbourhood. This output file is the model that the final
# membership-inference cell loads.
model_file = "../saved_model/knn_model_telescop_h-8.joblib"

n_neighbors = 40
classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
classifier.fit(train_data_combined, train_labels_combined)

# Persist the fitted model
joblib.dump(classifier, model_file)

# Evaluate on the test samples
y_pred = classifier.predict(test_data_combined)

test_accuracy = accuracy_score(test_labels_combined, y_pred)
print("Accuracy on test data:", test_accuracy)
test_report = classification_report(test_labels_combined, y_pred, zero_division=0)
print("Classification Report on test data:\n", test_report)

# Classify the private synthetic dataset with the classifier bound above.
try:
    predictions = classify_time_series(synthetic_task_file)
    print("Predictions for synthetic task file:", predictions)
except Exception as err:
    print("Error during classification of synthetic task file:", err)

## xgboost

In [None]:
import xgboost as xgb
model_file = "../saved_model/xgboost_model.joblib"

# XGBoost expects class labels 0..n_classes-1, whereas shadow IDs start at 1.
# The shift is written to new arrays: an in-place `-=` would alter the shared
# label arrays, so re-running this cell (or an earlier classifier cell
# afterwards) would silently train on wrong labels.
xgb_train_labels = train_labels_combined - 1
xgb_test_labels = test_labels_combined - 1

# Initialize the XGBoost classifier
classifier = xgb.XGBClassifier(
    objective="multi:softmax",  # Multi-class classification
    # Derived from the data rather than hard-coded, so that it always matches
    # the number of shadow models actually loaded.
    num_class=int(np.unique(xgb_train_labels).size),
    max_depth=6,               # Maximum depth of trees
    learning_rate=0.1,         # Learning rate
    n_estimators=100,          # Number of trees
    use_label_encoder=False    # Suppress label encoding warning
)

# Train the XGBoost classifier
classifier.fit(train_data_combined, xgb_train_labels)

# Save the model
joblib.dump(classifier, model_file)

# Predict on the test set and evaluate, like the other classifier cells.
# Both sides of the comparison use the zero-based labels.
y_pred = classifier.predict(test_data_combined)
print("Accuracy on test data:", accuracy_score(xgb_test_labels, y_pred))
print("Classification Report on test data:\n", classification_report(xgb_test_labels, y_pred, zero_division=0))

# Classify the private synthetic dataset with the classifier bound above.
# NOTE: the predictions are zero-based here; class k stands for shadow ID k + 1.
try:
    predictions = classify_time_series(synthetic_task_file)
    print("Predictions for synthetic task file:", predictions)
except Exception as e:
    print("Error during classification of synthetic task file:", e)

### Classification des données synthétiques

Classification des données synthétiques des membres en prenant les données synthétiques privées les plus proches des données targets.

On utilise le classifieur k-neighbors pour sa précision plus élevée que celle des autres.

In [None]:
import numpy as np
import pandas as pd
from joblib import load 
from collections import Counter
from scipy.spatial.distance import cdist

# Paths to input files
synthetic_task_file = "../data/publicData/syntheticTask1.npz"
model_file = "../saved_model/knn_model_telescop_h-8.joblib"
targets_task_file = "../data/publicData/targetsTask1.csv"
shadow_data_dir_template = "../data/shadowDataFIN/shadow{i}/data_train.csv"
output_vector_file = "../data/output_binary_vector.txt"

# Only synthetic points at most this far (Manhattan distance) from some
# target are classified.
THRESHOLD = 0.1

# Load the classifier saved by the k-NN (n_neighbors=40) cell
classifier = load(model_file)

# Load the private synthetic data, reshaped to (rows, columns) and cut to the
# first 7 columns, i.e. the layout the classifier was trained on.
synthetic_task_data = np.load(synthetic_task_file)["data_feature"]
synthetic_task_data = synthetic_task_data.reshape((synthetic_task_data.shape[0], synthetic_task_data.shape[1]))[:, :7]

# Load the targets once; the same frame later provides the 'index' column.
targets_df = pd.read_csv(targets_task_file)
targets_data = targets_df.iloc[:, 2:]  # Skip the two metadata columns

# Distance from every synthetic point to every target
distances = cdist(synthetic_task_data, targets_data, metric="cityblock")
print(f"Computed distances shape: {distances.shape}")

# Keep the synthetic points whose nearest target is within the threshold
min_distances = distances.min(axis=1)
close_indices = np.where(min_distances <= THRESHOLD)[0]
print(f"Number of synthetic points close to targets data: {len(close_indices)}")

filtered_synthetic_data = synthetic_task_data[close_indices]

# Step 1: predict, for each retained synthetic point, the shadow dataset it
# resembles. Any failure is reported and the remaining steps are skipped.
try:
    predictions = classifier.predict(filtered_synthetic_data)
except Exception as e:
    print("Error during classification of filtered synthetic task data:", e)
    predictions = None

if predictions is not None:
    # Step 2: rank the predicted labels by their share of the predictions
    label_counts = Counter(predictions)
    total_count = sum(label_counts.values())
    label_percentages = {label: (count / total_count) * 100 for label, count in label_counts.items()}
    sorted_labels_by_percentage = sorted(label_percentages, key=label_percentages.get, reverse=True)
    print("Top labels by percentage:", sorted_labels_by_percentage)

    # Step 3: target indices, in the order the binary vector must follow
    targets_indices = targets_df['index']

    # Step 4: create the binary vector (one entry per target, 0 by default)
    binary_vector = np.zeros(len(targets_indices), dtype=int)

    # Set for the membership test, and the position of the first occurrence
    # of each target index, so that marking a match is a dictionary lookup
    # instead of a scan over all targets.
    target_set = set(targets_indices)
    first_position = {}
    for position, target_index in enumerate(targets_indices):
        first_position.setdefault(target_index, position)

    # Visit every shadow dataset that was predicted at least once. A target
    # is marked as member when its index appears in the training set of one
    # of these shadow datasets.
    for label in sorted_labels_by_percentage:
        try:
            shadow_data_path = shadow_data_dir_template.format(i=label)
            shadow_df = pd.read_csv(shadow_data_path)

            # Public-dataset row numbers used to train this shadow model
            shadow_indices = shadow_df['public_dataset_index']
            matching_indices = shadow_indices[shadow_indices.isin(target_set)]

            # Mark matches in the binary vector
            for match in matching_indices:
                binary_vector[first_position[match]] = 1
        except Exception as e:
            # Best effort: a missing or unreadable shadow CSV must not stop
            # the remaining shadow datasets from being processed.
            print(f"Error processing shadow dataset {label}: {e}")

    # Step 5: save the binary vector, one value per line
    with open(output_vector_file, "w") as f:
        f.write("\n".join(map(str, binary_vector)))

    print(f"Binary vector saved to {output_vector_file}")


## Score de la labelisation des targets

On score la précision de notre attaque en utilisant la formule utilisée dans la compétition (obtenue dans le fichier `CodabenchBundle/scoringProgram/score.py`).

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix
import os
# Inputs: our binary guesses and the ground-truth membership vector
guessFile = "../data/output_binary_vector.txt"
truthFile = "../data/privateData/truthTask1.txt"

truth = np.loadtxt(truthFile)
assert os.path.isfile(guessFile)
guesses = np.loadtxt(guessFile)

# Sanity checks: one guess per truth entry, and guesses strictly binary
assert truth.shape == guesses.shape
assert ((guesses == 0) | (guesses == 1)).all()

# Binary confusion matrix, flattened in the order tn, fp, fn, tp
cm = confusion_matrix(truth, guesses)
tn, fp, fn, tp = cm.ravel()

# True-positive and false-positive rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

# Membership advantage (tpr - fpr), rescaled from [-1, 1] to [0, 1]
ma = (tpr - fpr + 1) / 2

print(f"computed score {ma}")
res = {"ma": ma}
