XGBoost classifier (Task 1 et Task 2) : on change seulement les chemins des données d'entraînement générées, des données synthétiques issues des données privées, et des targets.

In [None]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from scipy.spatial.distance import cdist
import joblib
import os

# --- Configuration ---------------------------------------------------------
# Template for the per-shadow-model generated samples; "{i}" is filled with
# the shadow model id inside the loop below.
in_file_template = "/home/azerty/snake2-beta-insa-main/snake2-beta-insa-main/data/shadowTrainResults/aux_disc-False,dataset-shadow{i},epoch-1000,epoch_checkpoint_freq-50,extra_checkpoint_freq-50,run-0,sample_len-7,self_norm-False,shadow_id-{i},/generated_samples/epoch_id-999/generated_data_train.npz"
# Synthetic samples that every shadow row is compared against.
synthetic_file = "/home/azerty/snake2-beta-insa-main/snake2-beta-insa-main/data/publicData/syntheticTask1.npz"
# Destination of the fitted classifier.
model_file = "saved_model/xgb_classifier_model2.joblib"

# A shadow row whose nearest synthetic row lies within this Euclidean
# distance is labelled 1; every other shadow row is labelled 0.
THRESHOLD = 0.08


def _load_features(npz_path):
    """Return the ``data_feature`` array of *npz_path* as rows of 7 values.

    The archive is opened in a ``with`` block so the underlying file handle
    is closed as soon as the array has been read.
    """
    with np.load(npz_path) as archive:
        return archive["data_feature"].reshape((-1, 7))


print("--- Debug ---")
try:
    print(f"Loading synthetic data file: {synthetic_file}")
    synthetic_data = _load_features(synthetic_file)
    print(f"synthetic_data shape: {synthetic_data.shape}")

    # Every synthetic row is added as a label-1 example.
    data_list = [synthetic_data]
    labels_list = [np.ones(synthetic_data.shape[0], dtype=int)]

    # Shadow models 1..40 (inclusive).
    for i in range(1, 41):
        in_file = in_file_template.format(i=i)
        print(f"Loading in_file: {in_file}")

        shadow_data_generated = _load_features(in_file)
        print(f"shadow_data_generated shape (i={i}): {shadow_data_generated.shape}")

        # Full pairwise Euclidean distance matrix: one row per shadow sample,
        # one column per synthetic sample.
        distances = cdist(shadow_data_generated, synthetic_data, metric="euclidean")
        print(f"Computed distances shape (i={i}): {distances.shape}")

        # Distance from each shadow row to its nearest synthetic row.
        min_distances = distances.min(axis=1)
        print(f"Minimum distances shape (i={i}): {min_distances.shape}")

        # Boolean masks: within THRESHOLD of some synthetic row, or not.
        close_indices = min_distances <= THRESHOLD
        far_indices = ~close_indices

        # Close rows are appended with label 1, far rows with label 0; the
        # data and label lists are extended in the same order so they stay
        # aligned.
        data_list.append(shadow_data_generated[close_indices])
        labels_list.append(np.ones(close_indices.sum(), dtype=int))
        data_list.append(shadow_data_generated[far_indices])
        labels_list.append(np.zeros(far_indices.sum(), dtype=int))

    # Stack all sources into one feature matrix and one label vector.
    combined_data = np.vstack(data_list)
    combined_labels = np.hstack(labels_list)
    print(f"Combined data shape: {combined_data.shape}")
    print(f"Combined labels shape: {combined_labels.shape}")

    # 70/30 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.3, random_state=42)
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

except Exception as e:
    # Report the failure, then let it propagate so the cell still fails.
    print("Error during debugging:", e)
    raise

# Create the model directory if the path has one; os.makedirs("") raises, so
# a bare filename (no directory component) must skip this step.
model_dir = os.path.dirname(model_file)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

xgb_classifier = XGBClassifier(
    n_estimators=100,         # Number of trees
    max_depth=6,              # Maximum depth of each tree
    learning_rate=0.1,        # Learning rate for optimization
    subsample=0.8,            # Subsample ratio for training instances
    colsample_bytree=0.8,     # Subsample ratio of features for each tree
    random_state=42           # Random state for reproducibility
)

print("Training the XGBoost classifier...")
xgb_classifier.fit(X_train, y_train)

print(f"Saving the model to {model_file}")
joblib.dump(xgb_classifier, model_file)

# Evaluate on the held-out 30 %.
y_pred = xgb_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
import numpy as np
import joblib
import pandas as pd
from scipy.spatial.distance import cdist

# --- Configuration ---------------------------------------------------------
# Targets to classify, the previously saved classifier, and the file the
# predictions are written to.
# NOTE(review): csv_file names the Task4 targets while model_file names the
# "model2" artifact — confirm this pairing is intended before running.
csv_file = "/home/azerty/snake2-beta-insa-main/snake2-beta-insa-main/data/publicData/targetsTask4.csv"
model_file = "saved_model/xgb_classifier_model2.joblib"
output_file = "/home/azerty/snake2-beta-insa-main/snake2-beta-insa-main/data/predicted_labels_test.npy"


print("--- Predicting Labels ---")
try:
    # Restore the classifier saved by the training cell.
    print(f"Loading model from file: {model_file}")
    classifier = joblib.load(model_file)

    print(f"Loading CSV data from file: {csv_file}")
    df = pd.read_csv(csv_file)

    # Features are every column from the third one onward; the first two
    # columns are skipped (presumably identifiers — verify against the CSV).
    feature_columns = df.iloc[:, 2:]
    test_data = feature_columns.values
    print(f"Test data shape: {test_data.shape}")

    predicted_labels = classifier.predict(test_data)
    print(f"Predicted labels for test data: {predicted_labels}")

    # Persist the predictions as a NumPy array.
    np.save(output_file, predicted_labels)
    print(f"Predicted labels saved to {output_file}")

except Exception as exc:
    # Report the failure, then let it propagate so the cell still fails.
    print("Error during prediction:", exc)
    raise


XGBoost classifier (Task 3 et Task 4) : on change seulement les chemins des données d'entraînement générées, des données synthétiques issues des données privées, et des targets.

In [None]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from scipy.spatial.distance import cdist
import joblib
import os


# --- Configuration ---------------------------------------------------------
# Template for the per-shadow-model generated samples; "{i}" is filled with
# the shadow model id inside the loop below.
in_file_template = "/home/azerty/snake2-beta-insa-main/snake2-beta-insa-main/data/shadow34/aux_disc-False,dataset-shadow{i},epoch-1000,epoch_checkpoint_freq-50,extra_checkpoint_freq-50,run-0,sample_len-7,self_norm-False,shadow_id-{i},/generated_samples3/epoch_id-999/generated_data_train.npz"
# Synthetic samples that every shadow row is compared against.
synthetic_file = "/home/azerty/snake2-beta-insa-main/snake2-beta-insa-main/data/publicData/syntheticTask4.npz"
# Destination of the fitted classifier.
model_file = "saved_model/xgb_classifier_model4.joblib"

# A shadow row whose nearest synthetic row lies within this distance is
# labelled 1; every other shadow row is labelled 0. The distance is Euclidean
# (see the cdist call below), not Manhattan.
THRESHOLD = 0.02

# Print arrays with 17 significant digits so the example distances below can
# be inspected at full float64 precision. This changes NumPy's print options
# for the whole session.
np.set_printoptions(precision=17, suppress=False)


def _load_features_float64(npz_path):
    """Return ``data_feature`` of *npz_path* as float64 rows of 7 values.

    The archive is opened in a ``with`` block so the underlying file handle
    is closed as soon as the array has been read.
    """
    with np.load(npz_path) as archive:
        return archive["data_feature"].astype(np.float64).reshape((-1, 7))


print("--- Debug ---")
try:
    print(f"Loading synthetic data file: {synthetic_file}")
    synthetic_data = _load_features_float64(synthetic_file)
    print(f"synthetic_data shape: {synthetic_data.shape}")

    # Keep columns 1, 2 and 3 of the synthetic rows.
    # NOTE(review): the shadow rows below use columns 4:7 instead — confirm
    # that these two column ranges really hold the same features.
    synthetic_data_subset = synthetic_data[:, 1:4]
    print(f"synthetic_data_subset shape: {synthetic_data_subset.shape}")

    # Every synthetic row (3-column subset) is added as a label-1 example.
    data_list = [synthetic_data_subset]
    labels_list = [np.ones(synthetic_data_subset.shape[0], dtype=int)]

    # Shadow models 1..26 (inclusive).
    for i in range(1, 27):
        in_file = in_file_template.format(i=i)
        print(f"Loading in_file: {in_file}")

        shadow_data_generated = _load_features_float64(in_file)
        print(f"shadow_data_generated shape (i={i}): {shadow_data_generated.shape}")

        # Keep columns 4, 5 and 6 of the shadow rows.
        shadow_data_subset = shadow_data_generated[:, 4:7]
        print(f"shadow_data_subset shape (i={i}): {shadow_data_subset.shape}")

        # Full pairwise Euclidean distance matrix: one row per shadow sample,
        # one column per synthetic sample.
        distances = cdist(shadow_data_subset, synthetic_data_subset, metric="euclidean")
        print(f"Computed distances shape (i={i}): {distances.shape}")

        # Distance from each shadow row to its nearest synthetic row.
        min_distances = distances.min(axis=1)
        print(f"Minimum distances shape (i={i}): {min_distances.shape}")
        print(f"Example min_distances (i={i}): {min_distances[:10]}")

        # Boolean masks: within THRESHOLD of some synthetic row, or not.
        close_indices = min_distances <= THRESHOLD
        far_indices = ~close_indices

        # Close rows are appended with label 1, far rows with label 0; the
        # data and label lists are extended in the same order so they stay
        # aligned.
        data_list.append(shadow_data_subset[close_indices])
        labels_list.append(np.ones(close_indices.sum(), dtype=int))
        data_list.append(shadow_data_subset[far_indices])
        labels_list.append(np.zeros(far_indices.sum(), dtype=int))

    # Stack all sources into one feature matrix and one label vector.
    combined_data = np.vstack(data_list)
    combined_labels = np.hstack(labels_list)
    print(f"Combined data shape: {combined_data.shape}")
    print(f"Combined labels shape: {combined_labels.shape}")

    # 70/30 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.3, random_state=42)
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

except Exception as e:
    # Report the failure, then let it propagate so the cell still fails.
    print("Error during debugging:", e)
    raise

# Create the model directory if the path has one; os.makedirs("") raises, so
# a bare filename (no directory component) must skip this step.
model_dir = os.path.dirname(model_file)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Define the XGBoost classifier
xgb_classifier = XGBClassifier(
    n_estimators=100,         # Number of trees
    max_depth=6,              # Maximum depth of each tree
    learning_rate=0.1,        # Learning rate for optimization
    subsample=0.8,            # Subsample ratio for training instances
    colsample_bytree=0.8,     # Subsample ratio of features for each tree
    random_state=42           # Random state for reproducibility
)

# Train the XGBoost classifier
print("Training the XGBoost classifier...")
xgb_classifier.fit(X_train, y_train)

# Save the model
print(f"Saving the model to {model_file}")
joblib.dump(xgb_classifier, model_file)

# Predict on the held-out 30 %
y_pred = xgb_classifier.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))