Nous avons essayé plusieurs types de classifieurs, présentés ci-dessous ; nous avons finalement retenu XGBoost, car c'est celui qui a donné les meilleurs résultats.

Linear_SVC

In [None]:
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from scipy.spatial.distance import cdist
import joblib
import os

# Input/output locations. ``in_file_template`` is formatted with the shadow
# model index ``i``; ``model_file`` is where the fitted classifier is dumped.
in_file_template = "data/shadowTrainResults/aux_disc-False,dataset-shadow{i},epoch-1000,epoch_checkpoint_freq-50,extra_checkpoint_freq-50,run-0,sample_len-7,self_norm-False,shadow_id-{i},/generated_samples/epoch_id-999/generated_data_train.npz"
synthetic_file = "/home/azerty/snake2-beta-insa-main/data/publicData/syntheticTask1.npz"
model_file = "saved_model/linear_svc_model.joblib"

# A shadow row whose nearest synthetic row lies within this (Euclidean)
# distance is labelled 1; every other shadow row is labelled 0.
THRESHOLD = 0.2


def _load_features(path):
    """Return the ``data_feature`` array stored in the ``.npz`` at *path*, reshaped to 7 columns.

    The archive is opened in a ``with`` block so that its underlying file
    handle is released as soon as the array has been read, instead of
    staying open for the rest of the session.
    """
    with np.load(path) as archive:
        return archive["data_feature"].reshape((-1, 7))


print("--- Debug ---")
try:
    print(f"Loading synthetic data file: {synthetic_file}")
    synthetic_data = _load_features(synthetic_file)
    print(f"synthetic_data shape: {synthetic_data.shape}")

    # Every synthetic row goes into the dataset as a positive (label 1).
    data_list = [synthetic_data]
    labels_list = [np.ones(synthetic_data.shape[0], dtype=int)]

    for i in range(1, 41):
        in_file = in_file_template.format(i=i)
        print(f"Loading in_file: {in_file}")

        shadow_data_generated = _load_features(in_file)
        print(f"shadow_data_generated shape (i={i}): {shadow_data_generated.shape}")

        # Pairwise distances: one row per shadow sample, one column per synthetic sample.
        distances = cdist(shadow_data_generated, synthetic_data, metric="euclidean")
        print(f"Computed distances shape (i={i}): {distances.shape}")

        # Distance from each shadow sample to its nearest synthetic sample.
        min_distances = distances.min(axis=1)
        print(f"Minimum distances shape (i={i}): {min_distances.shape}")

        # Boolean masks splitting the shadow samples around the threshold.
        close_indices = min_distances <= THRESHOLD
        far_indices = ~close_indices

        # Near samples are appended with label 1, far samples with label 0.
        data_list.append(shadow_data_generated[close_indices])
        labels_list.append(np.ones(close_indices.sum(), dtype=int))
        data_list.append(shadow_data_generated[far_indices])
        labels_list.append(np.zeros(far_indices.sum(), dtype=int))

    combined_data = np.vstack(data_list)
    combined_labels = np.hstack(labels_list)
    print(f"Combined data shape: {combined_data.shape}")
    print(f"Combined labels shape: {combined_labels.shape}")

    # 70/30 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.3, random_state=42)
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

except Exception as e:
    # Report the failure in the notebook output, then let it propagate.
    print("Error during debugging:", e)
    raise

# Make sure the output directory exists before dumping the model.
os.makedirs(os.path.dirname(model_file), exist_ok=True)

classifier = LinearSVC(random_state=42, max_iter=10000)
classifier.fit(X_train, y_train)

joblib.dump(classifier, model_file)

# Evaluate on the held-out split.
y_pred = classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Logistic Regression

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from scipy.spatial.distance import cdist
import joblib
import os

# Input/output locations. ``in_file_template`` is formatted with the shadow
# model index ``i``; ``model_file`` is where the fitted classifier is dumped.
in_file_template = "data/shadowTrainResults/aux_disc-False,dataset-shadow{i},epoch-1000,epoch_checkpoint_freq-50,extra_checkpoint_freq-50,run-0,sample_len-7,self_norm-False,shadow_id-{i},/generated_samples/epoch_id-999/generated_data_train.npz"
synthetic_file = "/home/azerty/snake2-beta-insa-main/data/publicData/syntheticTask1.npz"
model_file = "saved_model/logistic_regression_model.joblib"

# A shadow row whose nearest synthetic row lies within this (cityblock/L1)
# distance is labelled 1; every other shadow row is labelled 0.
THRESHOLD = 0.17


def _load_features(path):
    """Return the ``data_feature`` array stored in the ``.npz`` at *path*, reshaped to 7 columns.

    The archive is opened in a ``with`` block so that its underlying file
    handle is released as soon as the array has been read, instead of
    staying open for the rest of the session.
    """
    with np.load(path) as archive:
        return archive["data_feature"].reshape((-1, 7))


print("--- Debug ---")
try:
    print(f"Loading synthetic data file: {synthetic_file}")
    synthetic_data = _load_features(synthetic_file)
    print(f"synthetic_data shape: {synthetic_data.shape}")

    # Every synthetic row goes into the dataset as a positive (label 1).
    data_list = [synthetic_data]
    labels_list = [np.ones(synthetic_data.shape[0], dtype=int)]

    for i in range(1, 41):
        in_file = in_file_template.format(i=i)
        print(f"Loading in_file: {in_file}")

        shadow_data_generated = _load_features(in_file)
        print(f"shadow_data_generated shape (i={i}): {shadow_data_generated.shape}")

        # Pairwise distances: one row per shadow sample, one column per synthetic sample.
        distances = cdist(shadow_data_generated, synthetic_data, metric="cityblock")
        print(f"Computed distances shape (i={i}): {distances.shape}")

        # Distance from each shadow sample to its nearest synthetic sample.
        min_distances = distances.min(axis=1)
        print(f"Minimum distances shape (i={i}): {min_distances.shape}")

        # Boolean masks splitting the shadow samples around the threshold.
        close_indices = min_distances <= THRESHOLD
        far_indices = ~close_indices

        # Near samples are appended with label 1, far samples with label 0.
        data_list.append(shadow_data_generated[close_indices])
        labels_list.append(np.ones(close_indices.sum(), dtype=int))
        data_list.append(shadow_data_generated[far_indices])
        labels_list.append(np.zeros(far_indices.sum(), dtype=int))

    combined_data = np.vstack(data_list)
    combined_labels = np.hstack(labels_list)
    print(f"Combined data shape: {combined_data.shape}")
    print(f"Combined labels shape: {combined_labels.shape}")

    # 80/20 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.2, random_state=42)
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

except Exception as e:
    # Report the failure in the notebook output, then let it propagate.
    print("Error during debugging:", e)
    raise

# Make sure the output directory exists before dumping the model.
os.makedirs(os.path.dirname(model_file), exist_ok=True)

log_reg_classifier = LogisticRegression(
    solver='lbfgs',       # scikit-learn's default solver, spelled out explicitly
    max_iter=1000,        # Upper bound on optimizer iterations
    random_state=42       # Fixed seed for reproducibility
)

print("Training the Logistic Regression classifier...")
log_reg_classifier.fit(X_train, y_train)

print(f"Saving the model to {model_file}")
joblib.dump(log_reg_classifier, model_file)

# Evaluate on the held-out split.
y_pred = log_reg_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Random Forest

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from scipy.spatial.distance import cdist
import joblib
import os

# Input/output locations. ``in_file_template`` is formatted with the shadow
# model index ``i``; ``model_file`` is where the fitted classifier is dumped.
in_file_template = "data/shadowTrainResults/aux_disc-False,dataset-shadow{i},epoch-1000,epoch_checkpoint_freq-50,extra_checkpoint_freq-50,run-0,sample_len-7,self_norm-False,shadow_id-{i},/generated_samples/epoch_id-999/generated_data_train.npz"
synthetic_file = "/home/azerty/snake2-beta-insa-main/data/publicData/syntheticTask1.npz"
model_file = "saved_model/rf_classifier_model.joblib"

# A shadow row whose nearest synthetic row lies within this (cityblock/L1)
# distance is labelled 1; every other shadow row is labelled 0.
THRESHOLD = 0.18


def _load_features(path):
    """Return the ``data_feature`` array stored in the ``.npz`` at *path*, reshaped to 7 columns.

    The archive is opened in a ``with`` block so that its underlying file
    handle is released as soon as the array has been read, instead of
    staying open for the rest of the session.
    """
    with np.load(path) as archive:
        return archive["data_feature"].reshape((-1, 7))


print("--- Debug ---")
try:
    print(f"Loading synthetic data file: {synthetic_file}")
    synthetic_data = _load_features(synthetic_file)
    print(f"synthetic_data shape: {synthetic_data.shape}")

    # Every synthetic row goes into the dataset as a positive (label 1).
    data_list = [synthetic_data]
    labels_list = [np.ones(synthetic_data.shape[0], dtype=int)]

    for i in range(1, 41):
        in_file = in_file_template.format(i=i)
        print(f"Loading in_file: {in_file}")

        shadow_data_generated = _load_features(in_file)
        print(f"shadow_data_generated shape (i={i}): {shadow_data_generated.shape}")

        # Pairwise distances: one row per shadow sample, one column per synthetic sample.
        distances = cdist(shadow_data_generated, synthetic_data, metric="cityblock")
        print(f"Computed distances shape (i={i}): {distances.shape}")

        # Distance from each shadow sample to its nearest synthetic sample.
        min_distances = distances.min(axis=1)
        print(f"Minimum distances shape (i={i}): {min_distances.shape}")

        # Boolean masks splitting the shadow samples around the threshold.
        close_indices = min_distances <= THRESHOLD
        far_indices = ~close_indices

        # Near samples are appended with label 1, far samples with label 0.
        data_list.append(shadow_data_generated[close_indices])
        labels_list.append(np.ones(close_indices.sum(), dtype=int))
        data_list.append(shadow_data_generated[far_indices])
        labels_list.append(np.zeros(far_indices.sum(), dtype=int))

    combined_data = np.vstack(data_list)
    combined_labels = np.hstack(labels_list)
    print(f"Combined data shape: {combined_data.shape}")
    print(f"Combined labels shape: {combined_labels.shape}")

    # 70/30 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.3, random_state=42)
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

except Exception as e:
    # Report the failure in the notebook output, then let it propagate.
    print("Error during debugging:", e)
    raise

# Make sure the output directory exists before dumping the model.
os.makedirs(os.path.dirname(model_file), exist_ok=True)

# Forest of 100 trees with a fixed seed.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

print("Training the Random Forest classifier...")
rf_classifier.fit(X_train, y_train)

print(f"Saving the model to {model_file}")
joblib.dump(rf_classifier, model_file)

# Evaluate on the held-out split.
y_pred = rf_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


LightGBM

In [None]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from scipy.spatial.distance import cdist
import joblib
import os

# Input/output locations. ``in_file_template`` is formatted with the shadow
# model index ``i``; ``model_file`` is where the fitted classifier is dumped.
in_file_template = "data/shadowTrainResults/aux_disc-False,dataset-shadow{i},epoch-1000,epoch_checkpoint_freq-50,extra_checkpoint_freq-50,run-0,sample_len-7,self_norm-False,shadow_id-{i},/generated_samples/epoch_id-999/generated_data_train.npz"
synthetic_file = "/home/azerty/snake2-beta-insa-main/data/publicData/syntheticTask1.npz"
model_file = "saved_model/lgbm_classifier_model.joblib"

# A shadow row whose nearest synthetic row lies within this (cityblock/L1)
# distance is labelled 1; every other shadow row is labelled 0.
THRESHOLD = 0.2


def _load_features(path):
    """Return the ``data_feature`` array stored in the ``.npz`` at *path*, reshaped to 7 columns.

    The archive is opened in a ``with`` block so that its underlying file
    handle is released as soon as the array has been read, instead of
    staying open for the rest of the session.
    """
    with np.load(path) as archive:
        return archive["data_feature"].reshape((-1, 7))


print("--- Debug ---")
try:
    print(f"Loading synthetic data file: {synthetic_file}")
    synthetic_data = _load_features(synthetic_file)
    print(f"synthetic_data shape: {synthetic_data.shape}")

    # Every synthetic row goes into the dataset as a positive (label 1).
    data_list = [synthetic_data]
    labels_list = [np.ones(synthetic_data.shape[0], dtype=int)]

    for i in range(1, 41):
        in_file = in_file_template.format(i=i)
        print(f"Loading in_file: {in_file}")

        shadow_data_generated = _load_features(in_file)
        print(f"shadow_data_generated shape (i={i}): {shadow_data_generated.shape}")

        # Pairwise distances: one row per shadow sample, one column per synthetic sample.
        distances = cdist(shadow_data_generated, synthetic_data, metric="cityblock")
        print(f"Computed distances shape (i={i}): {distances.shape}")

        # Distance from each shadow sample to its nearest synthetic sample.
        min_distances = distances.min(axis=1)
        print(f"Minimum distances shape (i={i}): {min_distances.shape}")

        # Boolean masks splitting the shadow samples around the threshold.
        close_indices = min_distances <= THRESHOLD
        far_indices = ~close_indices

        # Near samples are appended with label 1, far samples with label 0.
        data_list.append(shadow_data_generated[close_indices])
        labels_list.append(np.ones(close_indices.sum(), dtype=int))
        data_list.append(shadow_data_generated[far_indices])
        labels_list.append(np.zeros(far_indices.sum(), dtype=int))

    combined_data = np.vstack(data_list)
    combined_labels = np.hstack(labels_list)
    print(f"Combined data shape: {combined_data.shape}")
    print(f"Combined labels shape: {combined_labels.shape}")

    # 70/30 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.3, random_state=42)
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

except Exception as e:
    # Report the failure in the notebook output, then let it propagate.
    print("Error during debugging:", e)
    raise

# Make sure the output directory exists before dumping the model.
os.makedirs(os.path.dirname(model_file), exist_ok=True)

lgbm_classifier = lgb.LGBMClassifier(
    n_estimators=100,         # Number of boosting iterations
    max_depth=6,              # Maximum depth of each tree
    learning_rate=0.1,        # Learning rate
    subsample=0.8,            # Fraction of training rows sampled for bagging
    subsample_freq=1,         # Re-sample every iteration; with the default (0) bagging
                              # is disabled and `subsample` would be silently ignored
    colsample_bytree=0.8,     # Fraction of features sampled per tree
    random_state=42           # Fixed seed for reproducibility
)

print("Training the LightGBM classifier...")
lgbm_classifier.fit(X_train, y_train)

print(f"Saving the model to {model_file}")
joblib.dump(lgbm_classifier, model_file)

# Evaluate on the held-out split.
y_pred = lgbm_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
