In [None]:
# ============================================
#  Notebook - Naruto+ (KOI) - Expérience 2 seule
# ============================================

!pip install scikit-learn matplotlib seaborn pandas -q

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pathlib import Path

# --- Mount Google Drive (Colab) so the dataset path below can be read
from google.colab import drive
drive.mount('/content/drive')

# --- Load the dataset arrays
DATASET_DIR = Path("/content/drive/MyDrive/data_KOI/astronet_dataset")

# Loaded in the same order as the target names: global views, local views, labels.
X_global, X_local, y = (
    np.load(DATASET_DIR / array_file)
    for array_file in ("X_global.npy", "X_local.npy", "y.npy")
)

print("Avant filtrage:", np.unique(y, return_counts=True))

# --- Keep only Confirmed (label 2) and FP (label 0) rows, then remap the
#     labels to binary: 2 -> 1 and 0 -> 0. Rows with any other label are dropped.
mask = np.isin(y, [0, 2])
X_global = X_global[mask]
X_local = X_local[mask]
y = np.where(y[mask] == 2, 1, 0)

class_values, class_counts = np.unique(y, return_counts=True)
print("Après filtrage:", dict(zip(class_values, class_counts)))
print("Global:", X_global.shape, " Local:", X_local.shape, " Labels:", y.shape)

# --- Stratified split: 80% train, then the remaining 20% is halved into
#     validation and test. Both splits are stratified on the labels.
Xg_train, Xg_tmp, Xl_train, Xl_tmp, y_train, y_tmp = train_test_split(
    X_global, X_local, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
Xg_val, Xg_test, Xl_val, Xl_test, y_val, y_test = train_test_split(
    Xg_tmp, Xl_tmp, y_tmp,
    test_size=0.5,
    random_state=42,
    stratify=y_tmp,
)

print("Train:", Xg_train.shape, " Val:", Xg_val.shape, " Test:", Xg_test.shape)


# --- Balanced batch generator
def balanced_generator(Xg, Xl, y, batch_size=128, seed=None):
    """Yield class-balanced training batches forever.

    Each batch is built by sampling (with replacement) indices of the
    positive class (``y == 1``) and of the negative class (``y == 0``),
    shuffling them together, and slicing all three arrays with the result.

    Args:
        Xg: Array of global views, indexed along axis 0 like ``y``.
        Xl: Array of local views, indexed along axis 0 like ``y``.
        y: Binary label array containing 0s and 1s.
        batch_size: Total number of samples per batch. When it is odd, the
            extra sample is drawn from the negative class so the batch still
            has exactly ``batch_size`` rows.
        seed: Optional seed for the sampling RNG. ``None`` (the default)
            draws fresh entropy, so batches differ from run to run.

    Yields:
        ``({"input_global": Xg[idx], "input_local": Xl[idx]}, y[idx])``

    Raises:
        ValueError: On the first ``next()`` call, if ``batch_size`` is less
            than 1 or if either class has no samples.
    """
    y = np.asarray(y)
    pos_idx = np.where(y == 1)[0]
    neg_idx = np.where(y == 0)[0]

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if pos_idx.size == 0 or neg_idx.size == 0:
        raise ValueError(
            "balanced_generator needs at least one sample of each class "
            f"(found {pos_idx.size} positives, {neg_idx.size} negatives)"
        )

    # Split the batch so the two halves always add up to batch_size;
    # `batch_size // 2` for both halves would drop a sample when it is odd.
    n_pos = batch_size // 2
    n_neg = batch_size - n_pos

    rng = np.random.default_rng(seed)
    while True:
        idx = np.concatenate([
            rng.choice(pos_idx, n_pos, replace=True),
            rng.choice(neg_idx, n_neg, replace=True),
        ])
        # Shuffle so positives and negatives are interleaved within the batch.
        rng.shuffle(idx)
        yield {"input_global": Xg[idx], "input_local": Xl[idx]}, y[idx]


# --- Configurable Naruto+ model
def build_naruto(input_global_shape, input_local_shape,
                 conv_filters=(64, 128, 256, 512),
                 dense_units=1024,
                 dropout_rate=0.5,
                 n_classes=2):
    """Build the two-branch 1-D CNN classifier.

    The global branch stacks one Conv1D -> BatchNormalization -> MaxPooling1D(3)
    stage per entry of ``conv_filters`` and ends with global max pooling.
    The local branch is a fixed two-convolution stack (32 then 64 filters)
    that also ends with global max pooling. Both feature vectors are
    concatenated and fed to a dense layer, dropout, and a softmax output.

    Args:
        input_global_shape: Shape of one global-view sample (without the
            batch axis), passed to ``tf.keras.Input``.
        input_local_shape: Shape of one local-view sample (without the
            batch axis), passed to ``tf.keras.Input``.
        conv_filters: Iterable with the number of filters for each
            convolution stage of the global branch. Any iterable works;
            the default is a tuple to avoid a shared mutable default.
        dense_units: Width of the dense layer after the concatenation.
        dropout_rate: Dropout rate applied after the dense layer.
        n_classes: Number of softmax output units.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are named
        ``"input_global"`` and ``"input_local"``.
    """
    layers = tf.keras.layers

    # Global branch
    inp_global = tf.keras.Input(shape=input_global_shape, name="input_global")
    xg = inp_global
    for n_filters in conv_filters:
        xg = layers.Conv1D(n_filters, 5, activation="relu", padding="same")(xg)
        xg = layers.BatchNormalization()(xg)
        xg = layers.MaxPooling1D(3)(xg)
    xg = layers.GlobalMaxPooling1D()(xg)

    # Local branch
    inp_local = tf.keras.Input(shape=input_local_shape, name="input_local")
    xl = layers.Conv1D(32, 5, activation="relu", padding="same")(inp_local)
    xl = layers.BatchNormalization()(xl)
    xl = layers.MaxPooling1D(2)(xl)
    xl = layers.Conv1D(64, 5, activation="relu", padding="same")(xl)
    xl = layers.BatchNormalization()(xl)
    xl = layers.GlobalMaxPooling1D()(xl)

    # Fusion of both branches and classification head
    merged = layers.concatenate([xg, xl])
    dense = layers.Dense(dense_units, activation="relu")(merged)
    dense = layers.Dropout(dropout_rate)(dense)
    out = layers.Dense(n_classes, activation="softmax")(dense)

    return tf.keras.Model(inputs=[inp_global, inp_local], outputs=out)


# --- Experiment 2 hyperparameters
# NOTE(review): in the visible code, train_and_eval reads batch_size, lr,
# dense_units, dropout_rate and conv_filters. The "loss" and "scheduler"
# entries are not read anywhere in this notebook as shown.
params = dict(
    batch_size=128,
    lr=5e-5,
    dense_units=1024,
    dropout_rate=0.5,
    conv_filters=[64, 128, 256, 512],
    loss="sce",
    scheduler="cosine",
)

# --- Training
def train_and_eval(params):
    """Train the Naruto+ model, reload the best checkpoint, and evaluate it.

    Reads the module-level arrays ``Xg_train``/``Xl_train``/``y_train``,
    ``Xg_val``/``Xl_val``/``y_val`` and ``Xg_test``/``Xl_test``/``y_test``.
    Training batches come from ``balanced_generator``; validation uses the
    validation arrays as-is.

    Args:
        params: Dict of hyperparameters. Required keys: ``"batch_size"``,
            ``"lr"``, ``"conv_filters"``, ``"dense_units"``,
            ``"dropout_rate"``. Optional keys (defaults in parentheses):
            ``"epochs"`` (50), ``"patience"`` (10) and ``"checkpoint_path"``
            ("/content/drive/MyDrive/models/naruto_KOI_best.keras").
            Any ``"loss"`` / ``"scheduler"`` entries are not consulted: the
            loss is always sparse categorical cross-entropy and the learning
            rate always follows ``CosineDecayRestarts``.

    Returns:
        ``(best_model, history)``: the model reloaded from the checkpoint
        file and the ``History`` object returned by ``model.fit``.
    """
    tf.keras.backend.clear_session()

    batch_size = params["batch_size"]
    epochs = params.get("epochs", 50)
    patience = params.get("patience", 10)
    checkpoint_path = params.get(
        "checkpoint_path", "/content/drive/MyDrive/models/naruto_KOI_best.keras"
    )

    model = build_naruto(
        Xg_train.shape[1:], Xl_train.shape[1:],
        conv_filters=params["conv_filters"],
        dense_units=params["dense_units"],
        dropout_rate=params["dropout_rate"]
    )

    # Computed once and shared by the LR schedule and by fit(); the max()
    # guard keeps it at >= 1 when the train set is smaller than one batch.
    steps_per_epoch = max(1, len(y_train) // batch_size)

    # Scheduler / LR: the first cosine cycle lasts 5 epochs, each following
    # cycle is twice as long (t_mul) and restarts at 0.9x the previous
    # peak learning rate (m_mul).
    lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(
        initial_learning_rate=params["lr"],
        first_decay_steps=steps_per_epoch * 5,
        t_mul=2.0,
        m_mul=0.9,
        alpha=1e-6
    )
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    # The model ends in a softmax, so the loss receives probabilities.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

    # Callbacks
    early_stop = tf.keras.callbacks.EarlyStopping(
        patience=patience, restore_best_weights=True
    )
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True)

    history = model.fit(
        balanced_generator(Xg_train, Xl_train, y_train, batch_size=batch_size),
        steps_per_epoch=steps_per_epoch,
        validation_data=({"input_global": Xg_val, "input_local": Xl_val}, y_val),
        epochs=epochs,
        callbacks=[early_stop, checkpoint],
        verbose=1
    )

    # Reload the best model written by the checkpoint callback
    best_model = tf.keras.models.load_model(checkpoint_path)

    # --- Evaluation: run the test set through the model once and derive
    # both the hard predictions and the class-1 probability from it.
    test_proba = best_model.predict({"input_global": Xg_test, "input_local": Xl_test})
    y_pred = np.argmax(test_proba, axis=1)
    y_proba = test_proba[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    print("\nROC-AUC:", auc)
    print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))

    return best_model, history

# --- Run Experiment 2: train and evaluate with the settings in `params`,
#     keeping the model and training history that train_and_eval returns.
best_model, history = train_and_eval(params)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Avant filtrage: (array([0, 1, 2]), array([3016,  971, 2299]))
Après filtrage: {np.int64(0): np.int64(3016), np.int64(1): np.int64(2299)}
Global: (5315, 2001, 1)  Local: (5315, 201, 1)  Labels: (5315,)
Train: (4252, 2001, 1)  Val: (531, 2001, 1)  Test: (532, 2001, 1)
Epoch 1/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 74ms/step - accuracy: 0.5332 - loss: 2.9324 - val_accuracy: 0.5669 - val_loss: 0.6879
Epoch 2/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5683 - loss: 1.5928 - val_accuracy: 0.5669 - val_loss: 0.6886
Epoch 3/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5747 - loss: 1.2248 - val_accuracy: 0.5669 - val_loss: 0.6899
Epoch 4/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.6095 - loss: 1.0415 -

In [None]:
# --- Inference with the saved model ---

from tensorflow.keras.models import load_model

# Load the trained model from the checkpoint file
model_path = "/content/drive/MyDrive/models/naruto_KOI_best.keras"
inference_model = load_model(model_path)
print(f"✅ Modèle chargé depuis: {model_path}")

# Predict on the test set once; the hard labels and the class-1
# probability are both derived from the same probability matrix.
test_proba = inference_model.predict({"input_global": Xg_test, "input_local": Xl_test})
y_pred = np.argmax(test_proba, axis=1)
y_proba = test_proba[:, 1]

# Evaluation
auc = roc_auc_score(y_test, y_proba)
print("\nROC-AUC (inférence):", auc)
print("\nClassification Report (inférence):\n", classification_report(y_test, y_pred, digits=4))

# Example on a single sample (slicing keeps the leading batch axis)
sample_idx = 42
assert 0 <= sample_idx < len(y_test), (
    f"sample_idx={sample_idx} is outside the test set (size {len(y_test)})"
)
xg_sample = Xg_test[sample_idx:sample_idx+1]
xl_sample = Xl_test[sample_idx:sample_idx+1]
proba = inference_model.predict({"input_global": xg_sample, "input_local": xl_sample})[0]
print(f"\nÉchantillon {sample_idx} → Proba FP={proba[0]:.3f}, Confirmed={proba[1]:.3f}, Prédiction={np.argmax(proba)}")


✅ Modèle chargé depuis: /content/drive/MyDrive/models/naruto_KOI_best.keras
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 48ms/step
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

ROC-AUC (inférence): 0.7475525482291966

Classification Report (inférence):
               precision    recall  f1-score   support

           0     0.8723    0.4073    0.5553       302
           1     0.5422    0.9217    0.6828       230

    accuracy                         0.6297       532
   macro avg     0.7073    0.6645    0.6190       532
weighted avg     0.7296    0.6297    0.6104       532

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498ms/step

Échantillon 42 → Proba FP=0.408, Confirmed=0.592, Prédiction=1
