# IMPORT DES BIBLIOTHÈQUES UTILES  + CONFIG

In [8]:
import os, random, shutil
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras

from sklearn.metrics import (
    confusion_matrix, classification_report,
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, roc_auc_score
)

# --- Reproducibility: same split and, as far as possible, same results.
# The Python, NumPy and TensorFlow generators are all seeded with one value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# --- Report the TensorFlow version and the visible GPUs (if any)
print("TensorFlow:", tf.__version__)
_visible_gpus = tf.config.list_physical_devices("GPU")
print("GPU:", _visible_gpus)

# --- "Reasonable" hyper-parameters (adapt when running on CPU)
IMG_SIZE = (224, 224)   # 224x224 because VGG16/ResNet50 commonly expect this size
BATCH_SIZE = 32
EPOCHS_SCRATCH = 15
EPOCHS_TL_1 = 8         # transfer learning, phase 1 (frozen base)
EPOCHS_TL_2 = 8         # fine-tuning, phase 2 (partial unfreeze)



TensorFlow: 2.19.0
GPU: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# CHARGEMENT DONNÉES BRUTES

In [None]:
# Upload the raw dataset archive through the Colab file-upload helper.
from google.colab import files
uploaded = files.upload()  # the zip archive


# Pour dézipper mon fichier brut

In [None]:
!ls -la

!mkdir -p /content/datasets
!unzip -q malaria_hematie_dataset.zip -d /content/datasets/
!ls -la /content/datasets/

In [None]:
# Path of the zip archive (after the Colab upload)
ZIP_PATH = "/content/malaria_hematie_dataset.zip"

# Directory the archive is extracted into
UNZIP_DIR = Path("/content/malaria_hematie_dataset")

# Extract without prompting (-o = overwrite existing files)
!unzip -o "{ZIP_PATH}" -d "{UNZIP_DIR}"

# The final dataset must contain 2 sub-folders: parasitized/ and uninfected/.
# We automatically look for the root folder that holds both classes.
def find_dataset_root(base_dir: Path):
    """Return the shallowest folder that contains both class directories.

    The search starts at ``base_dir`` itself (an archive may extract the class
    folders directly at its top level) and then proceeds level by level, so
    the result does not depend on the file system's listing order.

    Args:
        base_dir: Directory to search.

    Returns:
        The ``Path`` of the first folder containing both ``parasitized/`` and
        ``uninfected/`` sub-directories, or ``None`` when no such folder
        exists (including when ``base_dir`` is not a directory).
    """
    required = ("parasitized", "uninfected")

    def has_both_classes(folder: Path) -> bool:
        return all((folder / name).is_dir() for name in required)

    base_dir = Path(base_dir)
    if not base_dir.is_dir():
        return None

    # Breadth-first walk: check every folder of one depth (in sorted order)
    # before descending, and stop as soon as a match is found.
    level = [base_dir]
    while level:
        next_level = []
        for folder in sorted(level):
            if has_both_classes(folder):
                return folder
            next_level.extend(p for p in folder.iterdir() if p.is_dir())
        level = next_level
    return None

# Locate the folder that actually holds the two class directories.
DATA_ROOT = find_dataset_root(UNZIP_DIR)
print("DATA_ROOT =", DATA_ROOT)

# Explicit raise instead of `assert`: assertions are stripped when Python runs
# with -O, which would let a missing dataset fail later with an obscure error.
if DATA_ROOT is None:
    raise FileNotFoundError(
        "Impossible de trouver le dossier contenant parasitized/ et uninfected/"
    )

print("Contenu DATA_ROOT:", [x.name for x in DATA_ROOT.iterdir()])


# SPLIT TRAIN / VAL / TEST



In [None]:
# Final, structured output folder and its three split sub-folders
SPLIT_DIR = Path("/content/splits_malaria")
TRAIN_DIR, VAL_DIR, TEST_DIR = (
    SPLIT_DIR / split_name for split_name in ("train", "val", "test")
)

CLASSES = ["parasitized", "uninfected"]

# Simple, standard ratios; both are fractions of the WHOLE dataset
# (the validation ratio is not taken from the training remainder).
VAL_RATIO = 0.15
TEST_RATIO = 0.15

def make_clean_dir(path: Path):
    """Leave ``path`` as an existing, empty directory.

    Anything already present at ``path`` is removed with ``shutil.rmtree``
    before the directory (and any missing parents) is created again.
    """
    needs_wipe = path.exists()
    if needs_wipe:
        shutil.rmtree(path)
    path.mkdir(exist_ok=True, parents=True)

# Start from a clean slate: wipe any previous split, then recreate one
# <split>/<class> folder for every split and every class.
make_clean_dir(SPLIT_DIR)
for split in (TRAIN_DIR, VAL_DIR, TEST_DIR):
    for c in CLASSES:
        class_dir = split / c
        class_dir.mkdir(parents=True, exist_ok=True)

def split_and_copy_class(
    class_name: str,
    *,
    data_root=None,
    train_dir=None,
    val_dir=None,
    test_dir=None,
    test_ratio=None,
    val_ratio=None,
    seed=None,
):
    """Split one class folder into train/val/test and copy its images.

    The image list is sorted before being shuffled with a private, seeded
    random generator. The split is therefore identical on every run and on
    every file system, regardless of how much global random state was
    consumed beforehand. Only files with an image extension are used, so
    stray files such as ``Thumbs.db`` are neither copied nor counted.

    Args:
        class_name: Name of the class sub-folder (e.g. ``"parasitized"``).
        data_root: Folder containing the class sub-folders
            (default: ``DATA_ROOT``).
        train_dir: Destination root of the train split (default: ``TRAIN_DIR``).
        val_dir: Destination root of the val split (default: ``VAL_DIR``).
        test_dir: Destination root of the test split (default: ``TEST_DIR``).
        test_ratio: Fraction of all images sent to test
            (default: ``TEST_RATIO``).
        val_ratio: Fraction of all images sent to val, also relative to the
            total (default: ``VAL_RATIO``).
        seed: Seed of the shuffle (default: ``SEED``).

    Returns:
        Tuple ``(n_train, n_val, n_test)`` with the number of copied images.

    Raises:
        ValueError: If a ratio is negative or the two ratios exceed 1.
    """
    # Defaults are resolved at call time so the notebook-level constants are
    # only required when the caller does not override them.
    data_root = DATA_ROOT if data_root is None else Path(data_root)
    train_dir = TRAIN_DIR if train_dir is None else Path(train_dir)
    val_dir = VAL_DIR if val_dir is None else Path(val_dir)
    test_dir = TEST_DIR if test_dir is None else Path(test_dir)
    test_ratio = TEST_RATIO if test_ratio is None else test_ratio
    val_ratio = VAL_RATIO if val_ratio is None else val_ratio
    seed = SEED if seed is None else seed

    if test_ratio < 0 or val_ratio < 0 or test_ratio + val_ratio > 1:
        raise ValueError("test_ratio and val_ratio must be >= 0 and sum to at most 1")

    # Extensions accepted by Keras' directory iterators; anything else would
    # be ignored at training time and only distort the counts.
    image_exts = {".png", ".jpg", ".jpeg", ".bmp", ".ppm", ".tif", ".tiff"}

    src = data_root / class_name
    # iterdir() order is file-system dependent: sort first so that the seeded
    # shuffle always starts from the same sequence.
    images = sorted(
        p for p in src.iterdir()
        if p.is_file() and p.suffix.lower() in image_exts
    )
    random.Random(seed).shuffle(images)

    n_total = len(images)
    n_test = int(n_total * test_ratio)
    n_val = int(n_total * val_ratio)
    n_train = n_total - n_test - n_val

    partitions = (
        (test_dir, images[:n_test]),
        (val_dir, images[n_test:n_test + n_val]),
        (train_dir, images[n_test + n_val:]),
    )
    for split_root, members in partitions:
        dest = split_root / class_name
        dest.mkdir(parents=True, exist_ok=True)
        for image_path in members:
            shutil.copy2(image_path, dest / image_path.name)

    return n_train, n_val, n_test

# Run the split for every class and keep the (train, val, test) counts.
counts = {c: split_and_copy_class(c) for c in CLASSES}

print("Split counts (train, val, test) par classe:")
for c, per_split in counts.items():
    print(c, ":", per_split)

# Quick sanity check: number of entries per class inside one split
def count_images(split_dir):
    """Return ``{class_name: n_entries}`` for each class folder of ``split_dir``.

    Every entry matched by ``glob("*")`` in the class folder is counted.
    """
    totals = {}
    for c in CLASSES:
        totals[c] = sum(1 for _ in (split_dir / c).glob("*"))
    return totals

# Print the per-class totals of each split, one line per split.
for _label, _split_dir in (("TRAIN:", TRAIN_DIR), ("VAL  :", VAL_DIR), ("TEST :", TEST_DIR)):
    print(_label, count_images(_split_dir))


# Préparation des données


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmentation on TRAIN only, to avoid leakage / a biased evaluation.
datagen_train_scratch = ImageDataGenerator(
    rescale=1./255,            # scale pixel values to 0-1
    rotation_range=20,
    zoom_range=0.15,
    horizontal_flip=True
)

# Val/test: NO augmentation, only the same 0-1 scaling.
datagen_eval_scratch = ImageDataGenerator(rescale=1./255)

# Arguments shared by the three directory iterators.
_flow_common = dict(target_size=IMG_SIZE, batch_size=BATCH_SIZE, class_mode="binary")

train_gen_scratch = datagen_train_scratch.flow_from_directory(
    TRAIN_DIR, shuffle=True, seed=SEED, **_flow_common
)

# shuffle=False matters for evaluation: predictions must stay aligned with
# the iterator's label order.
val_gen_scratch = datagen_eval_scratch.flow_from_directory(
    VAL_DIR, shuffle=False, **_flow_common
)
test_gen_scratch = datagen_eval_scratch.flow_from_directory(
    TEST_DIR, shuffle=False, **_flow_common
)

# --- Show a few augmented images taken from one training batch
x_batch, y_batch = next(train_gen_scratch)
plt.figure(figsize=(10, 6))
for slot in range(1, 7):
    image, label = x_batch[slot - 1], y_batch[slot - 1]
    plt.subplot(2, 3, slot)
    plt.imshow(image)
    plt.title(f"label={int(label)}")
    plt.axis("off")
plt.tight_layout()
plt.show()

# The printed mapping tells which class folder was assigned index 0 and 1.
print("Mapping classes -> index :", train_gen_scratch.class_indices)


# Modèle 1

In [None]:
# MODELE 1 : CNN FROM SCRATCH


def build_scratch_cnn(input_shape=(224,224,3)):
    """Build the baseline CNN: two conv/pool blocks and a dense binary head.

    Args:
        input_shape: Shape of one input image, ``(height, width, channels)``.

    Returns:
        An uncompiled ``keras.Sequential`` model ending in a single sigmoid
        unit (binary classification).
    """
    layers = keras.layers
    stack = [layers.Input(shape=input_shape)]

    # Conv blocks 1 and 2: 3x3 convolution followed by 2x2 max pooling.
    for n_filters in (32, 64):
        stack.append(layers.Conv2D(n_filters, (3,3), activation="relu"))
        stack.append(layers.MaxPooling2D((2,2)))

    # Flatten + classifier head; dropout guards against overfitting
    # (required by the assignment).
    stack.extend([
        layers.Flatten(),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(1, activation="sigmoid"),
    ])
    return keras.Sequential(stack)

model_scratch = build_scratch_cnn(input_shape=(*IMG_SIZE, 3))
model_scratch.compile(
    loss="binary_crossentropy",
    metrics=["accuracy"],
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
)

model_scratch.summary()

# Required callbacks: early stopping + learning-rate decay, both driven by
# the validation loss.
cb_early = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=4, restore_best_weights=True
)
cb_lr = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6
)

# Keep the best weights (lowest val_loss) seen during training.
ckpt_scratch = keras.callbacks.ModelCheckpoint(
    filepath="/content/weights_scratch_best.weights.h5",
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True,
)

history_scratch = model_scratch.fit(
    train_gen_scratch,
    validation_data=val_gen_scratch,
    epochs=EPOCHS_SCRATCH,
    callbacks=[cb_early, cb_lr, ckpt_scratch],
    verbose=1,
)

# Learning curves: one figure for the loss, one for the accuracy.
for _metric, _short, _title in (
    ("loss", "loss", "Scratch - Loss"),
    ("accuracy", "acc", "Scratch - Accuracy"),
):
    plt.figure(figsize=(10,4))
    plt.plot(history_scratch.history[_metric], label=f"train_{_short}")
    plt.plot(history_scratch.history[f"val_{_metric}"], label=f"val_{_short}")
    plt.legend()
    plt.title(_title)
    plt.show()

# Also save the final weights (in addition to the best checkpoint).
model_scratch.save_weights("/content/weights_scratch_final.weights.h5")
print(" Weights scratch saved.")


In [None]:
# VGG16/ResNet50


from tensorflow.keras.applications.vgg16 import preprocess_input as vgg_preprocess
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess


def _make_tl_generators(preprocess_fn):
    """Build the data generators and directory iterators for one backbone.

    Train uses augmentation + ``preprocess_fn``; val/test use ``preprocess_fn``
    only and are not shuffled.

    Returns:
        ``(train_datagen, eval_datagen, train_gen, val_gen, test_gen)``.
    """
    train_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_fn,
        rotation_range=20,
        zoom_range=0.15,
        horizontal_flip=True
    )
    eval_datagen = ImageDataGenerator(preprocessing_function=preprocess_fn)

    shared = dict(target_size=IMG_SIZE, batch_size=BATCH_SIZE, class_mode="binary")
    train_gen = train_datagen.flow_from_directory(
        TRAIN_DIR, shuffle=True, seed=SEED, **shared
    )
    val_gen = eval_datagen.flow_from_directory(VAL_DIR, shuffle=False, **shared)
    test_gen = eval_datagen.flow_from_directory(TEST_DIR, shuffle=False, **shared)
    return train_datagen, eval_datagen, train_gen, val_gen, test_gen


# VGG16 pipeline
(
    datagen_train_vgg,
    datagen_eval_vgg,
    train_gen_vgg,
    val_gen_vgg,
    test_gen_vgg,
) = _make_tl_generators(vgg_preprocess)

# ResNet50 pipeline: same logic, different preprocessing function
(
    datagen_train_resnet,
    datagen_eval_resnet,
    train_gen_resnet,
    val_gen_resnet,
    test_gen_resnet,
) = _make_tl_generators(resnet_preprocess)


# MODÈLE 2


In [None]:
from tensorflow.keras.applications import VGG16

base_vgg = VGG16(weights="imagenet", include_top=False, input_shape=(*IMG_SIZE, 3))
base_vgg.trainable = False  # Phase 1: pure feature extraction

model_vgg = keras.Sequential([
    base_vgg,
    keras.layers.GlobalAveragePooling2D(),  # fewer parameters than Flatten
    keras.layers.Dense(256, activation="relu"),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation="sigmoid")
])


def _compile_vgg(learning_rate):
    """(Re)compile model_vgg with Adam at the given learning rate."""
    model_vgg.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )


def _fit_vgg(n_epochs):
    """Train model_vgg for n_epochs on the VGG generators; return the History."""
    return model_vgg.fit(
        train_gen_vgg,
        validation_data=val_gen_vgg,
        epochs=n_epochs,
        callbacks=[cb_early, cb_lr, ckpt_vgg],
        verbose=1
    )


_compile_vgg(1e-3)

ckpt_vgg = keras.callbacks.ModelCheckpoint(
    filepath="/content/weights_vgg_best.weights.h5",
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True
)

# Phase 1 (frozen base)
hist_vgg_1 = _fit_vgg(EPOCHS_TL_1)

# Phase 2: fine-tuning. Unfreeze the base, then freeze everything again
# except its last four layers.
base_vgg.trainable = True
for frozen_layer in base_vgg.layers[:-4]:
    frozen_layer.trainable = False

# Much smaller learning rate while fine-tuning (very important); the model
# is recompiled after the trainable flags were changed.
_compile_vgg(1e-5)

hist_vgg_2 = _fit_vgg(EPOCHS_TL_2)

model_vgg.save_weights("/content/weights_vgg_final.weights.h5")
print(" Weights VGG saved.")


# MODÈLE 3


In [None]:
from tensorflow.keras.applications import ResNet50

# ImageNet-pretrained backbone without its classification head.
base_resnet = ResNet50(weights="imagenet", include_top=False, input_shape=(*IMG_SIZE, 3))
base_resnet.trainable = False  # Phase 1: pure feature extraction

model_resnet = keras.Sequential([
    base_resnet,
    keras.layers.GlobalAveragePooling2D(),
    keras.layers.Dense(256, activation="relu"),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation="sigmoid")
])

model_resnet.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# Keep the best weights (lowest val_loss) seen across both phases.
ckpt_resnet = keras.callbacks.ModelCheckpoint(
    filepath="/content/weights_resnet_best.weights.h5",
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True
)

# Phase 1 (frozen base)
hist_resnet_1 = model_resnet.fit(
    train_gen_resnet,
    validation_data=val_gen_resnet,
    epochs=EPOCHS_TL_1,
    callbacks=[cb_early, cb_lr, ckpt_resnet],
    verbose=1
)

# Phase 2: fine-tuning (unfreeze only the last few layers)
base_resnet.trainable = True
for layer in base_resnet.layers[:-10]:
    layer.trainable = False
# Keep BatchNormalization layers frozen even inside the unfrozen tail.
# A trainable BN layer switches to batch statistics and updates its moving
# averages, which can wreck the pretrained features during fine-tuning.
for layer in base_resnet.layers[-10:]:
    if isinstance(layer, keras.layers.BatchNormalization):
        layer.trainable = False

# Much smaller learning rate while fine-tuning; the model is recompiled so
# the new trainable flags are taken into account.
model_resnet.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

hist_resnet_2 = model_resnet.fit(
    train_gen_resnet,
    validation_data=val_gen_resnet,
    epochs=EPOCHS_TL_2,
    callbacks=[cb_early, cb_lr, ckpt_resnet],
    verbose=1
)

model_resnet.save_weights("/content/weights_resnet_final.weights.h5")
print(" Weights ResNet saved.")


# ÉVALUATION GLOBALE

In [None]:
def evaluate_model_on_test(model, test_gen, threshold=0.5, positive_class="parasitized"):
    """Evaluate a binary classifier on a non-shuffled test generator.

    Args:
        model: Model whose ``predict`` yields one probability per sample,
            interpreted as the probability of the generator's class index 1.
        test_gen: Test iterator created with ``shuffle=False``. Its
            ``classes``, ``samples`` and ``batch_size`` attributes are read,
            plus ``class_indices`` when it exists.
        threshold: Decision threshold applied to the probability of the
            positive class.
        positive_class: Name of the class reported as "positive"
            (precision, recall, sensitivity, ROC). Directory iterators index
            class folders alphabetically, which can put the clinically
            positive class at index 0. When ``test_gen.class_indices`` maps
            ``positive_class`` to 0 in a two-class problem, labels and
            probabilities are flipped so that the positive class is 1. If the
            name is unknown to the generator, index 1 stays positive.

    Returns:
        Dict with keys ``cm`` (2x2, rows = true, columns = predicted, order
        negative then positive), ``acc``, ``precision``, ``recall``, ``f1``,
        ``sensitivity``, ``specificity``, ``auc``, ``fpr``, ``tpr``,
        ``y_true``, ``y_prob`` and ``y_pred``. ``auc`` is NaN and
        ``fpr``/``tpr`` are empty when the labels contain a single class.

    Raises:
        ValueError: If the generator is shuffled, or if the number of
            predictions differs from the number of labels.
    """
    # A shuffled iterator would silently misalign y_true and y_prob.
    if getattr(test_gen, "shuffle", False):
        raise ValueError("test_gen must be created with shuffle=False")

    y_true = np.asarray(test_gen.classes).astype(int)

    # Rewind the iterator in case it was partially consumed before.
    rewind = getattr(test_gen, "reset", None)
    if callable(rewind):
        rewind()

    # Number of batches needed to cover the whole test set
    steps = int(np.ceil(test_gen.samples / test_gen.batch_size))

    y_prob = np.asarray(model.predict(test_gen, steps=steps, verbose=0)).ravel()
    if y_prob.shape[0] != y_true.shape[0]:
        raise ValueError(
            f"got {y_prob.shape[0]} predictions for {y_true.shape[0]} labels"
        )

    # Make `positive_class` the label 1 so that sensitivity/recall describe it.
    class_indices = getattr(test_gen, "class_indices", None) or {}
    if len(class_indices) == 2 and class_indices.get(positive_class) == 0:
        y_true = 1 - y_true
        y_prob = 1.0 - y_prob

    y_pred = (y_prob >= threshold).astype(int)

    # labels=[0, 1] guarantees a 2x2 matrix even if one class never appears.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Standard metrics
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)  # = sensitivity
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Exact ratios (0.0 when undefined) so sensitivity agrees with recall.
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0

    # ROC/AUC are undefined when only one class is present in y_true.
    if np.unique(y_true).size == 2:
        auc = roc_auc_score(y_true, y_prob)
        fpr, tpr, _ = roc_curve(y_true, y_prob)
    else:
        auc = float("nan")
        fpr = np.array([])
        tpr = np.array([])

    return {
        "cm": cm,
        "acc": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "sensitivity": sensitivity,
        "specificity": specificity,
        "auc": auc,
        "fpr": fpr,
        "tpr": tpr,
        "y_true": y_true,
        "y_prob": y_prob,
        "y_pred": y_pred
    }

# Evaluate the 3 models; each model MUST be paired with its own test
# generator, since the preprocessing differs between them.
res_scratch, res_vgg, res_resnet = (
    evaluate_model_on_test(_model, _test_gen)
    for _model, _test_gen in (
        (model_scratch, test_gen_scratch),
        (model_vgg, test_gen_vgg),
        (model_resnet, test_gen_resnet),
    )
)

def show_confusion(cm, title):
    """Draw a confusion matrix as an image with the count written in each cell.

    Args:
        cm: 2-D confusion matrix (rows = true labels, columns = predictions).
        title: Title of the figure.
    """
    grid = np.asarray(cm)
    plt.figure(figsize=(4,4))
    plt.imshow(cm)
    plt.title(title)
    plt.xlabel("Pred")
    plt.ylabel("True")
    n_rows, n_cols = grid.shape
    for row in range(n_rows):
        for col in range(n_cols):
            # x is the column (prediction), y is the row (true label)
            plt.text(col, row, str(grid[row, col]), ha="center", va="center")
    plt.show()

_named_results = (
    ("Scratch", res_scratch),
    ("VGG16", res_vgg),
    ("ResNet50", res_resnet),
)

# One confusion-matrix figure per model
for _name, _res in _named_results:
    show_confusion(_res["cm"], f"Confusion Matrix - {_name}")

# All ROC curves on a single, readable plot
plt.figure(figsize=(7,6))
for _name, _res in _named_results:
    plt.plot(_res["fpr"], _res["tpr"], label=f"{_name} (AUC={_res['auc']:.3f})")
plt.plot([0,1], [0,1], linestyle="--")  # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves (Test)")
plt.legend()
plt.show()


# Interprétation des résultats

## Comparaison des 3 modèles
- **Scratch** : baseline simple. Souvent moins performant car il doit apprendre des features à partir de zéro.
- **VGG16** : bénéficie de features ImageNet ; si AUC/recall augmente, c’est attendu.
- **ResNet50** : architecture plus moderne, souvent meilleure généralisation.

## Points importants (clinique / prudence)
- Même avec une bonne accuracy, il faut regarder **recall/sensibilité** (rater une cellule infectée = grave).
- La **spécificité** est aussi importante (éviter faux positifs = éviter sur-diagnostic).

## Limites
- **Data leakage** : évité ici par split explicite train/val/test (copie fichiers).
- **Overfitting** : surveillé via dropout + augmentation + early stopping.
- **Déséquilibre de classes** : vérifier les volumes par classe ; si imbalance → class_weight.
- **Qualité des images / bruit** : peut limiter la performance.
- **Seuil 0.5** : peut être ajusté selon objectif (prioriser sensibilité vs spécificité).

## Améliorations possibles
- Ajuster le **seuil** pour maximiser sensibilité.
- Essayer un modèle plus léger (MobileNetV2) pour vitesse.
- Ajouter **class_weight** si imbalance.
- Plus d’augmentation (brightness/contrast) si dataset varié.
- Cross-validation (si demandé) ou ré-essais avec seeds.
