# Développement d’un Pipeline pour la Classification d’Images Microscopiques de Coloration Gram

Pipeline combinant traitement d’images (OpenCV) et modèles d’apprentissage automatique (SVM, Random Forest, Arbre de Décision, CNN) pour classifier les bactéries Gram+ et Gram-.
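Dataset : DIBaS (660 images, 33 espèces). Ce notebook ne télécharge que les 32 espèces bactériennes listées plus bas ; Candida albicans est ignorée.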
Technologies: Python, OpenCV, TensorFlow/Keras, scikit-learn.

Ce notebook est complet et autonome. Exécutez les cellules une par une dans Google Colab (recommandé pour le GPU et Drive).

In [None]:
# Install the image-processing and ML packages imported by the cells below.
# NOTE(review): pandas (used by the results cell) is not listed here;
# presumably it is expected to be preinstalled in the runtime -- confirm.
!pip install opencv-python-headless tensorflow numpy matplotlib scikit-learn scikit-image

In [None]:
# Mount Google Drive; the last cell saves the trained models under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Create one target folder per class; the download cell unzips each species
# archive into the folder matching its Gram type.
!mkdir -p /content/dibas_data/gram_positive /content/dibas_data/gram_negative

In [None]:
import os

# DIBaS species handled by this notebook, paired with their Gram class.
# Candida albicans is deliberately left out.
# Each table row is "<archive base name> <sign>", where the sign is "+" for
# Gram-positive and "-" for Gram-negative. Row order is the download order.
_GRAM_TYPE_BY_SIGN = {"+": "gram_positive", "-": "gram_negative"}

_SPECIES_TABLE = """
Acinetobacter.baumanii -
Actinomyces.israelii +
Bacteroides.fragilis -
Bifidobacterium.spp +
Clostridium.perfringens +
Enterococcus.faecium +
Enterococcus.faecalis +
Escherichia.coli -
Fusobacterium -
Lactobacillus.casei +
Lactobacillus.crispatus +
Lactobacillus.delbrueckii +
Lactobacillus.gasseri +
Lactobacillus.jensenii +
Lactobacillus.johnsonii +
Lactobacillus.paracasei +
Lactobacillus.plantarum +
Lactobacillus.reuteri +
Lactobacillus.rhamnosus +
Lactobacillus.salivarius +
Listeria.monocytogenes +
Micrococcus +
Neisseria.gonorrhoeae -
Porphyromonas.gingivalis -
Propionibacterium.acnes +
Proteus -
Pseudomonas.aeruginosa -
Staphylococcus.aureus +
Staphylococcus.epidermidis +
Staphylococcus.saprophyticus +
Streptococcus.agalactiae +
Veillonella -
"""

species_list = [
    {"species_url": archive_name, "gram_type": _GRAM_TYPE_BY_SIGN[sign]}
    for archive_name, sign in (
        row.split() for row in _SPECIES_TABLE.splitlines() if row.strip()
    )
]

base_url = "https://doctoral.matinf.uj.edu.pl/database/dibas/"
for sp in species_list:
    zip_path = f"/tmp/{sp['species_url']}.zip"
    !wget --no-check-certificate {base_url}{sp['species_url']}.zip -O {zip_path}
    !unzip -q {zip_path} -d /content/dibas_data/{sp['gram_type']}/
    !rm {zip_path}

!find /content/dibas_data -name "*.tif" | wc -l

## Visualisation du Dataset

Affichage d'exemples d'images et distribution des classes.

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import random

# Collect the image paths of each class. The patterns are not recursive: only
# .tif files located directly inside the class folder are matched.
negative_paths = glob('/content/dibas_data/gram_negative/*.tif')
positive_paths = glob('/content/dibas_data/gram_positive/*.tif')

# Report how many images were found per class.
for caption, class_paths in (
    ("Nombre Gram+", positive_paths),
    ("Nombre Gram-", negative_paths),
):
    print(f"{caption} : {len(class_paths)}")

def show_images(paths, title):
    """Display three randomly chosen images from *paths* side by side.

    Args:
        paths: List of image file paths to pick from.
        title: Caption prefix; each panel is titled "<title> - Ex <k>".

    Returns:
        None. The figure is shown with ``plt.show()``.

    If *paths* is empty a message is printed and nothing is drawn. A file
    that OpenCV cannot decode leaves its panel blank with an explanatory
    title instead of raising.
    """
    if not paths:
        # random.choice() would raise IndexError on an empty list.
        print(f"{title} : aucune image trouvée.")
        return

    n_panels = 3
    if len(paths) >= n_panels:
        # Sample without replacement so the same image is not shown twice.
        chosen = random.sample(paths, n_panels)
    else:
        chosen = [random.choice(paths) for _ in range(n_panels)]

    fig, axs = plt.subplots(1, n_panels, figsize=(15, 5))
    for i, (ax, img_path) in enumerate(zip(axs, chosen)):
        ax.axis('off')
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals an unreadable file by returning None.
            ax.set_title(f"{title} - image illisible")
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(f"{title} - Ex {i+1}")
    plt.show()

# Show a row of random samples for each class.
for class_paths, caption in (
    (positive_paths, "Gram Positif (Violet)"),
    (negative_paths, "Gram Négatif (Rose)"),
):
    show_images(class_paths, caption)

# Class balance as a pie chart.
labels = ['Gram Positif', 'Gram Négatif']
sizes = [len(group) for group in (positive_paths, negative_paths)]
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.title('Distribution des Classes')
plt.show()

In [None]:
import os
from skimage.feature import graycomatrix, graycoprops
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Image loading and preprocessing
def load_and_preprocess_images(data_dir, img_size=(128, 128)):
    """Load, resize and contrast-enhance every readable image of both classes.

    Images are read from ``<data_dir>/gram_negative`` (label 0) and
    ``<data_dir>/gram_positive`` (label 1). Files that OpenCV cannot decode
    are skipped.

    Contrast is enhanced with CLAHE on the lightness channel of the LAB
    colour space only, so the stain colour is preserved. Applying CLAHE to a
    grayscale copy and replicating it to three channels would discard the
    violet/pink information and make hue/saturation features constant.

    Args:
        data_dir: Folder containing the two class sub-folders.
        img_size: Target size handed to ``cv2.resize``.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays: RGB images scaled to
        [0, 1], and the matching 0/1 labels.

    Raises:
        FileNotFoundError: If a class folder does not exist.
    """
    # One CLAHE object is enough; it does not need rebuilding per image.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    images = []
    labels = []  # 0: neg, 1: pos
    for label, category in enumerate(['gram_negative', 'gram_positive']):
        path = os.path.join(data_dir, category)
        # os.listdir() order is arbitrary; sorting keeps the sample order,
        # and therefore any seeded train/test split, reproducible.
        for img_name in sorted(os.listdir(path)):
            img = cv2.imread(os.path.join(path, img_name))
            if img is None:
                continue  # not an image, or unreadable
            img = cv2.resize(img, img_size)
            # Equalize lightness only; the a/b chroma channels stay untouched.
            lightness, chroma_a, chroma_b = cv2.split(
                cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
            )
            enhanced = cv2.merge((clahe.apply(lightness), chroma_a, chroma_b))
            img = cv2.cvtColor(enhanced, cv2.COLOR_LAB2RGB)
            images.append(img / 255.0)
            labels.append(label)
    return np.array(images), np.array(labels)

# Load the whole dataset into memory.
data_dir = '/content/dibas_data'
X, y = load_and_preprocess_images(data_dir)

# 80/20 hold-out split with a fixed seed.
# NOTE(review): the split is not stratified. The feature split further down
# repeats the same test_size/random_state to stay aligned with this one, so
# both calls must be changed together.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# On-the-fly augmentation used when training the CNN.
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)

# Feature extraction for the classical ML models (HSV histograms + GLCM)
def extract_features(img_rgb, bins=32):
    """Build a colour + texture feature vector for one RGB image.

    Args:
        img_rgb: RGB image, either floating point scaled to [0, 1] (as
            produced by the preprocessing step) or integer valued in
            [0, 255].
        bins: Number of histogram bins per HSV channel.

    Returns:
        1-D array of length ``3 * bins + 4``: the H, S and V histograms
        (raw pixel counts) followed by GLCM contrast, energy, homogeneity
        and dissimilarity.
    """
    img_rgb = np.asarray(img_rgb)
    if np.issubdtype(img_rgb.dtype, np.floating):
        # Round before casting: astype() truncates toward zero, so a value
        # such as 254.99999 left by the /255 round trip would become 254.
        img_rgb = np.clip(np.rint(img_rgb * 255), 0, 255).astype(np.uint8)
    else:
        # Already on the 0-255 scale; multiplying by 255 would overflow.
        img_rgb = img_rgb.astype(np.uint8)

    img_hsv = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2HSV)
    # OpenCV stores 8-bit hue in [0, 180); saturation and value in [0, 256).
    channel_ranges = ([0, 180], [0, 256], [0, 256])
    color_features = np.concatenate([
        cv2.calcHist([img_hsv], [channel], None, [bins], value_range).flatten()
        for channel, value_range in enumerate(channel_ranges)
    ])

    img_gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
    # Single offset: horizontal neighbour at distance 1.
    glcm = graycomatrix(img_gray, distances=[1], angles=[0], levels=256,
                        symmetric=True, normed=True)
    texture_features = np.hstack([
        graycoprops(glcm, prop).ravel()
        for prop in ('contrast', 'energy', 'homogeneity', 'dissimilarity')
    ])

    return np.concatenate([color_features, texture_features])

# One feature vector per image, stacked row-wise.
X_features = np.array(list(map(extract_features, X)))
# Reuses the test_size/random_state of the image split so that the feature
# rows line up with y_train / y_test; the label halves are therefore dropped.
X_train_feat, X_test_feat = train_test_split(
    X_features, y, test_size=0.2, random_state=42
)[:2]
print(f"Shape des features pour ML: {X_features.shape}")

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SVM (RBF kernel).
# The RBF kernel is distance based and not scale invariant, while the
# feature vector mixes raw histogram counts with GLCM statistics of much
# smaller magnitude. The features are therefore standardized first. The
# scaler lives in the pipeline, so it is fitted on the training rows only
# and is saved together with the classifier.
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=1.0, random_state=42),
)
svm_model.fit(X_train_feat, y_train)
y_pred_svm = svm_model.predict(X_test_feat)
acc_svm = accuracy_score(y_test, y_pred_svm)
print(f"Précision SVM : {acc_svm:.2f}")

# Random Forest (tree based: insensitive to feature scale, no scaler needed).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_feat, y_train)
y_pred_rf = rf_model.predict(X_test_feat)
acc_rf = accuracy_score(y_test, y_pred_rf)
print(f"Précision Random Forest : {acc_rf:.2f}")

# Decision Tree (likewise scale insensitive).
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_feat, y_train)
y_pred_dt = dt_model.predict(X_test_feat)
acc_dt = accuracy_score(y_test, y_pred_dt)
print(f"Précision Arbre de Décision : {acc_dt:.2f}")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Small CNN: three conv/max-pool stages followed by a dense head with a
# single sigmoid output for the binary Gram-/Gram+ decision.
cnn_model = Sequential()
cnn_model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)))
cnn_model.add(MaxPooling2D(2, 2))
for n_filters in (64, 128):
    cnn_model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    cnn_model.add(MaxPooling2D(2, 2))
cnn_model.add(Flatten())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on augmented batches; the held-out test split doubles as the
# validation set reported after each epoch.
train_batches = datagen.flow(X_train, y_train, batch_size=32)
history_cnn = cnn_model.fit(
    train_batches,
    validation_data=(X_test, y_test),
    epochs=20,
)

# Final score on the (non-augmented) test split.
test_loss_cnn, acc_cnn = cnn_model.evaluate(X_test, y_test)
print(f"Précision CNN : {acc_cnn:.2f}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Gather the test accuracy of every model into one table.
model_names = ['SVM', 'Random Forest', 'Arbre de Décision', 'CNN']
test_scores = [acc_svm, acc_rf, acc_dt, acc_cnn]
results = {'Modèle': model_names, 'Précision Test': test_scores}
df = pd.DataFrame(results)
display(df)  # rendered as a table by the notebook

# Bar chart of the same numbers, with the y-axis fixed to [0, 1].
bar_colors = ['blue', 'green', 'red', 'purple']
plt.bar(df['Modèle'], df['Précision Test'], color=bar_colors)
plt.ylim(0, 1)
plt.xticks(rotation=45)
plt.ylabel('Précision')
plt.title('Comparaison des Précisions')
plt.show()

In [None]:
import joblib

# Persist the scikit-learn models with joblib, one file per model.
for fitted_model, target_path in (
    (svm_model, '/content/drive/MyDrive/svm_gram.pkl'),
    (rf_model, '/content/drive/MyDrive/rf_gram.pkl'),
    (dt_model, '/content/drive/MyDrive/dt_gram.pkl'),
):
    joblib.dump(fitted_model, target_path)

# The Keras model is written through its own save() method (HDF5 file).
cnn_model.save('/content/drive/MyDrive/cnn_gram.h5')

print("Modèles sauvegardés sur Google Drive!")

## Conclusion

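Ce pipeline couvre toute la chaîne du projet : téléchargement des données, prétraitement, entraînement, comparaison et sauvegarde des modèles. Les modèles ML sont rapides et interprétables, le CNN capture des patterns complexes. Ajustez les hyperparamètres si besoin (p. ex. plus d'epochs pour le CNN). Précisions attendues, à titre indicatif et à confirmer sur vos propres exécutions : CNN ~85-90 %, ML ~70-85 %.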