# Caractéristiques avancées pour la détection de pneumonie

Constatant que le fine-tuning des hyperparamètres de Random Forest ne permet pas de dépasser 82% d'accuracy, nous explorons dans ce notebook l'extraction de caractéristiques avancées avec HOG (Histogram of Oriented Gradients) et LBP (Local Binary Patterns).

## Pourquoi ces descripteurs améliorent les performances

Contrairement aux statistiques simples (moyennes, écarts-types), qui ne capturent que des informations globales, HOG et LBP extraient des informations locales :
- **Contours et structures** (HOG) : gradients d'intensité révélateurs des opacités pneumoniques
- **Textures locales** (LBP) : motifs subtils des infiltrats et altérations tissulaires

La combinaison HOG+LBP offre une représentation complémentaire et bien plus riche des radiographies, permettant au même algorithme Random Forest d'atteindre des performances nettement supérieures.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import time
import pickle

from skimage.feature import hog, local_binary_pattern
from skimage import exposure

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')


# Vérification des dossiers

In [None]:
# Make sure the output directory for pickled features and models exists.
models_dir = 'models'
if os.path.isdir(models_dir):
    print(f"Le dossier '{models_dir}' existe déjà.")
else:
    # exist_ok covers the directory appearing between the check and the call.
    os.makedirs(models_dir, exist_ok=True)
    print(f"Dossier '{models_dir}' créé avec succès.")

# Locations of the three preprocessed dataset splits.
preprocessed_base = 'preprocessed_chest_xray'
train_dir = os.path.join(preprocessed_base, 'train')
val_dir = os.path.join(preprocessed_base, 'val')
test_dir = os.path.join(preprocessed_base, 'test')


def _count_images(class_dir):
    """Return the number of .jpeg/.jpg/.png files in class_dir.

    A missing class directory is reported and counted as 0 instead of
    raising, so the remaining splits are still checked.
    """
    if not os.path.isdir(class_dir):
        print(f"Attention: Le dossier {class_dir} n'existe pas!")
        return 0
    # Same extension filter as the feature loader, so the counts printed here
    # match the number of images that will actually be loaded.
    return sum(
        1 for name in os.listdir(class_dir)
        if name.lower().endswith(('.jpeg', '.jpg', '.png'))
    )


# Report the class balance of every split that is present on disk.
for directory in [train_dir, val_dir, test_dir]:
    if not os.path.exists(directory):
        print(f"Attention: Le dossier {directory} n'existe pas!")
    else:
        normal_count = _count_images(os.path.join(directory, 'NORMAL'))
        pneumonia_count = _count_images(os.path.join(directory, 'PNEUMONIA'))
        print(f"{directory}: {normal_count} images normales, {pneumonia_count} images pneumonie")

# Définition des fonctions pour extraire les caractéristiques HOG et LBP

In [None]:
def extract_hog_features(img_array, orientations=9, pixels_per_cell=(16, 16),
                         cells_per_block=(2, 2)):
    """Compute the HOG (Histogram of Oriented Gradients) descriptor of an image.

    Parameters
    ----------
    img_array : array-like
        Image passed straight to ``skimage.feature.hog``. No channel axis is
        specified, so a single-channel (2-D) image is expected.
    orientations : int
        Number of gradient orientation bins (default 9).
    pixels_per_cell : tuple of int
        Cell size in pixels (default (16, 16)).
    cells_per_block : tuple of int
        Number of cells per normalisation block (default (2, 2)).

    Returns
    -------
    The value returned by ``skimage.feature.hog`` with ``visualize=False``,
    i.e. the descriptor only, without the visualisation image.
    """
    return hog(
        img_array,
        orientations=orientations,
        pixels_per_cell=pixels_per_cell,
        cells_per_block=cells_per_block,
        block_norm='L2-Hys',
        visualize=False,
        # Square-root compression of intensities before computing gradients.
        transform_sqrt=True,
    )

def extract_lbp_features(img_array, P=24, R=3):
    """Summarise an image as a normalised histogram of its LBP codes.

    Parameters
    ----------
    img_array : array-like
        Image handed to ``skimage.feature.local_binary_pattern``.
    P : int
        Number of neighbour points sampled on the circle (default 24).
    R : int or float
        Radius of the circle (default 3).

    Returns
    -------
    numpy.ndarray
        Histogram with ``P + 2`` bins over the range ``[0, P + 2)``, computed
        with ``density=True``.
    """
    # The histogram is sized at P + 2 bins of width 1 starting at 0.
    bin_count = P + 2
    codes = local_binary_pattern(img_array, P=P, R=R, method='uniform')
    histogram = np.histogram(
        codes.ravel(),
        bins=bin_count,
        range=(0, bin_count),
        density=True,
    )[0]
    return histogram

def load_and_extract_advanced_features(directory, feature_type='hog', max_samples=None):
    """
    Load the images of one dataset split and extract HOG and/or LBP features.

    Parameters
    ----------
    directory : str
        Path to a directory containing the 'NORMAL' and 'PNEUMONIA' sub-folders.
    feature_type : str
        Which descriptor to compute: 'hog', 'lbp', or 'both' (the HOG vector
        followed by the LBP histogram).
    max_samples : int or None
        Maximum number of images loaded per class (None loads everything).

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` with one entry per image. Labels are 0 for
        'NORMAL' and 1 for 'PNEUMONIA'. Rows are grouped by class and, within
        a class, ordered by file name.

    Raises
    ------
    ValueError
        If ``feature_type`` is not one of 'hog', 'lbp', 'both'.
    """
    # Fail fast: an unknown type would otherwise only surface inside the loop
    # as a NameError, or silently reuse the previous image's features.
    if feature_type not in ('hog', 'lbp', 'both'):
        raise ValueError(
            f"feature_type doit être 'hog', 'lbp' ou 'both' (reçu: {feature_type!r})"
        )

    features = []
    labels = []

    for class_label, class_name in enumerate(['NORMAL', 'PNEUMONIA']):
        class_dir = os.path.join(directory, class_name)

        # os.listdir gives no ordering guarantee. Sorting keeps the
        # max_samples subset and the row order identical from one call to the
        # next, so labels from one call can be paired with features from another.
        image_files = sorted(
            f for f in os.listdir(class_dir)
            if f.lower().endswith(('.jpeg', '.jpg', '.png'))
        )

        if max_samples is not None:
            image_files = image_files[:max_samples]

        print(f"Chargement de {len(image_files)} images {class_name}...")

        for i, img_file in enumerate(image_files):
            # Progress message every 100 images.
            if i > 0 and i % 100 == 0:
                print(f"  - {i}/{len(image_files)} images traitées...")

            img_path = os.path.join(class_dir, img_file)
            # The context manager closes the file handle once pixels are read.
            with Image.open(img_path) as img:
                img_array = np.array(img)
                # The extractors are called without a channel axis, so any
                # image that does not decode to a 2-D array is converted to
                # 8-bit greyscale first.
                if img_array.ndim != 2:
                    img_array = np.array(img.convert('L'))

            # NOTE(review): assumes all images share the same size so that HOG
            # vectors have equal length -- confirm against the preprocessing step.
            if feature_type == 'hog':
                img_features = extract_hog_features(img_array)
            elif feature_type == 'lbp':
                img_features = extract_lbp_features(img_array)
            else:  # 'both', guaranteed by the validation above
                hog_features = extract_hog_features(img_array)
                lbp_features = extract_lbp_features(img_array)
                img_features = np.concatenate((hog_features, lbp_features))

            features.append(img_features)
            labels.append(class_label)

    return np.array(features), np.array(labels)

# Extraction et sauvegarde des caractéristiques HOG

In [None]:
print("\nExtraction des caractéristiques HOG...")
# One pass per split; the labels returned here are reused by the later cells.
train_hog_features, train_labels = load_and_extract_advanced_features(
    train_dir, feature_type='hog')
val_hog_features, val_labels = load_and_extract_advanced_features(
    val_dir, feature_type='hog')
test_hog_features, test_labels = load_and_extract_advanced_features(
    test_dir, feature_type='hog')

print("\nDimensions des caractéristiques HOG:")
print(f"Train: {train_hog_features.shape}")
print(f"Validation: {val_hog_features.shape}")
print(f"Test: {test_hog_features.shape}")

# Persist every array as its own pickle inside models_dir
# (feature matrices first, then the label vectors).
hog_artifacts = [
    ('train_hog_features.pkl', train_hog_features),
    ('val_hog_features.pkl', val_hog_features),
    ('test_hog_features.pkl', test_hog_features),
    ('train_labels.pkl', train_labels),
    ('val_labels.pkl', val_labels),
    ('test_labels.pkl', test_labels),
]
for artifact_name, artifact in hog_artifacts:
    with open(os.path.join(models_dir, artifact_name), 'wb') as f:
        pickle.dump(artifact, f)
print("Caractéristiques HOG sauvegardées avec succès dans le dossier 'models'!")

# Entraînement et évaluation du modèle HOG

In [None]:

print("\nEntraînement du modèle Random Forest avec caractéristiques HOG...")
start_time = time.time()

# Depth- and leaf-limited forest with a fixed seed for reproducible runs.
rf_hog_params = dict(n_estimators=100, max_depth=10, min_samples_leaf=5, random_state=42)
rf_hog_model = RandomForestClassifier(**rf_hog_params)
rf_hog_model.fit(train_hog_features, train_labels)

training_time = time.time() - start_time
print(f"Modèle entraîné en {training_time:.2f} secondes")

print("\nÉvaluation du modèle avec caractéristiques HOG:")

# Training split: accuracy only.
train_predictions = rf_hog_model.predict(train_hog_features)
train_accuracy = accuracy_score(train_labels, train_predictions)
print(f"Accuracy Train: {train_accuracy:.4f}")

# Validation split: accuracy, precision, recall and F1 in one pass.
val_predictions = rf_hog_model.predict(val_hog_features)
val_accuracy, val_precision, val_recall, val_f1 = (
    score(val_labels, val_predictions)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy Validation: {val_accuracy:.4f}")
print(f"Precision Validation: {val_precision:.4f}")
print(f"Recall Validation: {val_recall:.4f}")
print(f"F1 Score Validation: {val_f1:.4f}")

# Test split: same four metrics.
test_predictions = rf_hog_model.predict(test_hog_features)
test_accuracy, test_precision, test_recall, test_f1 = (
    score(test_labels, test_predictions)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy Test: {test_accuracy:.4f}")
print(f"Precision Test: {test_precision:.4f}")
print(f"Recall Test: {test_recall:.4f}")
print(f"F1 Score Test: {test_f1:.4f}")

print("\nMatrice de confusion (Test):")
conf_matrix = confusion_matrix(test_labels, test_predictions)
print(conf_matrix)

# Persist the fitted model next to the pickled features.
rf_hog_model_path = os.path.join(models_dir, 'rf_hog_model.pkl')
with open(rf_hog_model_path, 'wb') as f:
    pickle.dump(rf_hog_model, f)
print("Modèle HOG sauvegardé avec succès dans le dossier 'models'!")

# Summary kept for the final comparison table.
hog_metrics = dict(
    train_accuracy=train_accuracy,
    val_accuracy=val_accuracy,
    test_accuracy=test_accuracy,
    test_precision=test_precision,
    test_recall=test_recall,
    test_f1=test_f1,
)

# Extraction et évaluation des caractéristiques LBP

In [None]:

print("\nExtraction des caractéristiques LBP...")
# Labels are discarded here: the cell reuses train/val/test_labels from the HOG step.
train_lbp_features, _ = load_and_extract_advanced_features(
    train_dir, feature_type='lbp')
val_lbp_features, _ = load_and_extract_advanced_features(
    val_dir, feature_type='lbp')
test_lbp_features, _ = load_and_extract_advanced_features(
    test_dir, feature_type='lbp')

print("\nDimensions des caractéristiques LBP:")
print(f"Train: {train_lbp_features.shape}")
print(f"Validation: {val_lbp_features.shape}")
print(f"Test: {test_lbp_features.shape}")

# Persist each feature matrix as its own pickle inside models_dir.
lbp_artifacts = [
    ('train_lbp_features.pkl', train_lbp_features),
    ('val_lbp_features.pkl', val_lbp_features),
    ('test_lbp_features.pkl', test_lbp_features),
]
for artifact_name, artifact in lbp_artifacts:
    with open(os.path.join(models_dir, artifact_name), 'wb') as f:
        pickle.dump(artifact, f)
print("Caractéristiques LBP sauvegardées avec succès dans le dossier 'models'!")

print("\nEntraînement du modèle Random Forest avec caractéristiques LBP...")
start_time = time.time()

# Same forest configuration as the HOG model, so only the features differ.
rf_lbp_params = dict(n_estimators=100, max_depth=10, min_samples_leaf=5, random_state=42)
rf_lbp_model = RandomForestClassifier(**rf_lbp_params)
rf_lbp_model.fit(train_lbp_features, train_labels)

training_time = time.time() - start_time
print(f"Modèle entraîné en {training_time:.2f} secondes")

print("\nÉvaluation du modèle avec caractéristiques LBP:")

# Training split: accuracy only.
train_predictions = rf_lbp_model.predict(train_lbp_features)
train_accuracy_lbp = accuracy_score(train_labels, train_predictions)
print(f"Accuracy Train: {train_accuracy_lbp:.4f}")

# Validation split: accuracy, precision, recall and F1 in one pass.
val_predictions = rf_lbp_model.predict(val_lbp_features)
val_accuracy_lbp, val_precision_lbp, val_recall_lbp, val_f1_lbp = (
    score(val_labels, val_predictions)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy Validation: {val_accuracy_lbp:.4f}")
print(f"Precision Validation: {val_precision_lbp:.4f}")
print(f"Recall Validation: {val_recall_lbp:.4f}")
print(f"F1 Score Validation: {val_f1_lbp:.4f}")

# Test split: same four metrics.
test_predictions = rf_lbp_model.predict(test_lbp_features)
test_accuracy_lbp, test_precision_lbp, test_recall_lbp, test_f1_lbp = (
    score(test_labels, test_predictions)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy Test: {test_accuracy_lbp:.4f}")
print(f"Precision Test: {test_precision_lbp:.4f}")
print(f"Recall Test: {test_recall_lbp:.4f}")
print(f"F1 Score Test: {test_f1_lbp:.4f}")

print("\nMatrice de confusion (Test):")
conf_matrix = confusion_matrix(test_labels, test_predictions)
print(conf_matrix)

# Persist the fitted model next to the pickled features.
rf_lbp_model_path = os.path.join(models_dir, 'rf_lbp_model.pkl')
with open(rf_lbp_model_path, 'wb') as f:
    pickle.dump(rf_lbp_model, f)
print("Modèle LBP sauvegardé avec succès dans le dossier 'models'!")

# Summary kept for the final comparison table.
lbp_metrics = dict(
    train_accuracy=train_accuracy_lbp,
    val_accuracy=val_accuracy_lbp,
    test_accuracy=test_accuracy_lbp,
    test_precision=test_precision_lbp,
    test_recall=test_recall_lbp,
    test_f1=test_f1_lbp,
)

# Extraction et évaluation des caractéristiques combinées HOG+LBP


In [None]:

print("\nExtraction des caractéristiques combinées HOG+LBP...")
# Labels are discarded here: the cell reuses train/val/test_labels from the HOG step.
train_combined_features, _ = load_and_extract_advanced_features(
    train_dir, feature_type='both')
val_combined_features, _ = load_and_extract_advanced_features(
    val_dir, feature_type='both')
test_combined_features, _ = load_and_extract_advanced_features(
    test_dir, feature_type='both')

print("\nDimensions des caractéristiques combinées:")
print(f"Train: {train_combined_features.shape}")
print(f"Validation: {val_combined_features.shape}")
print(f"Test: {test_combined_features.shape}")

# Persist each feature matrix as its own pickle inside models_dir.
combined_artifacts = [
    ('train_combined_features.pkl', train_combined_features),
    ('val_combined_features.pkl', val_combined_features),
    ('test_combined_features.pkl', test_combined_features),
]
for artifact_name, artifact in combined_artifacts:
    with open(os.path.join(models_dir, artifact_name), 'wb') as f:
        pickle.dump(artifact, f)
print("Caractéristiques combinées sauvegardées avec succès dans le dossier 'models'!")

print("\nEntraînement du modèle Random Forest avec caractéristiques combinées HOG+LBP...")
start_time = time.time()

# Same forest configuration as the HOG and LBP models, so only the features differ.
rf_combined_params = dict(n_estimators=100, max_depth=10, min_samples_leaf=5, random_state=42)
rf_combined_model = RandomForestClassifier(**rf_combined_params)
rf_combined_model.fit(train_combined_features, train_labels)

training_time = time.time() - start_time
print(f"Modèle entraîné en {training_time:.2f} secondes")

print("\nÉvaluation du modèle avec caractéristiques combinées HOG+LBP:")

# Training split: accuracy only.
train_predictions = rf_combined_model.predict(train_combined_features)
train_accuracy_combined = accuracy_score(train_labels, train_predictions)
print(f"Accuracy Train: {train_accuracy_combined:.4f}")

# Validation split: accuracy, precision, recall and F1 in one pass.
val_predictions = rf_combined_model.predict(val_combined_features)
(val_accuracy_combined, val_precision_combined,
 val_recall_combined, val_f1_combined) = (
    score(val_labels, val_predictions)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy Validation: {val_accuracy_combined:.4f}")
print(f"Precision Validation: {val_precision_combined:.4f}")
print(f"Recall Validation: {val_recall_combined:.4f}")
print(f"F1 Score Validation: {val_f1_combined:.4f}")

# Test split: same four metrics.
test_predictions = rf_combined_model.predict(test_combined_features)
(test_accuracy_combined, test_precision_combined,
 test_recall_combined, test_f1_combined) = (
    score(test_labels, test_predictions)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy Test: {test_accuracy_combined:.4f}")
print(f"Precision Test: {test_precision_combined:.4f}")
print(f"Recall Test: {test_recall_combined:.4f}")
print(f"F1 Score Test: {test_f1_combined:.4f}")

print("\nMatrice de confusion (Test):")
conf_matrix = confusion_matrix(test_labels, test_predictions)
print(conf_matrix)

# Persist the fitted model next to the pickled features.
rf_combined_model_path = os.path.join(models_dir, 'rf_combined_model.pkl')
with open(rf_combined_model_path, 'wb') as f:
    pickle.dump(rf_combined_model, f)
print("Modèle combiné sauvegardé avec succès dans le dossier 'models'!")

# Summary kept for the final comparison table.
combined_metrics = dict(
    train_accuracy=train_accuracy_combined,
    val_accuracy=val_accuracy_combined,
    test_accuracy=test_accuracy_combined,
    test_precision=test_precision_combined,
    test_recall=test_recall_combined,
    test_f1=test_f1_combined,
)

# Comparaison des modèles et conclusion

In [None]:
print("\n===== COMPARAISON DES MODÈLES =====")
models = ['Statistiques (Notebook 4)', 'HOG', 'LBP', 'HOG + LBP']

# Baseline accuracies, hard-coded here and shown as the first table row.
previous_model_train = 0.8773
previous_model_val = 0.7986
previous_model_test = 0.8189

# Metrics of the three models trained above, in the same order as models[1:].
advanced_metrics = (hog_metrics, lbp_metrics, combined_metrics)

# Accuracy columns stay numeric (they are plotted below). Precision, recall
# and F1 are formatted strings because the baseline row has no value ('N/A').
comparison_data = {
    'Modèle': models,
    'Accuracy Train': [previous_model_train]
    + [m['train_accuracy'] for m in advanced_metrics],
    'Accuracy Validation': [previous_model_val]
    + [m['val_accuracy'] for m in advanced_metrics],
    'Accuracy Test': [previous_model_test]
    + [m['test_accuracy'] for m in advanced_metrics],
    'Precision Test': ['N/A']
    + [f"{m['test_precision']:.4f}" for m in advanced_metrics],
    'Recall Test': ['N/A']
    + [f"{m['test_recall']:.4f}" for m in advanced_metrics],
    'F1 Score Test': ['N/A']
    + [f"{m['test_f1']:.4f}" for m in advanced_metrics],
}

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df)

comparison_df.to_csv(os.path.join(models_dir, 'model_comparison.csv'), index=False)
print(f"Résultats de comparaison sauvegardés dans {os.path.join(models_dir, 'model_comparison.csv')}")

# Grouped bar chart: one group per model, one bar per split.
plt.figure(figsize=(12, 7))
bar_width = 0.25
index = np.arange(len(models))

bar_positions = (index, index + bar_width, index + 2*bar_width)
bar_series = (
    ('Accuracy Train', 'Train', 'blue'),
    ('Accuracy Validation', 'Validation', 'green'),
    ('Accuracy Test', 'Test', 'red'),
)
for x_positions, (column, series_label, series_color) in zip(bar_positions, bar_series):
    plt.bar(x_positions, comparison_df[column], bar_width,
            label=series_label, color=series_color, alpha=0.7)

plt.xlabel('Modèle', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Comparaison des performances des différentes approches', fontsize=14)
# Tick labels sit under the middle bar of each group.
plt.xticks(index + bar_width, models, rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.savefig(os.path.join(models_dir, 'model_comparison.png'), dpi=300, bbox_inches='tight')
plt.show()

print("\n===== CONCLUSION =====")
best_model = 'HOG+LBP'
improvement = (combined_metrics['test_accuracy'] - previous_model_test) * 100