# Fine-tuning et Validation du Modèle Random Forest

Ce notebook se concentre sur:
1. Le chargement du modèle Random Forest entraîné précédemment
2. L'optimisation des hyperparamètres par validation croisée (GridSearchCV)
3. L'évaluation comparative des performances entre ensembles de validation et de test
4. L'analyse des erreurs et l'ajustement des biais potentiels

In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import time
import pickle 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import GridSearchCV, cross_val_score

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'seaborn-v0_8-whitegrid' style sheet to the figures of this notebook.
plt.style.use('seaborn-v0_8-whitegrid')

# On charge la sauvegarde du modèle

In [None]:

# Folder that holds every pickled artefact read by this cell.
numpy_models_dir = 'numpy_models'


def _read_pickle(file_name):
    """Return the object unpickled from ``file_name`` inside ``numpy_models_dir``."""
    # NOTE(review): pickle.load can execute arbitrary code; only read trusted files.
    with open(os.path.join(numpy_models_dir, file_name), 'rb') as handle:
        return pickle.load(handle)


try:
    # Mandatory artefacts: the train/validation arrays, then the base model.
    train_features = _read_pickle('train_features.pkl')
    train_labels = _read_pickle('train_labels.pkl')
    val_features = _read_pickle('val_features.pkl')
    val_labels = _read_pickle('val_labels.pkl')

    base_model = _read_pickle('rf_model.pkl')

    print(f"Données et modèle chargés avec succès depuis le dossier '{numpy_models_dir}'!")

    # Optional artefact: its absence is reported but is not treated as a failure.
    try:
        performance_metrics = _read_pickle('performance_metrics.pkl')
        print("Métriques de performance également chargées!")
    except FileNotFoundError:
        print("Fichier de métriques non trouvé (facultatif).")

except FileNotFoundError:
    print(f"Fichiers de sauvegarde non trouvés dans le dossier '{numpy_models_dir}'.")
    print("Assurez-vous d'avoir exécuté le notebook précédent et vérifié le chemin.")

# Show the array shapes; a NameError here means the loading above did not
# define every variable.
try:
    print("\nDimensions des données:")
    print(f"Caractéristiques d'entraînement: {train_features.shape}")
    print(f"Étiquettes d'entraînement: {train_labels.shape}")
    print(f"Caractéristiques de validation: {val_features.shape}")
    print(f"Étiquettes de validation: {val_labels.shape}")
except NameError:
    print("Les données n'ont pas pu être chargées, impossible d'afficher les dimensions.")

# On réévalue les performances du modèle de base sur l'ensemble de validation

In [None]:
# Predict the validation split with the base model, then score the
# predictions with the four classification metrics (same order as the names).
val_predictions = base_model.predict(val_features)
accuracy, precision, recall, f1 = (
    score_fn(val_labels, val_predictions)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# One print call, one line per metric.
print(
    "\n".join([
        "\nPerformances du modèle de base sur l'ensemble de validation:",
        f"Précision (accuracy): {accuracy:.4f}",
        f"Précision (precision): {precision:.4f}",
        f"Rappel (recall): {recall:.4f}",
        f"Score F1: {f1:.4f}",
    ])
)

# On applique le fine-tuning via GridSearchCV

In [None]:
print("\nDébut du fine-tuning des hyperparamètres...")

# Hyper-parameter grid: 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search scored on accuracy; n_jobs=-1 asks for all cores.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Wall-clock timing of the whole search.
start_time = time.time()
grid_search.fit(train_features, train_labels)
end_time = time.time()
elapsed_seconds = end_time - start_time

print(f"Fine-tuning terminé en {elapsed_seconds:.2f} secondes")
print(f"Meilleurs paramètres: {grid_search.best_params_}")
print(f"Meilleure précision (validation croisée): {grid_search.best_score_:.4f}")

# Keep the winning estimator and persist it in the working directory.
best_model = grid_search.best_estimator_

with open('best_rf_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("Meilleur modèle sauvegardé!")

# On réévalue les données avec le modèle fine-tuné

In [None]:
# Reload the tuned model from the file written by the grid-search cell.
# NOTE(review): pickle.load can execute arbitrary code; only read trusted files.
with open('best_rf_model.pkl', 'rb') as f:
    best_model = pickle.load(f)
print("Modèle optimisé chargé avec succès!")

# Reload the train/validation arrays from the folder the first loading cell
# reads them from (numpy_models_dir). Reading them from the working directory
# looked in the wrong place and could pick up stale copies.
try:
    with open(os.path.join(numpy_models_dir, 'train_features.pkl'), 'rb') as f:
        train_features = pickle.load(f)
    with open(os.path.join(numpy_models_dir, 'train_labels.pkl'), 'rb') as f:
        train_labels = pickle.load(f)
    with open(os.path.join(numpy_models_dir, 'val_features.pkl'), 'rb') as f:
        val_features = pickle.load(f)
    with open(os.path.join(numpy_models_dir, 'val_labels.pkl'), 'rb') as f:
        val_labels = pickle.load(f)
    print("Caractéristiques train/val chargées avec succès!")
except FileNotFoundError:
    # Best effort: the arrays already in memory (if any) are used below.
    print("Caractéristiques non trouvées. Veuillez exécuter le code d'extraction.")

# Scores of the tuned model on the data it was fitted on.
print("\nÉvaluation sur l'ensemble d'entraînement...")
train_predictions = best_model.predict(train_features)
train_accuracy = accuracy_score(train_labels, train_predictions)
train_precision = precision_score(train_labels, train_predictions)
train_recall = recall_score(train_labels, train_predictions)
train_f1 = f1_score(train_labels, train_predictions)

# Scores of the tuned model on the validation split.
print("Évaluation sur l'ensemble de validation...")
val_predictions = best_model.predict(val_features)
val_accuracy = accuracy_score(val_labels, val_predictions)
val_precision = precision_score(val_labels, val_predictions)
val_recall = recall_score(val_labels, val_predictions)
val_f1 = f1_score(val_labels, val_predictions)

print("Extraction et évaluation sur l'ensemble de test...")
# Folder holding the test images, one sub-folder per class.
test_dir = os.path.join('preprocessed_chest_xray', 'test')

def _image_intensity_features(img_array):
    """Return the 13 intensity statistics of one 2-D image array.

    Order: global mean, std, min, max, median, then (mean, std) for the
    top-left, top-right, bottom-left and bottom-right quadrants.
    """
    h_mid, w_mid = img_array.shape[0] // 2, img_array.shape[1] // 2

    stats = [
        np.mean(img_array), np.std(img_array),
        np.min(img_array), np.max(img_array), np.median(img_array),
    ]

    quadrants = (
        img_array[:h_mid, :w_mid],   # Q1: top-left
        img_array[:h_mid, w_mid:],   # Q2: top-right
        img_array[h_mid:, :w_mid],   # Q3: bottom-left
        img_array[h_mid:, w_mid:],   # Q4: bottom-right
    )
    for quadrant in quadrants:
        stats.extend((np.mean(quadrant), np.std(quadrant)))

    return stats


def load_and_extract_features(directory, max_samples=None):
    """Extract intensity features from the images stored under ``directory``.

    ``directory`` must contain the sub-folders ``NORMAL`` (label 0) and
    ``PNEUMONIA`` (label 1). Only ``.jpeg``, ``.jpg`` and ``.png`` files are
    read (case-insensitive).

    Args:
        directory: Root folder containing the two class sub-folders.
        max_samples: Optional cap on the number of images read per class.
            Files are taken in sorted name order so the subset is reproducible.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays: ``features`` has one
        row of 13 statistics per image (shape ``(0, 13)`` when no image is
        found) and ``labels`` holds the matching class index.

    Raises:
        FileNotFoundError: If a class sub-folder does not exist.
    """
    features = []
    labels = []

    for class_label, class_name in enumerate(['NORMAL', 'PNEUMONIA']):
        class_dir = os.path.join(directory, class_name)

        # os.listdir returns entries in arbitrary order; sorting makes the
        # max_samples subset and the row order deterministic.
        image_files = sorted(
            f for f in os.listdir(class_dir)
            if f.lower().endswith(('.jpeg', '.jpg', '.png'))
        )

        if max_samples is not None:
            image_files = image_files[:max_samples]

        print(f"Chargement de {len(image_files)} images {class_name}...")

        for i, img_file in enumerate(image_files):
            if i > 0 and i % 100 == 0:
                print(f"  - {i}/{len(image_files)} images traitées...")

            img_path = os.path.join(class_dir, img_file)
            # The context manager closes the underlying file once the pixels
            # have been copied into the array (no descriptor leak).
            with Image.open(img_path) as img:
                img_array = np.array(img)
                if img_array.ndim != 2:
                    # Multi-channel image (e.g. RGB): collapse to one
                    # grayscale channel instead of failing on the 2-D split.
                    img_array = np.array(img.convert('L'))

            features.append(_image_intensity_features(img_array))
            labels.append(class_label)

    # reshape keeps a consistent (n_images, 13) layout even when n_images == 0.
    return np.array(features).reshape(-1, 13), np.array(labels)

# Use the cached test arrays when both pickles are present in the working
# directory; otherwise extract them from the images and write the cache.
try:
    with open('test_features.pkl', 'rb') as cache_file:
        test_features = pickle.load(cache_file)
    with open('test_labels.pkl', 'rb') as cache_file:
        test_labels = pickle.load(cache_file)
    print("Caractéristiques de test chargées avec succès!")
except FileNotFoundError:
    print("Extraction des caractéristiques de test...")
    test_features, test_labels = load_and_extract_features(test_dir)

    # Features first, labels second.
    for cache_name, cache_payload in (
        ('test_features.pkl', test_features),
        ('test_labels.pkl', test_labels),
    ):
        with open(cache_name, 'wb') as cache_file:
            pickle.dump(cache_payload, cache_file)
    print("Caractéristiques de test extraites et sauvegardées!")

# Score the tuned model on the test split with the four metrics
# (same order as the names).
test_predictions = best_model.predict(test_features)
test_accuracy, test_precision, test_recall, test_f1 = (
    score_fn(test_labels, test_predictions)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

def _print_split_metrics(heading, acc, prec, rec, f1_value):
    """Print ``heading`` followed by the four scores of one data split."""
    print(heading)
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1_value:.4f}")


def _print_confusion(heading, y_true, y_pred):
    """Print ``heading`` followed by the confusion matrix of one data split."""
    print(heading)
    print(confusion_matrix(y_true, y_pred))


print("\n===== PERFORMANCES DU MODÈLE OPTIMISÉ =====")

_print_split_metrics("\nEnsemble d'entraînement:",
                     train_accuracy, train_precision, train_recall, train_f1)
_print_split_metrics("\nEnsemble de validation:",
                     val_accuracy, val_precision, val_recall, val_f1)
_print_split_metrics("\nEnsemble de test:",
                     test_accuracy, test_precision, test_recall, test_f1)

print("\nMatrices de confusion:")
_print_confusion("Train:", train_labels, train_predictions)
_print_confusion("\nValidation:", val_labels, val_predictions)
_print_confusion("\nTest:", test_labels, test_predictions)

# Accuracy differences between each pair of splits.
print("\n===== ANALYSE DES ÉCARTS =====")
print(
    f"Écart accuracy train-val: {train_accuracy - val_accuracy:.4f}\n"
    f"Écart accuracy train-test: {train_accuracy - test_accuracy:.4f}\n"
    f"Écart accuracy val-test: {val_accuracy - test_accuracy:.4f}"
)

# Display names of the 13 features, in the order the extraction function
# emits them: five global statistics, then (mean, std) per quadrant.
feature_names = [
    'Moyenne globale',
    'Écart-type global',
    'Min global',
    'Max global',
    'Médiane globale',
    'Moyenne Q1', 'Écart-type Q1',
    'Moyenne Q2', 'Écart-type Q2',
    'Moyenne Q3', 'Écart-type Q3',
    'Moyenne Q4', 'Écart-type Q4',
]

# Indices of the features ordered from most to least important.
feature_importances = best_model.feature_importances_
sorted_idx = np.argsort(feature_importances)[::-1]

bar_positions = range(len(feature_importances))
ranked_labels = [feature_names[idx] for idx in sorted_idx]

# Bar chart of the importances, highest first.
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, feature_importances[sorted_idx])
plt.xticks(bar_positions, ranked_labels, rotation=90)
plt.title('Importance des caractéristiques')
plt.tight_layout()
plt.show()

print("\nTop 5 des caractéristiques les plus importantes:")
for i in range(5):
    top_idx = sorted_idx[i]
    print(f"{i+1}. {feature_names[top_idx]}: {feature_importances[top_idx]:.4f}")

# Surapprentissage détecté (une accuracy de 1.00 à l'entraînement n'est pas réaliste) : ajout de contraintes

In [None]:

print("\n===== TEST D'UN MODÈLE AVEC MOINS DE SURAPPRENTISSAGE =====")

# Random forest with limited depth and a minimum leaf size, fitted on the
# same training data as the grid-searched model.
better_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42
)

better_model.fit(train_features, train_labels)

# Accuracy of the constrained model on each split.
better_train_preds = better_model.predict(train_features)
better_train_acc = accuracy_score(train_labels, better_train_preds)

better_val_preds = better_model.predict(val_features)
better_val_acc = accuracy_score(val_labels, better_val_preds)

better_test_preds = better_model.predict(test_features)
better_test_acc = accuracy_score(test_labels, better_test_preds)

print("\nPerformances du modèle moins sujet au surapprentissage:")
print(f"Accuracy Train: {better_train_acc:.4f}")
print(f"Accuracy Validation: {better_val_acc:.4f}")
print(f"Accuracy Test: {better_test_acc:.4f}")

print("\nÉcarts d'accuracy:")
print(f"Train-Val: {better_train_acc - better_val_acc:.4f}")
print(f"Train-Test: {better_train_acc - better_test_acc:.4f}")
print(f"Val-Test: {better_val_acc - better_test_acc:.4f}")

# The "before" figures come from the accuracies computed for the
# grid-searched model in the previous cell, not from constants copied out of
# an earlier run, so the comparison stays correct when the notebook is re-run.
# The values are accuracies, so the labels say "Accuracy" rather than "Écart".
print("\nComparaison avec le modèle optimisé par GridSearchCV:")
print(f"Accuracy Train - Avant: {train_accuracy:.4f}, Après: {better_train_acc:.4f}")
print(f"Accuracy Val - Avant: {val_accuracy:.4f}, Après: {better_val_acc:.4f}")
print(f"Accuracy Test - Avant: {test_accuracy:.4f}, Après: {better_test_acc:.4f}")
print(f"Réduction de l'écart train-test: {(train_accuracy - test_accuracy) - (better_train_acc - better_test_acc):.4f}")

better_test_conf = confusion_matrix(test_labels, better_test_preds)
print("\nNouvelle matrice de confusion (test):")
print(better_test_conf)

# Conclusion

Après avoir testé trois configurations de Random Forest pour la détection de pneumonie (modèle de base, modèle optimisé par GridSearchCV et modèle contraint), nous constatons des performances similaires (environ 82 % d'exactitude, ou *accuracy*), mais des comportements différents en termes de surapprentissage. Le modèle avec contraintes (profondeur limitée à 10 et minimum de 5 échantillons par feuille) offre le meilleur équilibre entre performance et généralisation, avec un écart train-test réduit à 5,8 % contre 17,6 % pour le modèle optimisé par GridSearchCV. Cette expérience démontre l'importance de trouver un compromis entre l'ajustement aux données d'entraînement et la capacité de généralisation, plutôt que de chercher uniquement à maximiser l'exactitude.