In [299]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import struct
import warnings

from scipy.stats import entropy
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.base import clone

from sklearn.ensemble import (
    RandomForestClassifier, 
    GradientBoostingClassifier, 
    AdaBoostClassifier
)
from sklearn.linear_model import (
    LogisticRegression, 
    RidgeClassifier, 
    ElasticNetCV
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import lightgbm




# Ignorer les warnings
warnings.filterwarnings("ignore")


OSError: dlopen(/Users/j.guertin/Library/Python/3.13/lib/python/site-packages/lightgbm/lib/lib_lightgbm.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib
  Referenced from: <D44045CD-B874-3A27-9A61-F131D99AACE4> /Users/j.guertin/Library/Python/3.13/lib/python/site-packages/lightgbm/lib/lib_lightgbm.dylib
  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file)

In [1]:
import pandas as pd

In [300]:
def generate_dataset(n_samples, n_features, n_classes, n_redundant,n_informative):
    return make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, 
                               n_redundant=n_redundant, n_clusters_per_class=1, random_state=42,n_informative=n_informative,flip_y=0.2,class_sep=0.5,shuffle=True)


In [301]:
def split_dataset(X, y, nb_labeled, test_ratio):
    limit = int(test_ratio * np.shape(X)[0])
    X_train, X_pool, y_train, y_pool = train_test_split(X, y, test_size=1 - nb_labeled / np.shape(X)[0], random_state=42)
    X_test, y_test = X_pool[:limit], y_pool[:limit]
    X_pool, y_pool = X_pool[limit:], y_pool[limit:]
    
    return X_train, X_pool, y_train, y_pool, X_test, y_test

In [302]:
# Fonction pour calculer l'incertitude
def calculate_uncertainty(probabilities, method="entropy"):
    if method == "entropy":
        return entropy(probabilities.T, base=2)
    elif method == "margin":
        sorted_probs = np.sort(probabilities, axis=1)
        return sorted_probs[:, -1] - sorted_probs[:, -2]
    elif method == "least_confident":
        return 1 - np.max(probabilities, axis=1)
    elif method == "random":
        return np.random.rand(probabilities.shape[0])
    else:
        raise ValueError("Méthode d'incertitude non reconnue")

In [303]:
def train_and_evaluate(X_train, y_train, X_test, y_test,model_class):
    model = model_class()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

In [304]:
def select_uncertain_samples(model, X_pool, method, batch_size):
    probabilities = model.predict_proba(X_pool)
    uncertainties = calculate_uncertainty(probabilities, method=method)
    if method == "margin":
        return np.argsort(uncertainties)[:batch_size]
    else:
        return np.argsort(uncertainties)[-batch_size:]

In [305]:
def update_labeled_unlabeled_sets(X_train, y_train, X_pool, y_pool, uncertain_indices):
    X_train = np.vstack((X_train, X_pool[uncertain_indices]))
    y_train = np.hstack((y_train, y_pool[uncertain_indices]))
    
    mask = np.ones(len(X_pool), dtype=bool)
    mask[uncertain_indices] = False
    X_pool, y_pool = X_pool[mask], y_pool[mask]
    
    return X_train, y_train, X_pool, y_pool

In [306]:
def hybrid_uncertainty(model, X_pool, w1=0.33, w2=0.33, w3=0.34):
    """
    Combine plusieurs mesures d'incertitude en une seule.
    
    Args:
        model: Le modèle de classification entraîné.
        X_pool: Les échantillons non labellisés.
        w1, w2, w3: Poids des trois mesures d'incertitude.
    
    Returns:
        Un tableau des scores d'incertitude combinés.
    """
    proba = model.predict_proba(X_pool)
    least_confident = 1 - np.max(proba, axis=1)
    
    sorted_proba = np.sort(proba, axis=1)
    margin = sorted_proba[:, -1] - sorted_proba[:, -2]
    
    entropy = -np.sum(proba * np.log(proba + 1e-10), axis=1)
    
    combined_score = w1 * least_confident + w2 * (1 - margin) + w3 * entropy
    return combined_score


In [307]:
def select_uncertain_samples_hybrid(model, X_pool, batch_size):
    """
    Sélectionne les échantillons les plus incertains selon la stratégie hybride.
    
    Args:
        model: Le modèle entraîné.
        X_pool: Les échantillons non labellisés.
        batch_size: Nombre d'échantillons à ajouter.
    
    Returns:
        Les indices des échantillons les plus incertains.
    """
    scores = hybrid_uncertainty(model, X_pool)
    return np.argsort(scores)[-batch_size:]  # Prendre les échantillons les plus incertains


In [308]:
def select_uncertain_samples_qbc(method, models, X_train, y_train, X_pool, batch_size):
    """
    Sélectionne les échantillons les plus incertains en utilisant le Query by Committee (QBC).

    Parameters:
    models : list
        Liste de modèles du comité
    X_pool : array-like, shape (n_samples, n_features)
        Les données non-labellisées
    batch_size : int
        Le nombre d'échantillons à sélectionner

    Returns:
    uncertain_indices : list
        Liste des indices des échantillons les plus incertains
    """

    # Entraîner chaque modèle du comité sur les données labellisées
    for model in models:
        model.fit(X_train, y_train)

    # Obtenir les prédictions de chaque modèle sur le pool de données non labellisées
    predictions = np.array([model.predict_proba(X_pool) for model in models])  # (n_models, n_samples, n_classes)

    if method == "qbc-variance":
        # Calcul de la variance des prédictions pour chaque échantillon
        uncertainty = np.var(predictions, axis=0)  # Calcul de la variance des probabilités
        uncertainty = np.mean(uncertainty, axis=1)  # Prendre la moyenne de la variance sur les classes

    elif method == "qbc-entropy":
        # Comptage des votes pour chaque classe
        n_models, n_samples, n_classes = predictions.shape
        vote_counts = np.zeros((n_samples, n_classes))

        for i in range(n_samples):
            for j in range(n_models):
                # Trouver la classe prédite par chaque modèle
                predicted_class = np.argmax(predictions[j, i, :])
                vote_counts[i, predicted_class] += 1

        # Calcul des probabilités de vote
        vote_probs = vote_counts / n_models

        # Calcul de l'entropie des votes pour chaque échantillon
        uncertainty = np.array([entropy(vote_probs[i, :], base=2) for i in range(n_samples)])

    elif method == "qbc-KL":
        # Calculer la probabilité moyenne (P_avg) pour chaque échantillon
        P_avg = np.mean(predictions, axis=0)  # Moyenne des prédictions des modèles (n_samples, n_classes)

        # Calculer la divergence KL pour chaque échantillon
        n_samples, n_classes = P_avg.shape
        uncertainty = np.zeros(n_samples)

        for i in range(n_samples):
            for model_preds in predictions[:, i, :]:
                uncertainty[i] += np.sum(model_preds * np.log(model_preds / P_avg[i, :]))

        # Sélectionner les indices des échantillons avec la plus grande divergence KL
        uncertain_indices = np.argsort(uncertainty)[-batch_size:]

    # Sélectionner les indices des échantillons avec la plus grande incertitude
    uncertain_indices = np.argsort(uncertainty)[-batch_size:]

    return uncertain_indices

In [309]:
#à terminer


def select_uncertain_samples_egl(model, X_pool, batch_size):
    """
    Sélectionne les échantillons les plus informatifs en utilisant la méthode EGL (Expected Gradient Length).
    """
    if not hasattr(model, "coef_"):
        raise ValueError("Le modèle doit être une régression logistique ou un autre modèle basé sur des gradients.")
    
    gradients = []
    
    # Obtenir les probabilités de prédiction
    probs = model.predict_proba(X_pool)  # Probabilités pour chaque classe
    for i, x in enumerate(X_pool):
        #x = x.reshape(1,-1)  # Mise en forme correcte de l'entrée
        expected_grad = 0

        for class_idx in range(probs.shape[1]):  # Parcourir chaque classe
            prob = probs[i, class_idx]
            y_dummy = np.zeros((1, probs.shape[1]))  # Créer un vecteur one-hot
            y_dummy[0, class_idx] = 1  # Placer la classe courante
            
            try:
                # Calcul du gradient pour cette classe
                model.fit(x, y_dummy.argmax(axis=1))  # Ajustement temporaire
                grad = np.linalg.norm(model.coef_)  # Norme du gradient
                print(grad)
                expected_grad += prob * grad  # Calcul de l'espérance
            except ValueError:
                # Si une erreur survient, on passe à la classe suivante
                continue
        
        gradients.append(expected_grad)
    
    # Sélection des indices des plus grandes valeurs de gradient
    uncertain_indices = np.argsort(gradients)[-batch_size:]
    return uncertain_indices


In [310]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_information_density(X_pool, similarity_metric='cosine'):
    """
    Calcule la densité d'information pour chaque point du pool en fonction de la similarité avec les autres points.
    
    Paramètres:
        X_pool (numpy.ndarray): Ensemble non labellisé de taille (n_samples, n_features).
        similarity_metric (str): Type de mesure de similarité ('cosine' ou 'euclidean').
    
    Retourne:
        numpy.ndarray: Score de densité pour chaque instance.
    """
    if similarity_metric == 'cosine':
        similarity_matrix = cosine_similarity(X_pool)
    elif similarity_metric == 'euclidean':
        distance_matrix = np.linalg.norm(X_pool[:, np.newaxis] - X_pool, axis=2)
        similarity_matrix = 1 / (1 + distance_matrix)  # Conversion en similarité
    else:
        raise ValueError("Metric non supportée. Utilisez 'cosine' ou 'euclidean'.")
    
    # Moyenne des similarités de chaque point avec tous les autres
    density_scores = np.mean(similarity_matrix, axis=1)
    print(density_scores)
    return density_scores

def select_uncertain_samples_density(model, X_pool, batch_size, similarity_metric='cosine'):
    """
    Sélectionne les échantillons en combinant incertitude et densité d'information.
    
    Paramètres:
        model (sklearn model): Modèle entraîné utilisé pour l'incertitude.
        X_pool (numpy.ndarray): Ensemble non labellisé de taille (n_samples, n_features).
        batch_size (int): Nombre d'échantillons à sélectionner.
        similarity_metric (str): Type de mesure de similarité ('cosine' ou 'euclidean').
    
    Retourne:
        list: Indices des échantillons sélectionnés.
    """
    # 1. Calcul de l'incertitude (ex: entropie prédictive)
    probas = model.predict_proba(X_pool)
    uncertainty = -np.sum(probas * np.log(probas + 1e-10), axis=1)  # Évite log(0)
    
    # 2. Calcul de la densité d'information
    density_scores = compute_information_density(X_pool, similarity_metric)
    
    # 3. Combinaison des scores : pondération entre incertitude et densité
    combined_scores = uncertainty * density_scores
    # 4. Sélection des indices les plus informatifs
    selected_indices = np.argsort(combined_scores)[-batch_size:]
    return selected_indices.tolist()


In [311]:
def run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, 
                                   batch_size, methods, model_class,models):
    #fig = px.scatter(x=X[:, 0], y=X[:, 1], color=y.astype(str), title="Dataset Initial",labels={"color": "Classe"})
    #fig.show()

    X_train, X_pool, y_train, y_pool, X_test, y_test = split_dataset(X, y, n_labeled, test_ratio)
    
    # Affichage des tailles des ensembles
    print(f"Taille de l'ensemble total: {np.shape(X)[0]}")
    print(f"Taille de l'ensemble labellisé: {np.shape(X_train)[0]}")
    print(f"Taille de l'ensemble non-labellisé: {np.shape(X_pool)[0]}")
    print(f"Taille de l'ensemble de test: {np.shape(X_test)[0]}")
    print(f"Nb d'itérations: {n_iterations}")
    print(f"Nb de données labellisées en plus à chaque itération: {batch_size}")

    # === Initialisation des variables ===
    accuracies = {method: [] for method in methods}
    final_accuracies = {}

    # Évaluation initiale
    model = model_class()
    y_pred_initial = model.fit(X_train, y_train).predict(X)
    initial_accuracy = accuracy_score(y, y_pred_initial)

    # === Boucle principale d'Active Learning ===
    for method in methods:
        X_train_temp, y_train_temp = X_train.copy(), y_train.copy()
        X_pool_temp, y_pool_temp = X_pool.copy(), y_pool.copy()
        for i in range(n_iterations):
            model, acc = train_and_evaluate(X_train_temp, y_train_temp, X_test, y_test, model_class)
            #print(np.shape(X_train_temp), np.shape(X_pool_temp), np.shape(y_pool_temp), np.shape(X_test), np.shape(y_test))
            accuracies[method].append(acc)
            if method in ["qbc-variance","qbc-entropy","qbc-KL"]:
                
                uncertain_indices = select_uncertain_samples_qbc(method,models,X_train_temp,y_train_temp, X_pool_temp, batch_size)
            # Entraînement et évaluation du modèle
            # Sélection des indices incertains en fonction de la méthode
            elif method == "hybrid":
                uncertain_indices = select_uncertain_samples_hybrid(model, X_pool_temp, batch_size) 
                
            elif method == "EGL":
                uncertain_indices = select_uncertain_samples_egl(model, X_pool_temp, batch_size)
            
            elif method in ["least_confident","margin","entropy"]:
                uncertain_indices = select_uncertain_samples(model, X_pool_temp, method, batch_size)
            elif method == "random":
                # Sélectionner des indices au hasard parmi les indices de l'ensemble non labellisé
                uncertain_indices = np.random.choice(len(X_pool_temp), size=batch_size, replace=False)
            elif method == "density":
                uncertain_indices= select_uncertain_samples_density(model, X_pool_temp, batch_size, similarity_metric='cosine')            
            
            # Mise à jour des ensembles labellisés et non-labellisés
            X_train_temp, y_train_temp, X_pool_temp, y_pool_temp = update_labeled_unlabeled_sets(
                X_train_temp, y_train_temp, X_pool_temp, y_pool_temp, uncertain_indices
            )
            
            print(f"{method} - Iteration {i+1}: {len(X_train_temp)} samples labeled, Accuracy: {acc:.4f}")
            
        # Calcul de la précision finale
        
        final_accuracies[method] = accuracy_score(y_test, model.predict(X_test))
        
        print(f"Final Accuracy ({method}): {final_accuracies[method]:.4f}")

    # === Affichage des résultats ===
    # 1. Graphique de l'évolution de l'accuracy
    fig = px.line(title="Évolution de l'accuracy au fil des itérations d'Active Learning")
    for method in methods:
        fig.add_scatter(x=list(range(1, n_iterations + 1)), y=accuracies[method], mode='lines+markers', name=method)
    fig.update_layout(xaxis_title="Iteration", yaxis_title="Accuracy")
    fig.show()

    # 2. Comparaison des accuracy finales
    fig = px.bar(x=methods, y=[final_accuracies[m] for m in methods], labels={"x": "Méthode", "y": "Final Accuracy"},
                 title="Comparaison des Accuracy Finales")
    fig.update_layout(yaxis=dict(range=[min(final_accuracies.values()) - 0.01, max(final_accuracies.values()) + 0.01]))
    fig.show()

    # 3. Comparaison de l'amélioration de l'accuracy
    accuracy_improvements = {method: final_accuracies[method] - initial_accuracy for method in methods}
    fig = px.bar(x=methods, y=[accuracy_improvements[m] for m in methods], labels={"x": "Méthode", "y": "Amélioration d'Accuracy"},
                 title="Amélioration de l'Accuracy sur tout le dataset")
    fig.update_layout(yaxis=dict(range=[min(accuracy_improvements.values()) - 0.01, max(accuracy_improvements.values()) + 0.01]))
    fig.show()

    print(f"Initial Accuracy: {initial_accuracy:.4f}")
    for method in methods:
        print(f"Accuracy Improvement ({method}): {accuracy_improvements[method]:.4f}")
        
    
    return final_accuracies


Sur MNIST

In [312]:
def read_images(filename):
    with open(filename, 'rb') as file:
        magic_number, nb_images, nb_rows, nb_cols = struct.unpack('>IIII', file.read(16))
        assert magic_number == 2051, "Wrong file"
        image_data = np.fromfile(file, dtype=np.uint8).reshape(nb_images, nb_rows, nb_cols)
    return image_data

def read_targets(filename):
    with open(filename, 'rb') as file:
        magic_number, nb_items = struct.unpack('>II', file.read(8))
        assert magic_number == 2049, "Wrong file"
        targets = np.fromfile(file, dtype=np.uint8)
    return targets

In [313]:
path_images_file = '../t10k-images.idx3-ubyte'
path_to_targets = '../t10k-labels.idx1-ubyte'

In [314]:
X = read_images(path_images_file).reshape(-1, 28*28)
y = read_targets(path_to_targets)

print(np.shape(X),np.shape(y))

(10000, 784) (10000,)


In [315]:

n_labeled = 600  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 50  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy","hybrid"] 
model_class=lambda : RandomForestClassifier()


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class, models)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 600
Taille de l'ensemble non-labellisé: 7400
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 50
random - Iteration 1: 650 samples labeled, Accuracy: 0.8680
random - Iteration 2: 700 samples labeled, Accuracy: 0.8730
random - Iteration 3: 750 samples labeled, Accuracy: 0.8765
random - Iteration 4: 800 samples labeled, Accuracy: 0.8815
random - Iteration 5: 850 samples labeled, Accuracy: 0.8730
random - Iteration 6: 900 samples labeled, Accuracy: 0.8900
random - Iteration 7: 950 samples labeled, Accuracy: 0.8915
random - Iteration 8: 1000 samples labeled, Accuracy: 0.8950
random - Iteration 9: 1050 samples labeled, Accuracy: 0.8980
random - Iteration 10: 1100 samples labeled, Accuracy: 0.9050
random - Iteration 11: 1150 samples labeled, Accuracy: 0.9020
random - Iteration 12: 1200 samples labeled, Accuracy: 0.9045
random - Iteration 13: 1250 samples labeled, Accu

Initial Accuracy: 0.8672
Accuracy Improvement (random): 0.0633
Accuracy Improvement (least_confident): 0.0883
Accuracy Improvement (margin): 0.0928
Accuracy Improvement (entropy): 0.0843
Accuracy Improvement (hybrid): 0.0878


{'random': 0.9305,
 'least_confident': 0.9555,
 'margin': 0.96,
 'entropy': 0.9515,
 'hybrid': 0.955}

In [323]:

n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 10  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy","hybrid"] 
model_class=lambda : RandomForestClassifier()


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class,models)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 100
Taille de l'ensemble non-labellisé: 7900
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 10
random - Iteration 1: 110 samples labeled, Accuracy: 0.7000
random - Iteration 2: 120 samples labeled, Accuracy: 0.7005
random - Iteration 3: 130 samples labeled, Accuracy: 0.7170
random - Iteration 4: 140 samples labeled, Accuracy: 0.7020
random - Iteration 5: 150 samples labeled, Accuracy: 0.7415
random - Iteration 6: 160 samples labeled, Accuracy: 0.7500
random - Iteration 7: 170 samples labeled, Accuracy: 0.7470
random - Iteration 8: 180 samples labeled, Accuracy: 0.7765
random - Iteration 9: 190 samples labeled, Accuracy: 0.7825
random - Iteration 10: 200 samples labeled, Accuracy: 0.7990
random - Iteration 11: 210 samples labeled, Accuracy: 0.8055
random - Iteration 12: 220 samples labeled, Accuracy: 0.8040
random - Iteration 13: 230 samples labeled, Accuracy: 

Initial Accuracy: 0.6732
Accuracy Improvement (random): 0.2038
Accuracy Improvement (least_confident): 0.2138
Accuracy Improvement (margin): 0.2398
Accuracy Improvement (entropy): 0.2098
Accuracy Improvement (hybrid): 0.2223


{'random': 0.877,
 'least_confident': 0.887,
 'margin': 0.913,
 'entropy': 0.883,
 'hybrid': 0.8955}

In [324]:

n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 18  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy","hybrid"] 
model_class=lambda : RandomForestClassifier()


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class,models)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 100
Taille de l'ensemble non-labellisé: 7900
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 18
random - Iteration 1: 118 samples labeled, Accuracy: 0.6820
random - Iteration 2: 136 samples labeled, Accuracy: 0.7030
random - Iteration 3: 154 samples labeled, Accuracy: 0.7555
random - Iteration 4: 172 samples labeled, Accuracy: 0.7600
random - Iteration 5: 190 samples labeled, Accuracy: 0.7710
random - Iteration 6: 208 samples labeled, Accuracy: 0.7965
random - Iteration 7: 226 samples labeled, Accuracy: 0.7950
random - Iteration 8: 244 samples labeled, Accuracy: 0.8045
random - Iteration 9: 262 samples labeled, Accuracy: 0.8285
random - Iteration 10: 280 samples labeled, Accuracy: 0.8375
random - Iteration 11: 298 samples labeled, Accuracy: 0.8355
random - Iteration 12: 316 samples labeled, Accuracy: 0.8425
random - Iteration 13: 334 samples labeled, Accuracy: 

Initial Accuracy: 0.6824
Accuracy Improvement (random): 0.2206
Accuracy Improvement (least_confident): 0.2386
Accuracy Improvement (margin): 0.2521
Accuracy Improvement (entropy): 0.2381
Accuracy Improvement (hybrid): 0.2471


{'random': 0.903,
 'least_confident': 0.921,
 'margin': 0.9345,
 'entropy': 0.9205,
 'hybrid': 0.9295}

In [318]:

n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 18  # Taille du batch d'échantillons ajoutés à chaque itération
#methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-variance","qbc-entropy","qbc-KL"]
methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-variance","qbc-entropy","qbc-KL"]
model_class=lambda : RandomForestClassifier()
# Créer un comité avec 3 modèles différents
# Liste de modèles pour le comité
models = [
    clone(RandomForestClassifier()),  # Random Forest
    clone(LogisticRegression(max_iter=1000)),                     # Régression Logistique
    clone(SVC(probability=True)),                   # Régression Ridge pour classificati)           # Analyse discriminante quadratique
]                # SVM avec probabilités
        # Analyse discriminante quadratique


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class,models)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 100
Taille de l'ensemble non-labellisé: 7900
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 18
random - Iteration 1: 118 samples labeled, Accuracy: 0.6855
random - Iteration 2: 136 samples labeled, Accuracy: 0.7240
random - Iteration 3: 154 samples labeled, Accuracy: 0.7455
random - Iteration 4: 172 samples labeled, Accuracy: 0.7920
random - Iteration 5: 190 samples labeled, Accuracy: 0.7945
random - Iteration 6: 208 samples labeled, Accuracy: 0.8080
random - Iteration 7: 226 samples labeled, Accuracy: 0.8180
random - Iteration 8: 244 samples labeled, Accuracy: 0.8120
random - Iteration 9: 262 samples labeled, Accuracy: 0.8185
random - Iteration 10: 280 samples labeled, Accuracy: 0.8205
random - Iteration 11: 298 samples labeled, Accuracy: 0.8365
random - Iteration 12: 316 samples labeled, Accuracy: 0.8465
random - Iteration 13: 334 samples labeled, Accuracy: 

Initial Accuracy: 0.6802
Accuracy Improvement (random): 0.2258
Accuracy Improvement (least_confident): 0.2333
Accuracy Improvement (margin): 0.2603
Accuracy Improvement (entropy): 0.2328
Accuracy Improvement (hybrid): 0.2493
Accuracy Improvement (qbc-variance): 0.2223
Accuracy Improvement (qbc-entropy): 0.2533
Accuracy Improvement (qbc-KL): 0.1848


{'random': 0.906,
 'least_confident': 0.9135,
 'margin': 0.9405,
 'entropy': 0.913,
 'hybrid': 0.9295,
 'qbc-variance': 0.9025,
 'qbc-entropy': 0.9335,
 'qbc-KL': 0.865}

In [319]:

n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 18  # Taille du batch d'échantillons ajoutés à chaque itération
#methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-variance","qbc-entropy","qbc-KL"]
methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-variance","qbc-entropy","qbc-KL"]
model_class=lambda : RandomForestClassifier()
# Créer un comité avec 3 modèles différents
# Liste de modèles pour le comité
models = [
    clone(RandomForestClassifier()),          # Random Forest
    clone(GradientBoostingClassifier()),      # Gradient Boosting
    clone(AdaBoostClassifier()),              # AdaBoost
    clone(LogisticRegression(max_iter=1000)), # Régression Logistique
    clone(SVC(probability=True)),             # Support Vector Classifier
    clone(KNeighborsClassifier()),            # K-Nearest Neighbors
    clone(GaussianNB()),                      # Naive Bayes
    clone(DecisionTreeClassifier()),          # Decision Tree
    clone(MLPClassifier(max_iter=1000))
]

# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class,models)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 100
Taille de l'ensemble non-labellisé: 7900
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 18
random - Iteration 1: 118 samples labeled, Accuracy: 0.6840
random - Iteration 2: 136 samples labeled, Accuracy: 0.7250
random - Iteration 3: 154 samples labeled, Accuracy: 0.7260
random - Iteration 4: 172 samples labeled, Accuracy: 0.7555
random - Iteration 5: 190 samples labeled, Accuracy: 0.7515
random - Iteration 6: 208 samples labeled, Accuracy: 0.7910
random - Iteration 7: 226 samples labeled, Accuracy: 0.8070
random - Iteration 8: 244 samples labeled, Accuracy: 0.7975
random - Iteration 9: 262 samples labeled, Accuracy: 0.8130
random - Iteration 10: 280 samples labeled, Accuracy: 0.8340
random - Iteration 11: 298 samples labeled, Accuracy: 0.8250
random - Iteration 12: 316 samples labeled, Accuracy: 0.8325
random - Iteration 13: 334 samples labeled, Accuracy: 

Initial Accuracy: 0.6734
Accuracy Improvement (random): 0.2281
Accuracy Improvement (least_confident): 0.2516
Accuracy Improvement (margin): 0.2676
Accuracy Improvement (entropy): 0.2361
Accuracy Improvement (hybrid): 0.2536
Accuracy Improvement (qbc-variance): 0.2086
Accuracy Improvement (qbc-entropy): 0.2546
Accuracy Improvement (qbc-KL): 0.2251


{'random': 0.9015,
 'least_confident': 0.925,
 'margin': 0.941,
 'entropy': 0.9095,
 'hybrid': 0.927,
 'qbc-variance': 0.882,
 'qbc-entropy': 0.928,
 'qbc-KL': 0.8985}

In [253]:
# Définir les paramètres
batch_sizes = [10, 20, 30, 40]  # Liste de tailles de batchs à tester
n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-entropy"] # Méthodes d'Active Learning
model_class = lambda : RandomForestClassifier()  # Classe du modèle à utiliser

# Créer un comité avec 3 modèles différents
models = [
    clone(RandomForestClassifier()),  # Random Forest
    clone(LogisticRegression(max_iter=1000)),  # Régression Logistique
    clone(SVC(probability=True)),  # SVM avec probabilités
]

# Dictionnaire pour stocker les résultats de l'accuracy par modèle et batch_size
results = {batch_size: {method: [] for method in methods} for batch_size in batch_sizes}

# Exécuter l'expérience sur chaque taille de batch
for batch_size in batch_sizes:
    print(f"Running experiment for batch size {batch_size}...")
    
    final_accuracies = run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, 
                                                      batch_size, methods, model_class, models)
    for method in methods:
        results[batch_size][method].append(final_accuracies[method])
        

# Préparer les données sous forme de DataFrame pour Plotly
data = []
for batch_size in batch_sizes:
    for method in methods:
        data.append({"Batch Size": batch_size, "Method": method, "Final Accuracy": results[batch_size][method][0]})

df = pd.DataFrame(data)

# Tracer les résultats avec un graphique en ligne
fig = px.line(df, 
              x="Batch Size", 
              y="Final Accuracy", 
              color="Method", 
              markers=True,
              title="Final Accuracy vs Batch Size for Different Methods")

fig.update_layout(xaxis_title="Batch Size", yaxis_title="Final Accuracy")
fig.show()

Running experiment for batch size 10...
Running experiment for batch size 20...
Running experiment for batch size 30...
Running experiment for batch size 40...


In [320]:

n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50 # Nombre d'itérations d'Active Learning
batch_size = 18  # Taille du batch d'échantillons ajoutés à chaque itération
#methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-variance","qbc-entropy","qbc-KL"]
methods=["random", "least_confident", "margin", "entropy","density"]
model_class=lambda : RandomForestClassifier()
# Créer un comité avec 3 modèles différents
# Liste de modèles pour le comité
models = [
    clone(RandomForestClassifier()),  # Random Forest
    clone(LogisticRegression(max_iter=1000)),                     # Régression Logistique
    clone(SVC(probability=True)),                   # Régression Ridge pour classificati)           # Analyse discriminante quadratique
]                # SVM avec probabilités
        # Analyse discriminante quadratique


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class,models)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 100
Taille de l'ensemble non-labellisé: 7900
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 18
random - Iteration 1: 118 samples labeled, Accuracy: 0.7020
random - Iteration 2: 136 samples labeled, Accuracy: 0.7270
random - Iteration 3: 154 samples labeled, Accuracy: 0.7415
random - Iteration 4: 172 samples labeled, Accuracy: 0.7665
random - Iteration 5: 190 samples labeled, Accuracy: 0.7695
random - Iteration 6: 208 samples labeled, Accuracy: 0.7795
random - Iteration 7: 226 samples labeled, Accuracy: 0.7955
random - Iteration 8: 244 samples labeled, Accuracy: 0.7895
random - Iteration 9: 262 samples labeled, Accuracy: 0.8015
random - Iteration 10: 280 samples labeled, Accuracy: 0.8055
random - Iteration 11: 298 samples labeled, Accuracy: 0.8345
random - Iteration 12: 316 samples labeled, Accuracy: 0.8335
random - Iteration 13: 334 samples labeled, Accuracy: 

Initial Accuracy: 0.6723
Accuracy Improvement (random): 0.2172
Accuracy Improvement (least_confident): 0.2512
Accuracy Improvement (margin): 0.2627
Accuracy Improvement (entropy): 0.2417
Accuracy Improvement (density): 0.2092


{'random': 0.8895,
 'least_confident': 0.9235,
 'margin': 0.935,
 'entropy': 0.914,
 'density': 0.8815}

In [322]:

n_labeled = 2  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 998 # Nombre d'itérations d'Active Learning
batch_size = 1  # Taille du batch d'échantillons ajoutés à chaque itération
#methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-variance","qbc-entropy","qbc-KL"]
methods=[ "random", "least_confident", "margin", "entropy","hybrid"]
model_class=lambda : RandomForestClassifier()
# Créer un comité avec 3 modèles différents
# Liste de modèles pour le comité
models = [
    clone(RandomForestClassifier()),  # Random Forest
    clone(LogisticRegression(max_iter=1000)),                     # Régression Logistique
    clone(SVC(probability=True)),                   # Régression Ridge pour classificati)           # Analyse discriminante quadratique
]                # SVM avec probabilités
        # Analyse discriminante quadratique


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class,models)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 2
Taille de l'ensemble non-labellisé: 7998
Taille de l'ensemble de test: 2000
Nb d'itérations: 998
Nb de données labellisées en plus à chaque itération: 1
random - Iteration 1: 3 samples labeled, Accuracy: 0.1255
random - Iteration 2: 4 samples labeled, Accuracy: 0.1335
random - Iteration 3: 5 samples labeled, Accuracy: 0.1500
random - Iteration 4: 6 samples labeled, Accuracy: 0.1370
random - Iteration 5: 7 samples labeled, Accuracy: 0.1315
random - Iteration 6: 8 samples labeled, Accuracy: 0.1685
random - Iteration 7: 9 samples labeled, Accuracy: 0.1830
random - Iteration 8: 10 samples labeled, Accuracy: 0.2315
random - Iteration 9: 11 samples labeled, Accuracy: 0.2265
random - Iteration 10: 12 samples labeled, Accuracy: 0.2765
random - Iteration 11: 13 samples labeled, Accuracy: 0.2765
random - Iteration 12: 14 samples labeled, Accuracy: 0.2815
random - Iteration 13: 15 samples labeled, Accuracy: 0.2900
random - Iterat

Initial Accuracy: 0.1601
Accuracy Improvement (random): 0.7449
Accuracy Improvement (least_confident): 0.7514
Accuracy Improvement (margin): 0.7769
Accuracy Improvement (entropy): 0.7529
Accuracy Improvement (hybrid): 0.7634


{'random': 0.905,
 'least_confident': 0.9115,
 'margin': 0.937,
 'entropy': 0.913,
 'hybrid': 0.9235}