In [123]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.stats import entropy
from sklearn.ensemble import RandomForestClassifier
import plotly.express as px
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import clone
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import struct
from scipy.stats import entropy
import warnings
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, ElasticNetCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


# Ignorer les warnings
warnings.filterwarnings("ignore")


In [2]:
def generate_dataset(n_samples, n_features, n_classes, n_redundant,n_informative):
    return make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, 
                               n_redundant=n_redundant, n_clusters_per_class=1, random_state=42,n_informative=n_informative,flip_y=0.2,class_sep=0.5,shuffle=True)




In [3]:
def split_dataset(X, y, nb_labeled, test_ratio):
    limit = int(test_ratio * np.shape(X)[0])
    X_train, X_pool, y_train, y_pool = train_test_split(X, y, test_size=1 - nb_labeled / np.shape(X)[0], random_state=42)
    X_test, y_test = X_pool[:limit], y_pool[:limit]
    X_pool, y_pool = X_pool[limit:], y_pool[limit:]
    
    return X_train, X_pool, y_train, y_pool, X_test, y_test

In [4]:
# Fonction pour calculer l'incertitude
def calculate_uncertainty(probabilities, method="entropy"):
    if method == "entropy":
        return entropy(probabilities.T, base=2)
    elif method == "margin":
        sorted_probs = np.sort(probabilities, axis=1)
        return sorted_probs[:, -1] - sorted_probs[:, -2]
    elif method == "least_confident":
        return 1 - np.max(probabilities, axis=1)
    elif method == "random":
        return np.random.rand(probabilities.shape[0])
    else:
        raise ValueError("Méthode d'incertitude non reconnue")

In [5]:
def train_and_evaluate(X_train, y_train, X_test, y_test,model_class):
    model = model_class()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

In [6]:
def select_uncertain_samples(model, X_pool, method, batch_size):
    probabilities = model.predict_proba(X_pool)
    uncertainties = calculate_uncertainty(probabilities, method=method)
    if method == "margin":
        return np.argsort(uncertainties)[:batch_size]
    else:
        return np.argsort(uncertainties)[-batch_size:]

In [7]:
def update_labeled_unlabeled_sets(X_train, y_train, X_pool, y_pool, uncertain_indices):
    X_train = np.vstack((X_train, X_pool[uncertain_indices]))
    y_train = np.hstack((y_train, y_pool[uncertain_indices]))
    
    mask = np.ones(len(X_pool), dtype=bool)
    mask[uncertain_indices] = False
    X_pool, y_pool = X_pool[mask], y_pool[mask]
    
    return X_train, y_train, X_pool, y_pool

In [8]:
def hybrid_uncertainty(model, X_pool, w1=0.33, w2=0.33, w3=0.34):
    """
    Combine plusieurs mesures d'incertitude en une seule.
    
    Args:
        model: Le modèle de classification entraîné.
        X_pool: Les échantillons non labellisés.
        w1, w2, w3: Poids des trois mesures d'incertitude.
    
    Returns:
        Un tableau des scores d'incertitude combinés.
    """
    proba = model.predict_proba(X_pool)
    least_confident = 1 - np.max(proba, axis=1)
    
    sorted_proba = np.sort(proba, axis=1)
    margin = sorted_proba[:, -1] - sorted_proba[:, -2]
    
    entropy = -np.sum(proba * np.log(proba + 1e-10), axis=1)
    
    combined_score = w1 * least_confident + w2 * (1 - margin) + w3 * entropy
    return combined_score


In [9]:
def select_uncertain_samples_hybrid(model, X_pool, batch_size):
    """
    Sélectionne les échantillons les plus incertains selon la stratégie hybride.
    
    Args:
        model: Le modèle entraîné.
        X_pool: Les échantillons non labellisés.
        batch_size: Nombre d'échantillons à ajouter.
    
    Returns:
        Les indices des échantillons les plus incertains.
    """
    scores = hybrid_uncertainty(model, X_pool)
    return np.argsort(scores)[-batch_size:]  # Prendre les échantillons les plus incertains


In [111]:
def select_uncertain_samples_qbc(method, models, X_train, y_train, X_pool, batch_size):
    """
    Sélectionne les échantillons les plus incertains en utilisant le Query by Committee (QBC).

    Parameters:
    models : list
        Liste de modèles du comité
    X_pool : array-like, shape (n_samples, n_features)
        Les données non-labellisées
    batch_size : int
        Le nombre d'échantillons à sélectionner

    Returns:
    uncertain_indices : list
        Liste des indices des échantillons les plus incertains
    """

    # Entraîner chaque modèle du comité sur les données labellisées
    for model in models:
        model.fit(X_train, y_train)

    # Obtenir les prédictions de chaque modèle sur le pool de données non labellisées
    predictions = np.array([model.predict_proba(X_pool) for model in models])  # (n_models, n_samples, n_classes)

    if method == "qbc-variance":
        # Calcul de la variance des prédictions pour chaque échantillon
        uncertainty = np.var(predictions, axis=0)  # Calcul de la variance des probabilités
        uncertainty = np.mean(uncertainty, axis=1)  # Prendre la moyenne de la variance sur les classes

    elif method == "qbc-entropy":
        # Comptage des votes pour chaque classe
        n_models, n_samples, n_classes = predictions.shape
        vote_counts = np.zeros((n_samples, n_classes))

        for i in range(n_samples):
            for j in range(n_models):
                # Trouver la classe prédite par chaque modèle
                predicted_class = np.argmax(predictions[j, i, :])
                vote_counts[i, predicted_class] += 1

        # Calcul des probabilités de vote
        vote_probs = vote_counts / n_models

        # Calcul de l'entropie des votes pour chaque échantillon
        uncertainty = np.array([entropy(vote_probs[i, :], base=2) for i in range(n_samples)])

    elif method == "qbc-KL":
        # Calculer la probabilité moyenne (P_avg) pour chaque échantillon
        P_avg = np.mean(predictions, axis=0)  # Moyenne des prédictions des modèles (n_samples, n_classes)

        # Calculer la divergence KL pour chaque échantillon
        n_samples, n_classes = P_avg.shape
        uncertainty = np.zeros(n_samples)

        for i in range(n_samples):
            for model_preds in predictions[:, i, :]:
                uncertainty[i] += np.sum(model_preds * np.log(model_preds / P_avg[i, :]))

        # Sélectionner les indices des échantillons avec la plus grande divergence KL
        uncertain_indices = np.argsort(uncertainty)[-batch_size:]

    # Sélectionner les indices des échantillons avec la plus grande incertitude
    uncertain_indices = np.argsort(uncertainty)[-batch_size:]

    return uncertain_indices

In [119]:
def run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, 
                                   batch_size, methods, model_class,models):
    #fig = px.scatter(x=X[:, 0], y=X[:, 1], color=y.astype(str), title="Dataset Initial",labels={"color": "Classe"})
    #fig.show()

    X_train, X_pool, y_train, y_pool, X_test, y_test = split_dataset(X, y, n_labeled, test_ratio)

    # Affichage des tailles des ensembles
    print(f"Taille de l'ensemble total: {np.shape(X)[0]}")
    print(f"Taille de l'ensemble labellisé: {np.shape(X_train)[0]}")
    print(f"Taille de l'ensemble non-labellisé: {np.shape(X_pool)[0]}")
    print(f"Taille de l'ensemble de test: {np.shape(X_test)[0]}")
    print(f"Nb d'itérations: {n_iterations}")
    print(f"Nb de données labellisées en plus à chaque itération: {batch_size}")

    # === Initialisation des variables ===
    accuracies = {method: [] for method in methods}
    final_accuracies = {}

    # Évaluation initiale
    model = model_class()
    y_pred_initial = model.fit(X_train, y_train).predict(X)
    initial_accuracy = accuracy_score(y, y_pred_initial)

    # === Boucle principale d'Active Learning ===
    for method in methods:
        X_train_temp, y_train_temp = X_train.copy(), y_train.copy()
        X_pool_temp, y_pool_temp = X_pool.copy(), y_pool.copy()
        for i in range(n_iterations):
            
            model, acc = train_and_evaluate(X_train_temp, y_train_temp, X_test, y_test, model_class)
            accuracies[method].append(acc)
            if method in ["qbc-variance","qbc-entropy","qbc-KL"]:
                
                uncertain_indices = select_uncertain_samples_qbc(method,models,X_train_temp,y_train_temp, X_pool_temp, batch_size)
            # Entraînement et évaluation du modèle
            # Sélection des indices incertains en fonction de la méthode
            elif method == "hybrid":
                uncertain_indices = select_uncertain_samples_hybrid(model, X_pool_temp, batch_size) 
            
            else:
                uncertain_indices = select_uncertain_samples(model, X_pool_temp, method, batch_size)
            # Mise à jour des ensembles labellisés et non-labellisés
            X_train_temp, y_train_temp, X_pool_temp, y_pool_temp = update_labeled_unlabeled_sets(
                X_train_temp, y_train_temp, X_pool_temp, y_pool_temp, uncertain_indices
            )

            print(f"{method} - Iteration {i+1}: {len(X_train_temp)} samples labeled, Accuracy: {acc:.4f}")

        # Calcul de la précision finale
        final_accuracies[method] = accuracy_score(y_test, model.predict(X_test))
        print(f"Final Accuracy ({method}): {final_accuracies[method]:.4f}")

    # === Affichage des résultats ===
    # 1. Graphique de l'évolution de l'accuracy
    fig = px.line(title="Évolution de l'accuracy au fil des itérations d'Active Learning")
    for method in methods:
        fig.add_scatter(x=list(range(1, n_iterations + 1)), y=accuracies[method], mode='lines+markers', name=method)
    fig.update_layout(xaxis_title="Iteration", yaxis_title="Accuracy")
    fig.show()

    # 2. Comparaison des accuracy finales
    fig = px.bar(x=methods, y=[final_accuracies[m] for m in methods], labels={"x": "Méthode", "y": "Final Accuracy"},
                 title="Comparaison des Accuracy Finales")
    fig.update_layout(yaxis=dict(range=[min(final_accuracies.values()) - 0.01, max(final_accuracies.values()) + 0.01]))
    fig.show()

    # 3. Comparaison de l'amélioration de l'accuracy
    accuracy_improvements = {method: final_accuracies[method] - initial_accuracy for method in methods}
    fig = px.bar(x=methods, y=[accuracy_improvements[m] for m in methods], labels={"x": "Méthode", "y": "Amélioration d'Accuracy"},
                 title="Amélioration de l'Accuracy sur tout le dataset")
    fig.update_layout(yaxis=dict(range=[min(accuracy_improvements.values()) - 0.01, max(accuracy_improvements.values()) + 0.01]))
    fig.show()

    print(f"Initial Accuracy: {initial_accuracy:.4f}")
    for method in methods:
        print(f"Accuracy Improvement ({method}): {accuracy_improvements[method]:.4f}")


Ne pas toucher : random dataset

In [53]:
# === Paramètres ===
n_samples = 100000  # Nombre total d'échantillons
n_features = 10  # Nombre de features
n_classes = 2  # Nombre de classes
n_redundant = 0  # Nombre de features redondantes
n_informative=2
n_labeled = 600  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = int(n_samples / 5000)  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy","hybrid"]  # Méthodes d'incertitude
model_class=lambda : RandomForestClassifier()


run_active_learning_experiment(n_samples, n_features, n_classes, n_redundant, n_informative, n_labeled, test_ratio, n_iterations, batch_size, methods,model_class)



Taille de l'ensemble total: 100000
Taille de l'ensemble labellisé: 600
Taille de l'ensemble non-labellisé: 79400
Taille de l'ensemble de test: 20000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 20
random - Iteration 1: 620 samples labeled, Accuracy: 0.9364
random - Iteration 2: 640 samples labeled, Accuracy: 0.9387
random - Iteration 3: 660 samples labeled, Accuracy: 0.9422
random - Iteration 4: 680 samples labeled, Accuracy: 0.9357
random - Iteration 5: 700 samples labeled, Accuracy: 0.9386
random - Iteration 6: 720 samples labeled, Accuracy: 0.9379
random - Iteration 7: 740 samples labeled, Accuracy: 0.9376
random - Iteration 8: 760 samples labeled, Accuracy: 0.9344
random - Iteration 9: 780 samples labeled, Accuracy: 0.9364
random - Iteration 10: 800 samples labeled, Accuracy: 0.9366
random - Iteration 11: 820 samples labeled, Accuracy: 0.9345
random - Iteration 12: 840 samples labeled, Accuracy: 0.9377
random - Iteration 13: 860 samples labeled, Accurac

Initial Accuracy: 0.9348
Accuracy Improvement (random): 0.0231
Accuracy Improvement (least_confident): 0.0312
Accuracy Improvement (margin): 0.0316
Accuracy Improvement (entropy): 0.0336
Accuracy Improvement (hybrid): 0.0324


In [54]:
# === Paramètres ===
n_samples = 100000  # Nombre total d'échantillons
n_features = 10  # Nombre de features
n_classes = 5  # Nombre de classes
n_redundant = 0  # Nombre de features redondantes
n_informative=3
n_labeled = 600  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 100  # Nombre d'itérations d'Active Learning
batch_size = int(n_samples / 5000)  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy","hybrid"]  # Méthodes d'incertitude
model_class=lambda : RandomForestClassifier()


run_active_learning_experiment(n_samples, n_features, n_classes, n_redundant, n_informative, n_labeled, test_ratio, n_iterations, batch_size, methods,model_class)


Taille de l'ensemble total: 100000
Taille de l'ensemble labellisé: 600
Taille de l'ensemble non-labellisé: 79400
Taille de l'ensemble de test: 20000
Nb d'itérations: 100
Nb de données labellisées en plus à chaque itération: 20
random - Iteration 1: 620 samples labeled, Accuracy: 0.7127
random - Iteration 2: 640 samples labeled, Accuracy: 0.7124
random - Iteration 3: 660 samples labeled, Accuracy: 0.7136
random - Iteration 4: 680 samples labeled, Accuracy: 0.7113
random - Iteration 5: 700 samples labeled, Accuracy: 0.7140
random - Iteration 6: 720 samples labeled, Accuracy: 0.7105
random - Iteration 7: 740 samples labeled, Accuracy: 0.7137
random - Iteration 8: 760 samples labeled, Accuracy: 0.7141
random - Iteration 9: 780 samples labeled, Accuracy: 0.7118
random - Iteration 10: 800 samples labeled, Accuracy: 0.7171
random - Iteration 11: 820 samples labeled, Accuracy: 0.7185
random - Iteration 12: 840 samples labeled, Accuracy: 0.7138
random - Iteration 13: 860 samples labeled, Accura

Initial Accuracy: 0.7143
Accuracy Improvement (random): 0.0303
Accuracy Improvement (least_confident): 0.0359
Accuracy Improvement (margin): 0.0440
Accuracy Improvement (entropy): 0.0266
Accuracy Improvement (hybrid): 0.0309


In [58]:
# === Paramètres ===
n_samples = 100000  # Nombre total d'échantillons
n_features = 20 # Nombre de features
n_classes = 3  # Nombre de classes
n_redundant = 0  # Nombre de features redondantes
n_informative=5
n_labeled = 600  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 100  # Nombre d'itérations d'Active Learning
batch_size = int(n_samples / 5000)  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy","hybrid"] 
model_class=lambda : RandomForestClassifier()


run_active_learning_experiment(n_samples, n_features, n_classes, n_redundant, n_informative, n_labeled, test_ratio, n_iterations, batch_size, methods,model_class)


Taille de l'ensemble total: 100000
Taille de l'ensemble labellisé: 600
Taille de l'ensemble non-labellisé: 79400
Taille de l'ensemble de test: 20000
Nb d'itérations: 100
Nb de données labellisées en plus à chaque itération: 20
random - Iteration 1: 620 samples labeled, Accuracy: 0.7081
random - Iteration 2: 640 samples labeled, Accuracy: 0.7138
random - Iteration 3: 660 samples labeled, Accuracy: 0.7142
random - Iteration 4: 680 samples labeled, Accuracy: 0.7175
random - Iteration 5: 700 samples labeled, Accuracy: 0.7163
random - Iteration 6: 720 samples labeled, Accuracy: 0.7196
random - Iteration 7: 740 samples labeled, Accuracy: 0.7146
random - Iteration 8: 760 samples labeled, Accuracy: 0.7234
random - Iteration 9: 780 samples labeled, Accuracy: 0.7194
random - Iteration 10: 800 samples labeled, Accuracy: 0.7198
random - Iteration 11: 820 samples labeled, Accuracy: 0.7238
random - Iteration 12: 840 samples labeled, Accuracy: 0.7167
random - Iteration 13: 860 samples labeled, Accura

Initial Accuracy: 0.7132
Accuracy Improvement (random): 0.0405
Accuracy Improvement (least_confident): 0.0510
Accuracy Improvement (margin): 0.0488
Accuracy Improvement (entropy): 0.0471
Accuracy Improvement (hybrid): 0.0489


In [11]:
# === Paramètres ===
n_samples = 100000  # Nombre total d'échantillons
n_features = 20 # Nombre de features
n_classes = 3  # Nombre de classes
n_redundant = 0  # Nombre de features redondantes
n_informative=5
n_labeled = 600  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 100  # Nombre d'itérations d'Active Learning
batch_size = 100  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy","hybrid"] 
model_class=lambda : RandomForestClassifier()


run_active_learning_experiment(n_samples, n_features, n_classes, n_redundant, n_informative, n_labeled, test_ratio, n_iterations, batch_size, methods,model_class)


Taille de l'ensemble total: 100000
Taille de l'ensemble labellisé: 600
Taille de l'ensemble non-labellisé: 79400
Taille de l'ensemble de test: 20000
Nb d'itérations: 100
Nb de données labellisées en plus à chaque itération: 100
random - Iteration 1: 700 samples labeled, Accuracy: 0.7100
random - Iteration 2: 800 samples labeled, Accuracy: 0.7146
random - Iteration 3: 900 samples labeled, Accuracy: 0.7137
random - Iteration 4: 1000 samples labeled, Accuracy: 0.7245
random - Iteration 5: 1100 samples labeled, Accuracy: 0.7267
random - Iteration 6: 1200 samples labeled, Accuracy: 0.7308
random - Iteration 7: 1300 samples labeled, Accuracy: 0.7403
random - Iteration 8: 1400 samples labeled, Accuracy: 0.7398
random - Iteration 9: 1500 samples labeled, Accuracy: 0.7428
random - Iteration 10: 1600 samples labeled, Accuracy: 0.7463
random - Iteration 11: 1700 samples labeled, Accuracy: 0.7496
random - Iteration 12: 1800 samples labeled, Accuracy: 0.7483
random - Iteration 13: 1900 samples labe

Initial Accuracy: 0.7160
Accuracy Improvement (random): 0.0614
Accuracy Improvement (least_confident): 0.0678
Accuracy Improvement (margin): 0.0668
Accuracy Improvement (entropy): 0.0658
Accuracy Improvement (hybrid): 0.0652


Sur MNIST

In [14]:
def read_images(filename):
    with open(filename, 'rb') as file:
        magic_number, nb_images, nb_rows, nb_cols = struct.unpack('>IIII', file.read(16))
        assert magic_number == 2051, "Wrong file"
        image_data = np.fromfile(file, dtype=np.uint8).reshape(nb_images, nb_rows, nb_cols)
    return image_data

def read_targets(filename):
    with open(filename, 'rb') as file:
        magic_number, nb_items = struct.unpack('>II', file.read(8))
        assert magic_number == 2049, "Wrong file"
        targets = np.fromfile(file, dtype=np.uint8)
    return targets

In [15]:
path_images_file = '../t10k-images.idx3-ubyte'
path_to_targets = '../t10k-labels.idx1-ubyte'

In [20]:
X = read_images(path_images_file).reshape(-1, 28*28)
y = read_targets(path_to_targets)

print(np.shape(X),np.shape(y))

(10000, 784) (10000,)


In [None]:

n_labeled = 600  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 50  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy","hybrid"] 
model_class=lambda : RandomForestClassifier()


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 600
Taille de l'ensemble non-labellisé: 7400
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 50
random - Iteration 1: 650 samples labeled, Accuracy: 0.8540
random - Iteration 2: 700 samples labeled, Accuracy: 0.8720
random - Iteration 3: 750 samples labeled, Accuracy: 0.8855
random - Iteration 4: 800 samples labeled, Accuracy: 0.8845
random - Iteration 5: 850 samples labeled, Accuracy: 0.8855
random - Iteration 6: 900 samples labeled, Accuracy: 0.8920
random - Iteration 7: 950 samples labeled, Accuracy: 0.8965
random - Iteration 8: 1000 samples labeled, Accuracy: 0.8935
random - Iteration 9: 1050 samples labeled, Accuracy: 0.8975
random - Iteration 10: 1100 samples labeled, Accuracy: 0.8980
random - Iteration 11: 1150 samples labeled, Accuracy: 0.8995
random - Iteration 12: 1200 samples labeled, Accuracy: 0.9045
random - Iteration 13: 1250 samples labeled, Accu

Initial Accuracy: 0.8736
Accuracy Improvement (random): 0.0599
Accuracy Improvement (least_confident): 0.0794
Accuracy Improvement (margin): 0.0824
Accuracy Improvement (entropy): 0.0779
Accuracy Improvement (hybrid): 0.0834


In [23]:

n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 10  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy","hybrid"] 
model_class=lambda : RandomForestClassifier()


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 100
Taille de l'ensemble non-labellisé: 7900
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 10
random - Iteration 1: 110 samples labeled, Accuracy: 0.6855
random - Iteration 2: 120 samples labeled, Accuracy: 0.6985
random - Iteration 3: 130 samples labeled, Accuracy: 0.7055
random - Iteration 4: 140 samples labeled, Accuracy: 0.7310
random - Iteration 5: 150 samples labeled, Accuracy: 0.7370
random - Iteration 6: 160 samples labeled, Accuracy: 0.7490
random - Iteration 7: 170 samples labeled, Accuracy: 0.7665
random - Iteration 8: 180 samples labeled, Accuracy: 0.7790
random - Iteration 9: 190 samples labeled, Accuracy: 0.7850
random - Iteration 10: 200 samples labeled, Accuracy: 0.7925
random - Iteration 11: 210 samples labeled, Accuracy: 0.7885
random - Iteration 12: 220 samples labeled, Accuracy: 0.8095
random - Iteration 13: 230 samples labeled, Accuracy: 

Initial Accuracy: 0.6771
Accuracy Improvement (random): 0.2054
Accuracy Improvement (least_confident): 0.2159
Accuracy Improvement (margin): 0.2444
Accuracy Improvement (entropy): 0.2119
Accuracy Improvement (hybrid): 0.2159


In [24]:

n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 18  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy","hybrid"] 
model_class=lambda : RandomForestClassifier()


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 100
Taille de l'ensemble non-labellisé: 7900
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 18
random - Iteration 1: 118 samples labeled, Accuracy: 0.6820
random - Iteration 2: 136 samples labeled, Accuracy: 0.6965
random - Iteration 3: 154 samples labeled, Accuracy: 0.7705
random - Iteration 4: 172 samples labeled, Accuracy: 0.7920
random - Iteration 5: 190 samples labeled, Accuracy: 0.7950
random - Iteration 6: 208 samples labeled, Accuracy: 0.7900
random - Iteration 7: 226 samples labeled, Accuracy: 0.8030
random - Iteration 8: 244 samples labeled, Accuracy: 0.8150
random - Iteration 9: 262 samples labeled, Accuracy: 0.8100
random - Iteration 10: 280 samples labeled, Accuracy: 0.8255
random - Iteration 11: 298 samples labeled, Accuracy: 0.8255
random - Iteration 12: 316 samples labeled, Accuracy: 0.8225
random - Iteration 13: 334 samples labeled, Accuracy: 

Initial Accuracy: 0.6754
Accuracy Improvement (random): 0.2291
Accuracy Improvement (least_confident): 0.2516
Accuracy Improvement (margin): 0.2641
Accuracy Improvement (entropy): 0.2476
Accuracy Improvement (hybrid): 0.2416


In [91]:

n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 18  # Taille du batch d'échantillons ajoutés à chaque itération
#methods = ["random", "least_confident", "margin", "entropy","hybrid"] 
methods=["random", "least_confident", "margin", "entropy","hybrid","qbc"]
model_class=lambda : RandomForestClassifier()
# Créer un comité avec 3 modèles différents
# Liste de modèles pour le comité
models = [
    clone(RandomForestClassifier()),  # Random Forest
    clone(LogisticRegression(max_iter=1000)),                     # Régression Logistique
    clone(SVC(probability=True)),
    clone(KNeighborsClassifier(n_neighbors=5)),      # k-NN
    clone(DecisionTreeClassifier()),                 # Arbre de décision
    clone(GaussianNB()),                             # Naïve Bayes
    clone(GradientBoostingClassifier()),             # Gradient Boosting
    clone(MLPClassifier()),                          # Réseau de neurones simple
    clone(RidgeClassifier())                       # Régression Ridge pour classificati)           # Analyse discriminante quadratique
]                # SVM avec probabilités
        # Analyse discriminante quadratique


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class,models)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 100
Taille de l'ensemble non-labellisé: 7900
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 18
(18,)
random - Iteration 1: 118 samples labeled, Accuracy: 0.6875
(18,)
random - Iteration 2: 136 samples labeled, Accuracy: 0.7255
(18,)
random - Iteration 3: 154 samples labeled, Accuracy: 0.7420
(18,)
random - Iteration 4: 172 samples labeled, Accuracy: 0.7480
(18,)
random - Iteration 5: 190 samples labeled, Accuracy: 0.7715
(18,)
random - Iteration 6: 208 samples labeled, Accuracy: 0.7865
(18,)
random - Iteration 7: 226 samples labeled, Accuracy: 0.7980
(18,)
random - Iteration 8: 244 samples labeled, Accuracy: 0.8115
(18,)
random - Iteration 9: 262 samples labeled, Accuracy: 0.8195
(18,)
random - Iteration 10: 280 samples labeled, Accuracy: 0.8275
(18,)
random - Iteration 11: 298 samples labeled, Accuracy: 0.8275
(18,)
random - Iteration 12: 316 samples labeled,

Initial Accuracy: 0.6680
Accuracy Improvement (random): 0.2235
Accuracy Improvement (least_confident): 0.2700
Accuracy Improvement (margin): 0.2710
Accuracy Improvement (entropy): 0.2575
Accuracy Improvement (hybrid): 0.2575
Accuracy Improvement (qbc): 0.1670


In [120]:

n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 18  # Taille du batch d'échantillons ajoutés à chaque itération
#methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-variance","qbc-entropy","qbc-KL"]
methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-variance","qbc-entropy","qbc-KL"]
model_class=lambda : RandomForestClassifier()
# Créer un comité avec 3 modèles différents
# Liste de modèles pour le comité
models = [
    clone(RandomForestClassifier()),  # Random Forest
    clone(LogisticRegression(max_iter=1000)),                     # Régression Logistique
    clone(SVC(probability=True)),                   # Régression Ridge pour classificati)           # Analyse discriminante quadratique
]                # SVM avec probabilités
        # Analyse discriminante quadratique


# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class,models)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 100
Taille de l'ensemble non-labellisé: 7900
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 18
random - Iteration 1: 118 samples labeled, Accuracy: 0.6830
random - Iteration 2: 136 samples labeled, Accuracy: 0.7140
random - Iteration 3: 154 samples labeled, Accuracy: 0.7235
random - Iteration 4: 172 samples labeled, Accuracy: 0.7290
random - Iteration 5: 190 samples labeled, Accuracy: 0.7255
random - Iteration 6: 208 samples labeled, Accuracy: 0.7515
random - Iteration 7: 226 samples labeled, Accuracy: 0.7645
random - Iteration 8: 244 samples labeled, Accuracy: 0.7865
random - Iteration 9: 262 samples labeled, Accuracy: 0.7925
random - Iteration 10: 280 samples labeled, Accuracy: 0.7920
random - Iteration 11: 298 samples labeled, Accuracy: 0.8000
random - Iteration 12: 316 samples labeled, Accuracy: 0.8135
random - Iteration 13: 334 samples labeled, Accuracy: 

Initial Accuracy: 0.6807
Accuracy Improvement (random): 0.2208
Accuracy Improvement (least_confident): 0.2513
Accuracy Improvement (margin): 0.2543
Accuracy Improvement (entropy): 0.2403
Accuracy Improvement (hybrid): 0.2373
Accuracy Improvement (qbc-variance): 0.2423
Accuracy Improvement (qbc-entropy): 0.2593
Accuracy Improvement (qbc-KL): 0.1733


In [None]:

n_labeled = 100  # Nombre d'échantillons labellisés initialement
test_ratio = 0.2  # Proportion du dataset réservée au test
n_iterations = 50  # Nombre d'itérations d'Active Learning
batch_size = 18  # Taille du batch d'échantillons ajoutés à chaque itération
#methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-variance","qbc-entropy","qbc-KL"]
methods=["random", "least_confident", "margin", "entropy","hybrid","qbc-variance","qbc-entropy","qbc-KL"]
model_class=lambda : RandomForestClassifier()
# Créer un comité avec 3 modèles différents
# Liste de modèles pour le comité
models = [
    clone(RandomForestClassifier()),          # Random Forest
    clone(GradientBoostingClassifier()),      # Gradient Boosting
    clone(AdaBoostClassifier()),              # AdaBoost
    clone(LogisticRegression(max_iter=1000)), # Régression Logistique
    clone(SVC(probability=True)),             # Support Vector Classifier
    clone(KNeighborsClassifier()),            # K-Nearest Neighbors
    clone(GaussianNB()),                      # Naive Bayes
    clone(DecisionTreeClassifier()),          # Decision Tree
    clone(MLPClassifier(max_iter=1000))
]

# Exécution de l'expérience
run_active_learning_experiment(X, y, n_labeled, test_ratio, n_iterations, batch_size, methods, model_class,models)


Taille de l'ensemble total: 10000
Taille de l'ensemble labellisé: 100
Taille de l'ensemble non-labellisé: 7900
Taille de l'ensemble de test: 2000
Nb d'itérations: 50
Nb de données labellisées en plus à chaque itération: 18
random - Iteration 1: 118 samples labeled, Accuracy: 0.6750
random - Iteration 2: 136 samples labeled, Accuracy: 0.7165
random - Iteration 3: 154 samples labeled, Accuracy: 0.7650
random - Iteration 4: 172 samples labeled, Accuracy: 0.7965
random - Iteration 5: 190 samples labeled, Accuracy: 0.8055
random - Iteration 6: 208 samples labeled, Accuracy: 0.7925
random - Iteration 7: 226 samples labeled, Accuracy: 0.7910
random - Iteration 8: 244 samples labeled, Accuracy: 0.8080
random - Iteration 9: 262 samples labeled, Accuracy: 0.8140
random - Iteration 10: 280 samples labeled, Accuracy: 0.8140
random - Iteration 11: 298 samples labeled, Accuracy: 0.8200
random - Iteration 12: 316 samples labeled, Accuracy: 0.8330
random - Iteration 13: 334 samples labeled, Accuracy: 