In [181]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.stats import entropy
from sklearn.ensemble import RandomForestClassifier
import plotly.express as px

from sklearn.metrics import accuracy_score

In [182]:
def generate_dataset(n_samples, n_features, n_classes, n_redundant):
    return make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, 
                               n_redundant=n_redundant, n_clusters_per_class=1, random_state=42)




In [183]:
def split_dataset(X, y, nb_labeled=600, test_ratio=0.2):
    limit = int(test_ratio * np.shape(X)[0])
    X_train, X_pool, y_train, y_pool = train_test_split(X, y, test_size=1 - nb_labeled / np.shape(X)[0], random_state=42)
    X_test, y_test = X_pool[:limit], y_pool[:limit]
    X_pool, y_pool = X_pool[limit:], y_pool[limit:]
    
    return X_train, X_pool, y_train, y_pool, X_test, y_test

In [184]:
# Fonction pour calculer l'incertitude
def calculate_uncertainty(probabilities, method="entropy"):
    if method == "entropy":
        return entropy(probabilities.T, base=2)
    elif method == "margin":
        sorted_probs = np.sort(probabilities, axis=1)
        return sorted_probs[:, -1] - sorted_probs[:, -2]
    elif method == "least_confident":
        return 1 - np.max(probabilities, axis=1)
    elif method == "random":
        return np.random.rand(probabilities.shape[0])
    else:
        raise ValueError("Méthode d'incertitude non reconnue")

In [185]:
def train_and_evaluate(X_train, y_train, X_test, y_test):
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

In [186]:
def select_uncertain_samples(model, X_pool, method, batch_size):
    probabilities = model.predict_proba(X_pool)
    uncertainties = calculate_uncertainty(probabilities, method=method)
    if method == "margin":
        return np.argsort(uncertainties)[:batch_size]
    else:
        return np.argsort(uncertainties)[-batch_size:]

In [187]:
def update_labeled_unlabeled_sets(X_train, y_train, X_pool, y_pool, uncertain_indices):
    X_train = np.vstack((X_train, X_pool[uncertain_indices]))
    y_train = np.hstack((y_train, y_pool[uncertain_indices]))
    
    mask = np.ones(len(X_pool), dtype=bool)
    mask[uncertain_indices] = False
    X_pool, y_pool = X_pool[mask], y_pool[mask]
    
    return X_train, y_train, X_pool, y_pool

In [189]:
# === Paramètres ===
n_samples = 100000  # Nombre total d'échantillons
n_features = 10  # Nombre de features
n_classes = 2  # Nombre de classes
n_redundant = 0  # Nombre de features redondantes
n_iterations = 10  # Nombre d'itérations d'Active Learning
batch_size = int(n_samples / 1000)  # Taille du batch d'échantillons ajoutés à chaque itération
methods = ["random", "least_confident", "margin", "entropy"]  # Méthodes d'incertitude

# === Chargement du dataset ===
X, y = generate_dataset(n_samples, n_features, n_classes, n_redundant)
fig = px.scatter(x=X[:, 0], y=X[:, 1], color=y.astype(str), title="Dataset Initial",
                 labels={"color": "Classe"})
fig.show()

X_train, X_pool, y_train, y_pool, X_test, y_test = split_dataset(X, y)

# Affichage des tailles des ensembles
total_size = np.shape(X)[0]
print(f"Taille de l'ensemble total: {total_size}")
print(f"Taille de l'ensemble labellisé: {np.shape(X_train)[0]}")
print(f"Taille de l'ensemble non-labellisé: {np.shape(X_pool)[0]}")
print(f"Taille de l'ensemble de test: {np.shape(X_test)[0]}")

print(f"Nb d'itérations: {n_iterations}")
print(f"Nb de données labellisées en plus à chaque itération: {batch_size}")

# === Initialisation des variables ===
accuracies = {method: [] for method in methods}
final_accuracies = {}

# Évaluation initiale
y_pred_initial = RandomForestClassifier().fit(X_train, y_train).predict(X)
initial_accuracy = accuracy_score(y, y_pred_initial)

# === Fonctions pour Active Learning ===
def train_and_evaluate(X_train, y_train, X_test, y_test):
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, accuracy_score(y_test, y_pred)

def select_uncertain_samples(model, X_pool, method, batch_size):
    probabilities = model.predict_proba(X_pool)
    uncertainties = calculate_uncertainty(probabilities, method=method)
    if method == "margin":
        return np.argsort(uncertainties)[:batch_size]
    else:
        return np.argsort(uncertainties)[-batch_size:]

def update_labeled_unlabeled_sets(X_train, y_train, X_pool, y_pool, uncertain_indices):
    X_train = np.vstack((X_train, X_pool[uncertain_indices]))
    y_train = np.hstack((y_train, y_pool[uncertain_indices]))
    
    mask = np.ones(len(X_pool), dtype=bool)
    mask[uncertain_indices] = False
    X_pool, y_pool = X_pool[mask], y_pool[mask]
    
    return X_train, y_train, X_pool, y_pool

# === Boucle principale d'Active Learning ===
for method in methods:
    X_train_temp, y_train_temp = X_train.copy(), y_train.copy()
    X_pool_temp, y_pool_temp = X_pool.copy(), y_pool.copy()
    
    for i in range(n_iterations):
        model, acc = train_and_evaluate(X_train_temp, y_train_temp, X_test, y_test)
        accuracies[method].append(acc)
        
        uncertain_indices = select_uncertain_samples(model, X_pool_temp, method, batch_size)
        X_train_temp, y_train_temp, X_pool_temp, y_pool_temp = update_labeled_unlabeled_sets(
            X_train_temp, y_train_temp, X_pool_temp, y_pool_temp, uncertain_indices
        )
        
        print(f"{method} - Iteration {i+1}: {len(X_train_temp)} samples labeled, Accuracy: {acc:.4f}")
    
    final_accuracies[method] = accuracy_score(y, model.predict(X))
    print(f"Final Accuracy ({method}): {final_accuracies[method]:.4f}")

# === Affichage des résultats ===
fig = px.line(title="Évolution de l'accuracy au fil des itérations d'Active Learning")
for method in methods:
    fig.add_scatter(x=list(range(1, n_iterations + 1)), y=accuracies[method], mode='lines+markers', name=method)
fig.update_layout(xaxis_title="Iteration", yaxis_title="Accuracy")
fig.show()

# Comparaison des accuracy finales
fig = px.bar(x=methods, y=[final_accuracies[m] for m in methods], labels={"x": "Méthode", "y": "Final Accuracy"},
             title="Comparaison des Accuracy Finales")
fig.update_layout(yaxis=dict(range=[min(final_accuracies.values()) - 0.01, max(final_accuracies.values()) + 0.01]))
fig.show()

# Comparaison de l'amélioration de l'accuracy
accuracy_improvements = {method: final_accuracies[method] - initial_accuracy for method in methods}
fig = px.bar(x=methods, y=[accuracy_improvements[m] for m in methods], labels={"x": "Méthode", "y": "Amélioration d'Accuracy"},
             title="Amélioration de l'Accuracy sur tout le dataset")
fig.update_layout(yaxis=dict(range=[min(accuracy_improvements.values()) - 0.01, max(accuracy_improvements.values()) + 0.01]))
fig.show()

print(f"Initial Accuracy: {initial_accuracy:.4f}")
for method in methods:
    print(f"Accuracy Improvement ({method}): {accuracy_improvements[method]:.4f}")


Taille de l'ensemble total: 100000
Taille de l'ensemble labellisé: 600
Taille de l'ensemble non-labellisé: 79400
Taille de l'ensemble de test: 20000
Nb d'itérations: 10
Nb de données labellisées en plus à chaque itération: 100
random - Iteration 1: 700 samples labeled, Accuracy: 0.9357
random - Iteration 2: 800 samples labeled, Accuracy: 0.9356
random - Iteration 3: 900 samples labeled, Accuracy: 0.9442
random - Iteration 4: 1000 samples labeled, Accuracy: 0.9422
random - Iteration 5: 1100 samples labeled, Accuracy: 0.9496
random - Iteration 6: 1200 samples labeled, Accuracy: 0.9480
random - Iteration 7: 1300 samples labeled, Accuracy: 0.9499
random - Iteration 8: 1400 samples labeled, Accuracy: 0.9477
random - Iteration 9: 1500 samples labeled, Accuracy: 0.9526
random - Iteration 10: 1600 samples labeled, Accuracy: 0.9542
Final Accuracy (random): 0.9551
least_confident - Iteration 1: 700 samples labeled, Accuracy: 0.9335
least_confident - Iteration 2: 800 samples labeled, Accuracy: 0.

Initial Accuracy: 0.9366
Accuracy Improvement (random): 0.0185
Accuracy Improvement (least_confident): 0.0242
Accuracy Improvement (margin): 0.0285
Accuracy Improvement (entropy): 0.0317
