# Chapter 7: Utilizing Tools and Packages for Active ML

## Imports

In [None]:
!pip install scikit-learn

In [None]:
!pip install modAL-python

In [5]:
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import numpy as np
import random
from modAL.models import ActiveLearner, Committee
from sklearn.ensemble import RandomForestClassifier
from modAL.uncertainty import uncertainty_sampling
import os
from PIL import Image
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from modAL.disagreement import vote_entropy_sampling
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader


## Customer Segmentation with KMeans in Scikit-learn

In [None]:
# Mock customer data (age, annual income)
X = np.array([[34, 20000], [42, 30000], [23, 25000], [32, 45000], [38, 30000]])

# Using KMeans for clustering
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

# Predicting the cluster for each customer
clusters = kmeans.predict(X)

# Setting up a classifier
classifier = LogisticRegression()
classifier.fit(X, clusters)  # Training the classifier on our initial labeled data

# Simulating additional unlabeled data
X_unlabeled = np.array([[28, 22000], [45, 55000], [37, 35000], [50, 48000], [29, 27000], [41, 32000]])

# Function to obtain labels for selected instances
def obtain_labels(data):
    # This is a placeholder function. In a real scenario, we would obtain these labels through some form of data collection.
    # Here, we'll simulate this by randomly assigning one of the existing cluster labels.
    return np.random.choice([0, 1], size=len(data))

# Active Learning Loop
num_iterations = 10  # number of iterations
num_to_label = 2    # number of instances to label in each iteration

for iteration in range(num_iterations):
    if len(X_unlabeled) == 0:
        break  # No more data to label

    # Predict on unlabeled data
    predictions = classifier.predict_proba(X_unlabeled)
    uncertainty = np.max(predictions, axis=1)

    # Select num_to_label instances with least confidence
    uncertain_indices = np.argsort(uncertainty)[:num_to_label]

    # Obtain labels for these instances
    new_labels = obtain_labels(X_unlabeled[uncertain_indices])

    # Update our dataset
    X = np.vstack([X, X_unlabeled[uncertain_indices]])
    clusters = np.hstack([clusters, new_labels])

    # Re-train classifier and KMeans
    classifier.fit(X, clusters)
    kmeans.fit(X)

    print(f"Iteration {iteration+1}, Labeled Data: {X_unlabeled[uncertain_indices]} with Labels: {new_labels}")

    # Remove labeled instances from unlabeled data
    X_unlabeled = np.delete(X_unlabeled, uncertain_indices, axis=0)

    # Shuffle unlabeled data to avoid any order bias
    X_unlabeled = shuffle(X_unlabeled)


Iteration 1, Labeled Data: [[   45 55000]
 [   29 27000]] with Labels: [0 1]
Iteration 2, Labeled Data: [[   37 35000]
 [   28 22000]] with Labels: [1 1]
Iteration 3, Labeled Data: [[   41 32000]
 [   50 48000]] with Labels: [0 0]




## modAL for Image Classification CIFAR10

In [12]:
def load_data():
    # Define the transformation
    transform = transforms.Compose([
        transforms.ToTensor(),  # Convert images to PyTorch tensors
    ])

    # Load the CIFAR10 dataset
    dataset = CIFAR10(root='data', train=True, download=True, transform=transform)

    # Load all data into memory (for small datasets)
    dataloader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)
    data_iter = iter(dataloader)
    images, labels = next(data_iter)

    # Convert images and labels to numpy arrays
    X_all = images.numpy()
    y_all = np.array(labels)

    # Flatten images from 3D to 1D (batch_size, 3, 32, 32) -> (batch_size, 3072) for RandomForest
    X_all = X_all.reshape(X_all.shape[0], -1)

    # Map numerical labels to string labels
    class_names = dataset.classes
    y_all = np.array([class_names[label] for label in y_all])

    return X_all, y_all

X_all, y_all = load_data()

# Example usage
print(f"Shape of X_all: {X_all.shape}")
print(f"First 5 labels: {y_all[:5]}")


Files already downloaded and verified
Shape of X_all: (50000, 3072)
First 5 labels: ['frog' 'truck' 'truck' 'deer' 'automobile']


In [13]:
y_all

array(['frog', 'truck', 'truck', ..., 'truck', 'automobile', 'automobile'],
      dtype='<U10')

In [14]:
X_initial, X_unlabeled, y_initial, _ = train_test_split(X_all, y_all, test_size=0.75, random_state=42)
print(f"We are starting our example with {len(X_initial)} labeled images and {len(X_unlabeled)} unlabeled images")

We are starting our example with 12500 labeled images and 37500 unlabeled images


In [15]:
# Initialize the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    query_strategy=uncertainty_sampling,
    X_training=X_initial, y_training=y_initial
)

# Active learning loop
for i in range(5):
    query_idx, _ = learner.query(X_unlabeled)
    # Retrieve the actual label for the queried instance
    actual_label = y_all[query_idx[0]]  # Assuming query_idx[0] because query returns a batch of queries

    # Simulate obtaining the label (in practice, you would get this from an oracle or human annotator)
    # Since we're simulating, we use the actual label from the dataset
    print(f"Selected unlabeled query is sample number {query_idx[0]}. Actual label: {actual_label}")

    # Teach the learner with the newly labeled example
    learner.teach(X_unlabeled[query_idx].reshape(1, -1), actual_label.reshape(1,))

    # Remove the queried instance from the unlabeled pool
    X_unlabeled = np.delete(X_unlabeled, query_idx, axis=0)
    # Also, update the labels to reflect the removal of the labeled instance
    y_all = np.delete(y_all, query_idx)  # Ensure this matches how you handle labels for the unlabeled pool

Selected unlabeled query is sample number 3100. Actual label: cat
Selected unlabeled query is sample number 7393. Actual label: deer
Selected unlabeled query is sample number 4728. Actual label: horse
Selected unlabeled query is sample number 447. Actual label: deer
Selected unlabeled query is sample number 17968. Actual label: bird


## modAL for QueryBy Committee on the Iris dataset

In [None]:
# Load the Iris dataset
X, y = load_iris(return_X_y=True)

In [None]:
# Initial split for labeled and unlabeled data
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X, y, test_size=0.9, random_state=42)

In [None]:
# Create multiple learners for the committee
n_learners = 20
learners = [ActiveLearner(estimator=RandomForestClassifier(), X_training=X_labeled, y_training=y_labeled) for _ in range(n_learners)]

# Create the committee
committee = Committee(learner_list=learners, query_strategy=vote_entropy_sampling)

# Active learning loop
n_queries = 5
for idx in range(n_queries):
    query_idx, query_instance = committee.query(X_unlabeled)
    print(f"\nSelected unlabeled query is sample number {query_idx}. We simulate labeling this sample which is labeled as: {y_unlabeled[query_idx]}")
    committee.teach(X_unlabeled[query_idx], y_unlabeled[query_idx])

    # Remove the queried instance from the pool
    X_unlabeled = np.delete(X_unlabeled, query_idx, axis=0)
    y_unlabeled = np.delete(y_unlabeled, query_idx)
    print(f"Number of unlabeled samples is {len(X_unlabeled)}")

    # Calculate and print committee score
    committee_score = committee.score(X, y)
    print(f"Iteration {idx+1}, Committee Score: {committee_score}")



Selected unlabeled query is sample number [8]. We simulate labeling this sample which is labeled as: [0]
Number of unlabeled samples is 129
Iteration 1, Committee Score: 0.96

Selected unlabeled query is sample number [125]. We simulate labeling this sample which is labeled as: [2]
Number of unlabeled samples is 128
Iteration 2, Committee Score: 0.9466666666666667

Selected unlabeled query is sample number [42]. We simulate labeling this sample which is labeled as: [2]
Number of unlabeled samples is 127
Iteration 3, Committee Score: 0.9466666666666667

Selected unlabeled query is sample number [47]. We simulate labeling this sample which is labeled as: [1]
Number of unlabeled samples is 126
Iteration 4, Committee Score: 0.9733333333333334

Selected unlabeled query is sample number [95]. We simulate labeling this sample which is labeled as: [1]
Number of unlabeled samples is 125
Iteration 5, Committee Score: 0.9733333333333334


In [None]:
# Final evaluation
final_score = committee.score(X, y)
print(f"Final Committee Score: {final_score}")

Final Committee Score: 0.9733333333333334
