The code below defines the SubsetSampler class, which we will use to train the model on only a fraction of the data.

In [None]:
import torch.utils.data.sampler as samplers

class SubsetSampler(samplers.Sampler):
    """Sampler that yields a fixed collection of dataset indices, in order.

    The indices are visited exactly in the order in which they were given,
    so iterating the sampler is deterministic.
    """

    def __init__(self, indices):
        """Remember which indices to sample.

        Args:
            indices: Sized, iterable collection of dataset indices. It is
                stored by reference, not copied.
        """
        self.indices = indices

    def __iter__(self):
        """Return an iterator over the stored indices in their given order."""
        return iter(self.indices)

    def __len__(self):
        """Return how many indices the sampler yields."""
        return len(self.indices)

Usage:

- SubsetSampler takes the indices of the records of interest as an input.

This is the implementation for the ActiveStrategy Class.
An instance of that class is the controller for all Active Learning-related manipulations.

In [13]:
class ActiveStrategy(object):

    def __init__(self, neuralNet, nsteps, clear=True, verbose=True):
        """Set up bookkeeping, samplers, data loaders and label caches.

        Reads the module-level ``trainset`` and ``testset`` (and, through
        ``init_stats`` and ``load``, the module-level ``classes``).

        Args:
            neuralNet: Network to train and evaluate; stored as ``self.net``.
            nsteps: Number of slices the training set is divided into for
                the per-class statistics.
            clear: If true, weights are re-initialized before training.
            verbose: If true, running loss and per-class scores are printed.
        """
        def sequential_loader(dataset):
            # One record per batch, visited in dataset order.
            return torch.utils.data.DataLoader(dataset,
                                               shuffle=False,
                                               batch_size=1,
                                               num_workers=1)

        self.clear = clear
        self.verbose = verbose
        self.net = neuralNet
        self.nsteps = nsteps

        self.train_length = len(trainset)
        self.test_length = len(testset)

        # Label caches and per-class index lists; init_stats() seeds the latter.
        self.train_lbls, self.test_lbls = {}, {}
        self.train_ind, self.test_ind = {}, {}
        self.init_stats()

        # By default every record of both sets is selected.
        self.train_filter = list(range(self.train_length))
        self.test_filter = list(range(self.test_length))
        self.train_sampler = SubsetSampler(self.train_filter)
        self.test_sampler = SubsetSampler(self.test_filter)

        self.statsloader = sequential_loader(trainset)
        self.trainloader = sequential_loader(trainset)
        self.testloader = sequential_loader(testset)

        self.load()
        self.experiments = []

    def init_stats(self):
        self.stats = {}
        empty_dict = {}
        for i in range(self.nsteps + 1):
            empty_dict[i] = 0
        for cl in classes:
            self.stats[cl] = empty_dict.copy()
            self.train_ind[cl] = []
            self.test_ind[cl] = []

    def update_stats(self, cl, sl):
        self.stats[cl][0]    += 1
        self.stats[cl][sl+1] += 1

    def load(self):
        for i, data in enumerate(self.statsloader, 0):
            inputs, labels = data
            sl = int(float(i) / self.train_length * self.nsteps)
            self.update_stats(classes[labels[0]], sl)
        for i, data in enumerate(self.trainloader, 0):
            inputs, labels = data
            self.train_lbls[i] = classes[labels[0]]
            #self.train_ind[classes[labels[0]]].append(i)
        for i, data in enumerate(self.testloader, 0):
            inputs, labels = data
            self.test_lbls[i] = classes[labels[0]]
            #self.test_ind[classes[labels[0]]].append(i)

    def init_loaders(self):
        """Rebuild the train and test loaders on top of the current samplers.

        Both loaders serve one record per batch and draw their indices from
        ``self.train_sampler`` / ``self.test_sampler`` respectively.
        """
        common = dict(shuffle=False, batch_size=1, num_workers=1)
        self.trainloader = torch.utils.data.DataLoader(
            trainset, sampler=self.train_sampler, **common)
        self.testloader = torch.utils.data.DataLoader(
            testset, sampler=self.test_sampler, **common)
        
    def initialize_weights(self):
        for m in self.net.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight.data)

    def incremental_supervised(self):
        np.random.shuffle(self.train_filter)
        
    def load_strategy(self, selected):
        """Use ``selected`` as the set of training indices.

        The object is stored by reference (no copy is made), so later
        changes to it by the caller are visible through ``self.train_filter``.

        Args:
            selected: Collection of training-set indices.
        """
        self.train_filter = selected
        
    def train(self):

        if self.clear:
            self.initialize_weights()

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(self.net.parameters(), lr=0.001, momentum=0.9)

        for epoch in range(2):

            running_loss = 0.0
            for i, data in enumerate(self.trainloader, 0):
                # get the inputs
                inputs, labels = data

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = self.net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # print statistics
                if self.verbose:
                    running_loss += loss.item()
                    if i % 2000 == 1999:    # print every 2000 mini-batches
                        print('[%d, %5d] loss: %.3f' %
                            (epoch + 1, i + 1, running_loss / 2000))
                        running_loss = 0.0

        print('Finished Training')
        
    def test(self):

        dataiter = iter(self.testloader)
        images, labels = dataiter.next()

        print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(1)))

        outputs = self.net(images)
        _, predicted = torch.max(outputs, 1)

        print('Predicted: ', ' '.join('%5s' % classes[predicted[j]] for j in range(1)))

        soft = torch.nn.Softmax(dim=0)

        ##### Stats below ######

        ground_truth = []
        predictions  = []
        probabilities = []

        correct = 0
        total = 0
        with torch.no_grad():
            for data in self.testloader:
                images, labels = data
                outputs = self.net(images)
                _, predicted = torch.max(outputs.data, 1)
                ground_truth.append(labels.item())
                predictions.append(predicted.item())
                probabilities.append(soft(outputs[0]))
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print('Accuracy of the network on the {0} test images: {1}%'
                .format(self.test_length, 100 * correct / total))

        class_correct = {}
        class_total   = {}
        class_pred    = {}
        for cl in classes:
            class_correct[cl] = 0
            class_total[cl]   = 0
            class_pred[cl]    = 0

        with torch.no_grad():
            for data in self.testloader:
                images, labels = data
                outputs = self.net(images)
                _, predicted = torch.max(outputs, 1)
                c = (predicted == labels).squeeze()
                # hard-coded for now  (.item() to avoid warning - not true if batch_size > 1) (J.Prendki)
                label = labels.item()
                class_correct[classes[label]] += c.item()
                class_total[classes[label]] += 1
                class_pred[classes[predicted.item()]] += 1

        if self.verbose:
            for cl in classes:
                precision = class_correct[cl] / class_total[cl]
                Fscore = -99.0
                if class_pred[cl] > 0:
                    recall = class_correct[cl] / class_pred[cl]
                    Fscore    = 2.0 * precision * recall / (precision + recall)
                print('%5s : \t Accuracy: %2d %% \t F-Score %.2f' % (
                        cl,
                        100.0 * precision,
                        Fscore))

        return correct / total, ground_truth, predictions, probabilities

    def infer(self, sample):
        """Run the network on the given training-set records.

        The network is switched to eval mode while predicting and put back
        in its previous mode afterwards.

        Args:
            sample: Collection of indices into the module-level ``trainset``.

        Returns:
            One list per record, in the order of ``sample``:
            ``[index, true class name, predicted class name,
            probability of the predicted class (0-dim tensor),
            NumPy array with the probability of every class]``.
        """
        sampler = SubsetSampler(sample)
        dataloader = torch.utils.data.DataLoader(trainset,
                                                 shuffle=False,
                                                 batch_size=1,
                                                 num_workers=4,
                                                 sampler=sampler)
        soft = torch.nn.Softmax(dim=0)
        results = []

        was_training = self.net.training
        self.net.eval()
        try:
            with torch.no_grad():
                # The loader serves one record per index of `sample`, in the
                # same order, so pairing them reports each record under its
                # dataset index rather than its position within `sample`.
                for (images, labels), index in zip(dataloader, sample):
                    outputs = self.net(images)
                    _, predicted = torch.max(outputs.data, 1)
                    ground_truth = labels.item()
                    prediction = predicted.item()
                    probability = soft(outputs[0])
                    classwiseprobs = probability.numpy()
                    results.append([index,
                                    classes[ground_truth],
                                    classes[prediction],
                                    probability[prediction],
                                    classwiseprobs])
        finally:
            self.net.train(was_training)
        return results
    
    def run_one(self, selected):
        """Train on exactly the records in ``selected`` and evaluate.

        Args:
            selected: Collection of training-set indices to train on.

        Returns:
            The accuracy reported by ``self.test()``.
        """
        self.load_strategy(selected)

        if self.clear:
            self.initialize_weights()
            print("Network's weights reinitialized")
        print("Training for {0} records:".format(len(selected)))

        # Point the training loader at the freshly selected records.
        self.train_sampler = SubsetSampler(self.train_filter)
        self.trainloader = torch.utils.data.DataLoader(
            trainset,
            shuffle=False,
            batch_size=1,
            num_workers=1,
            sampler=self.train_sampler,
        )

        self.train()
        accuracy, _truth, _outs, _probs = self.test()
        return accuracy
    
    def run_experiment(self, nsteps, maximum):
        """Train on growing prefixes of ``self.train_filter`` and evaluate each.

        Step ``n`` (``1..nsteps``) trains on the first
        ``n * maximum / nsteps`` entries of ``self.train_filter``. The filter
        is used in its current order; shuffle it beforehand (e.g. with
        ``incremental_supervised``) if random subsets are wanted.

        Args:
            nsteps: Number of training rounds.
            maximum: Number of records used in the final round.

        Returns:
            List with the test accuracy of every round.
        """
        results = []
        for n in range(1, nsteps + 1):
            if self.clear:
                self.initialize_weights()
                print("Network's weights reinitialized")
            # Multiply before dividing: `1.0 / nsteps * n` can fall just
            # below the exact ratio (e.g. 1.0 / 49 * 49 < 1), which would
            # truncate to one sample too few.
            nsamples = int(n * maximum / nsteps)
            print("Training for {0} samples:".format(nsamples))
            self.train_sampler = SubsetSampler(self.train_filter[:nsamples])
            self.trainloader = torch.utils.data.DataLoader(trainset,
                                                           shuffle=False,
                                                           batch_size=1,
                                                           num_workers=4,
                                                           sampler=self.train_sampler)
            self.train()
            accuracy, _truth, _outs, _probs = self.test()
            results.append(accuracy)

        return results
    
    def run_ConfidenceAL(self, qStrategy, nsteps, maximum):
        """Run a pool-based active-learning loop driven by ``qStrategy``.

        The first round trains on ``maximum / nsteps`` randomly drawn
        records. Every following round runs inference on the records not yet
        selected, lets ``qStrategy`` pick ``maximum / nsteps`` more of them,
        and trains again on everything selected so far.

        Args:
            qStrategy: Callable ``qStrategy(results, count)`` that receives
                the output of ``infer`` and returns the indices to add.
            nsteps: Number of training rounds.
            maximum: Total number of records to select over all rounds.

        Returns:
            List with the test accuracy of every round.
        """
        # Records added per round; computed from the arguments instead of
        # relying on a name defined outside this method.
        per_step = int(maximum / nsteps)

        unlabeled = list(range(len(trainset)))

        to_be_labeled = random.sample(unlabeled, per_step)
        unlabeled = list(set(unlabeled) - set(to_be_labeled))
        results = [self.run_one(to_be_labeled)]

        for _ in range(1, nsteps):
            candidates = self.infer(unlabeled)
            to_be_labeled.extend(qStrategy(candidates, per_step))
            unlabeled = list(set(unlabeled) - set(to_be_labeled))
            results.append(self.run_one(to_be_labeled))

        return results
    
    def run_StreamingAL(self, qStrategy, nsteps, maximum):
        """Run a streaming active-learning loop driven by ``qStrategy``.

        The first round trains on ``maximum / nsteps`` randomly drawn
        records. Every following round runs inference on the records not yet
        selected and adds whatever ``qStrategy`` returns. The loop stops
        early, without training again, as soon as more than ``maximum``
        records have been selected.

        Args:
            qStrategy: Callable ``qStrategy(results)`` that receives the
                output of ``infer`` and returns the indices to add.
            nsteps: Maximum number of training rounds.
            maximum: Upper bound on the number of selected records.

        Returns:
            Tuple ``(results, stepSizes)``: the test accuracy of every
            completed round and the number of records each was trained on.
        """
        # Size of the random seed set; computed from the arguments instead
        # of relying on a name defined outside this method.
        seed_size = int(maximum / nsteps)

        unlabeled = list(range(len(trainset)))

        to_be_labeled = random.sample(unlabeled, seed_size)
        unlabeled = list(set(unlabeled) - set(to_be_labeled))
        results = [self.run_one(to_be_labeled)]
        stepSizes = [len(to_be_labeled)]

        for _ in range(1, nsteps):
            candidates = self.infer(unlabeled)
            to_be_labeled.extend(qStrategy(candidates))
            if len(to_be_labeled) > maximum:
                break
            unlabeled = list(set(unlabeled) - set(to_be_labeled))
            results.append(self.run_one(to_be_labeled))
            stepSizes.append(len(to_be_labeled))

        return results, stepSizes


Documentation:

- ActiveStrategy.infer(sample): runs inference on all records in sample

- ActiveStrategy.run_experiment(nsteps, maximum): designed to experiment on random samples (incremental supervised learning)

- ActiveStrategy.run_ConfidenceAL(qStrategy, nsteps, maximum): runs a querying strategy of the confidence-level type or, more generally, an uncertainty-based one (pooling approach)

- ActiveStrategy.run_StreamingAL(qStrategy, nsteps, maximum): runs a querying strategy with a streaming approach