# Prototype to be implemented
## Model details
* **Task:** Image classification, outlier detection
* **Classifier:** Neural network, one-class
* **Feature** Subspace analysis to detect outliers
* **Semi-supervised**

## Rundown of the model
1. Image Loading
2. Pre-training phase
    * Training with trusted training data
    * Supervised learning
3. Training phase
    * Training with untrusted training data
        * Calculating outlier score (Before backward pass)
        * Metrics on the test of the model
    * Query user on results
        * Determine whether data should be cleaned, removed or retained
    * Adjustments to the model
    * User feedback in the loop
    * Supervised by human but missing ground truth label
4. Stream phase (Not included in the prototype)
    * **Main use:** Crowdsourcing sites, streams of information etc.
    * Results are accessible upon user request
    * Extra queries to identify label flipping issues

In [147]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [148]:
# 0. Defining libraries and parameters

# For plotting images
%matplotlib inline
%config InlineBackend.figure_format = "retina"

import matplotlib.pyplot as plt

# For defining torch libraries
import torch
from torch import nn, optim
from torchvision import datasets, transforms, models
from torch.utils.data import Dataset

# For defining custom dataset
from PIL import Image
import os, shutil, sys
import pickle
import copy

# device
device = 'cpu'

# Image size
size = 64

# Helper function: outputs the metric on screen
def print_metrics(epoch, batch, batch_size, loss, accuracy):
    print('======== Metrics ========\n\
[Epoch {}]\n \
Batch {} / {}\n \
Loss = {:.5f}\n \
Accuracy = {:.2f}%'.format(epoch, batch, batch_size, loss, acc))
    print('=========================')

# Helper function: flatten
# Flattens a nested list into a 1-D list
def flatten(nested_list):
    output = []
    for item in nested_list:
        if (type(item) == list):
            nested = flatten(item)
            for item in nested:
                output.append(item)
        else:
            output.append(item)

    return output

def get_accuracy(logits, labels):
    _, pred = torch.max(logits, 1)
    return (labels == pred).sum().item() / labels.shape[0] * 100

###### New Dataset class to be used in 3.1 ######

# Extending datasets.Dataset to create class UntrainedDataset
# Contains 2 functions for resolving label flipping and invalid data
class UntrustedDataset(Dataset):
    def __init__(self, dir, transform):
        self.dir = dir
        self.transform = transform
        self.data = {}
        self.folders = []

        for i, val in enumerate(os.listdir(dir)):
            folder_path = os.path.join(dir, val)
            if not os.path.isfile(folder_path):
                self.folders.append(folder_path)

        i = 0
        for f, folder in enumerate(self.folders):
            for val in os.listdir(folder):
                img_path = os.path.join(folder, val)
                if os.path.isfile(img_path):
                    _, file_extension = os.path.splitext(val)
                    if file_extension in ['.jpg', '.jpeg']:
                        self.data[i] = (img_path, f)
                        i += 1
                        # print(self.data[i])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        img_path, label = self.data[index]
        image = Image.open(img_path)
        image = self.transform(image)

        index = torch.LongTensor([index])
        label = torch.LongTensor([label])
        return index, image, label

    # Flips the label of the image
    # But does not move the image to the correct directory
    def flip_label(self, i, label):
        val = self.data[i][0]
        self.data[i] = (val, label)


    # Remove the invalid data by putting it in a unused directory
    def remove_invalid_data(self, i):
        self.data[i], self.data[len(self.data) - 1] = self.data[len(self.data) - 1], self.data[i]
        self.data.pop(len(self.data) - 1)

print('Libraries and helper functions successfully defined')

Libraries and helper functions successfully defined


In [158]:
# 1.1 Image Loader

# Define the data directory
directory = '/content/drive/MyDrive/Colab Notebooks/'
data_dir = os.path.join(directory, 'Cat_Dog_data/')
model_dir = os.path.join(directory, 'models')
train_data_dir = data_dir + 'train'
test_data_dir = data_dir + 'test'
untrusted_data_dir = data_dir + 'untrusted_train'

# Define the transform to be applied on all photos
transform = transforms.Compose([transforms.Resize(size),
                                transforms.CenterCrop(size),
                                transforms.ToTensor()])

# Load the directory into the ImageFolder and apply the transform
train_dataset = datasets.ImageFolder(train_data_dir, transform)
test_dataset = datasets.ImageFolder(test_data_dir, transform)
untrusted_dataset = UntrustedDataset(untrusted_data_dir, transform) # Note: Uses custom dataset

# Load a batch of 32 images from dataset, the images are randomly shuffled to avoid introduction of strange artefacts
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)
untrusted_dataloader = torch.utils.data.DataLoader(untrusted_dataset, batch_size=32, shuffle=True)

print('Dataloaders successfully initialized')

# print(model_directory)
# # Test: Converting dataloader (generator) to iterator and using the function next to get the images and labels
# images, labels = next(iter(train_dataloader))
# # Display the image and permute the color channels
# for i in range(3):
#   print(labels[i])
#   plt.figure()
#   plt.imshow(images[i].permute(1,2,0))
#   plt.show()

Dataloaders successfully initialized


In [150]:
# 1.2. Construct Model

# model_pretrained = models.resnet18(pretrained=True)

# The following code allows you to build a customized model
class BinaryClassifier(nn.Module):
    def __init__(self, input_size):
        super(BinaryClassifier, self).__init__()

        # you can define each operator with this kind of syntax
        self.input_size = input_size
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=2, padding=1)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1)
        self.flatten = nn.Flatten()
        self.out = nn.Linear(input_size * input_size * 128 // 64, 2)

    # x: <batch-size = N>*3*224*224 images
    # output: logits, N*2 vector, each value ranging from (-inf, inf)
    def forward(self, x):
      
        # One type, define all process one by one
        # this allows branching
        x = self.conv1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.conv3(x)
        x = self.relu(x)
        x = self.flatten(x)
        logits = self.out(x)

        return logits

# create the model
model = BinaryClassifier(input_size = size)

###### Unused junk below ######

# model = nn.Sequential(
#     nn.Conv2d(3, 64, 3, padding=1, stride=2),
#     nn.ReLU(),
#     nn.Conv2d(64, 128, 3, padding=1, stride=2),
#     nn.ReLU(),
#     nn.Flatten(),
#     nn.Linear(32768, 1000),
#     nn.ReLU(),
#     nn.Linear(1000,2),
#     nn.LogSoftmax(1)
# )

# model = model_pretrained

print('Model object successfully created')

Model object successfully created


In [151]:
# 1.3. Define hyperparameters
# Before training, we want to define the hyperparameters first

# learning rate, if you use Adam optimizer, it is 0.01 by default
learn_rate = 0.001
# optimizer
optimizer = optim.Adam(model.parameters(), lr=learn_rate)

print('Hyperparameters successfully initialized')

Hyperparameters successfully initialized


In [None]:
# 2.1. Basic training loop for pretraining model

# Pass the model to GPU
# if using CPU, this line has no effect
model = model.to(device)

# Print the training results every 2 batches
print_every = 2
# Save the state_dict into model.pth every 2 epochs
save_every = 2

# loss function, for CLASSIFICATION tasks, use cross-entropy loss
# if the input is logits, in range (-inf, inf), use this one
loss_func = nn.CrossEntropyLoss()

# number of times to read through the entire dataset
num_epoch = 10

for epoch in range(num_epoch):   

    # this for loop iterates through the whole dataset
    for i, (images, labels) in enumerate(train_dataloader):

        # plt.figure()
        # plt.imshow(images[0].permute(1,2,0))
        # plt.show()

        # if using CPU, these two lines have no effect
        images = images.to(device)
        labels = labels.to(device)

        # prepare optimizer to perform model update
        # MUST BE CALLED BEFORE step()
        optimizer.zero_grad()

        # forward pass: compute the logits (prediction)
        logits = model(images)

        # print(logits)
        # print(labels)

        # calculate loss, input should be (predicted, actual)
        loss = loss_func(logits, labels)

        # backward pass: compute gradient and store in the model
        loss.backward()

        # perform model update
        optimizer.step()

        # log the progress
        if i % print_every == 0:
            _, pred = torch.max(logits, 1)
            acc = (labels == pred).sum().item() / images.shape[0] * 100
            print_metrics(epoch + 1, i + 1, len(train_dataloader), loss.item(), acc)

    # to save the model periodically, read for torch.save()
    if epoch % save_every == 1:
        torch.save(model.state_dict(), os.path.join(model_dir, 'BinClassifier{}.pth'.format(epoch + 1)))

print('Model is successfully trained')

In [155]:
# 3.1. Define new methods to be used in next training phase

# Determine whether the data is an outlier
# Input: logits, labels
# Output: problematic(set)
# If the datum has a L1 loss of larger than 0.75
# OR has a one-hot score > 0
# return the index of the datum to a set 'problematic'
def outlier_detector(logits, labels):
    
    threshold = 1.2

    # L1 loss > 0.75
    prob = nn.functional.softmax(logits)
    labels = torch.nn.functional.one_hot(labels, num_classes=2)
    L1_loss = nn.functional.l1_loss(prob, labels, reduction='none')
    L1_loss = torch.sum(L1_loss, dim=-1)
    problematic = set(flatten((L1_loss > threshold).squeeze().nonzero().tolist()))

    # Product of one-hot score > 0
    std, mean = torch.std_mean(logits)
    score = (logits - mean) / std
    indices = score[:, 0] * score[:, 1]
    # print(indices > 0)
    # print(type(problematic))
    problematic.update(flatten((indices > 0).squeeze().nonzero().tolist()))

    return problematic

# outputting the results and enabling the user to input
def feedback_problem_data(problem_idx, images, index, logits, labels):

    problem_data = nn.functional.softmax(logits)[list(problem_idx)]
    invalid_count = []

    for count, i in enumerate(problem_idx):

        output_image = images[i].to(device)

        # Show image
        print('Image {} / {}'.format(count+1, len(problem_idx)))
        plt.figure()
        plt.imshow(output_image.permute(1,2,0))
        plt.show()

        # Prompt the user to feedback
        print('The model thinks that it is {0:.2f}% likely to be a cat and it is {1:.2f}% likely to be a dog.'.format(problem_data[count][0].item() * 100, problem_data[count][1].item() * 100))
        print('1. Cat\t 2. Dog\t 3. Neither')

        feedback = 0
        while True:
            try:
                feedback = int(input('Please tell us what the image shows (1-3): '))
                if (feedback >= 1 and feedback <= 3):
                    break
            except KeyboardInterrupt:
                sys.exit('KeyboardInterrupt, stop running!')
            except:
                print('Invalid input, please try again.')


        # If it is either a cat or a dog, feedback the ground truth label to the model
        if (feedback <= 2):
            feedback -= 1
            gt_label = torch.tensor(feedback)
            if labels[i].item() != gt_label.item():
                  untrusted_dataset.flip_label(index[i].item(), gt_label)

        # If it is neither, then it is an invalid data, remove it from training!
        if (feedback == 3):
            untrusted_dataset.remove_invalid_data(index[i].item())
            invalid_count.append(i)

    problem_idx = problem_idx.difference(invalid_count)

    return problem_idx

print('Helper functions are successfully defined')

Helper functions are successfully defined


In [159]:
# 3.2. Training under human supervision
# Some of the data here is unclean, and there is no ground-truth label provided
# However, the human serves as an input to provide feedback for the model

# Loading the trained model from last phase
model_path = os.path.join(model_dir, 'BinClassifier8.pth')
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.to(device)

# Define loss function
loss_func = nn.CrossEntropyLoss(reduction='none')

# Saving the model every 2 epochs
save_every = 1
num_epoch = 10
train_epoch = 0
print_every = 10
feedback_every = 8

for epoch in range(num_epoch):

    # Redefine the dataloader after removing photos
    # Utilize the deep copy of the dataset to load images
    temp = copy.deepcopy(untrusted_dataset)
    untrusted_dataloader = torch.utils.data.DataLoader(temp, batch_size=32, shuffle=True)

    # this for loop iterates through the whole dataset
    for i, (index, images, labels) in enumerate(untrusted_dataloader):

        batch_size = images.shape[0]
        index = index.squeeze()
        labels = labels.squeeze()

        # if using CPU, these two lines have no effect
        images = images.to(device)
        labels = labels.to(device)

        # prepare optimizer to perform model update
        # MUST BE CALLED BEFORE step()
        optimizer.zero_grad()

        # forward pass: compute the logits (prediction)
        logits = model(images)

        if epoch < train_epoch and i % feedback_every == 0:
            # Check for any outliers right here using outlier_detector(logits, labels)
            problem_idx = outlier_detector(logits, labels)
            # print(problem_idx)
        else:
            problem_idx = set()

        # calculate loss, input should be (predicted, actual)
        loss = loss_func(logits, labels) # 32 * 1 tensor

        # Set loss of prob_idx to 0
        loss[list(problem_idx)] = torch.zeros(1).to(device)

        # Divide by the num of clean data, then sum then up
        loss = loss / (images.shape[0] - len(problem_idx))
        loss = torch.sum(loss)

        # backward pass: compute gradient and store in the model
        loss.backward()

        # perform model update
        optimizer.step()

        if epoch < train_epoch and i % feedback_every == 0:

            if len(problem_idx) > 0:

                # Log the progress
                # created Boolean mask to filter the problematic data
                problem_idx_bool_tensor = torch.ones(batch_size, dtype=torch.bool)
                problem_idx_bool_tensor[list(problem_idx)] = False

                trimmed_logits = torch.Tensor(batch_size - len(problem_idx), 2).to(device)
                trimmed_logits[:,0] = torch.masked_select(logits[:,0], problem_idx_bool_tensor)
                trimmed_logits[:,1] = torch.masked_select(logits[:,1], problem_idx_bool_tensor)

                # print(trimmed_logits)
                _, pred = torch.max(trimmed_logits, 1)
                # print(pred)
                # print(torch.masked_select(labels, problem_idx_bool_tensor))
                batch_correct = (torch.masked_select(labels, problem_idx_bool_tensor) == pred).sum().item()
                batch_num = images.shape[0] - len(problem_idx)
                batch_acc = batch_correct / batch_num * 100
                print_metrics(epoch + 1, i + 1, len(untrusted_dataloader), loss.item(), batch_acc)

                # Output the data to the human user
                problem_idx = feedback_problem_data(problem_idx, images, index, logits, labels)

            else:
                print('This batch contains no problematic data :)')
                entire_batch_acc = get_accuracy(logits, labels)
                acc = (entire_batch_acc + acc * i) / (i + 1)
                print_metrics(epoch + 1, i + 1, len(untrusted_dataloader), loss.item(), acc)
                continue

            
            if len(problem_idx) > 0:
                # train the model with the fixed labels (if the data is valid)
                fixed_images = images[list(problem_idx)]
                fixed_labels = labels[list(problem_idx)]

                fixed_logits = model(fixed_images)

                loss = loss_func(fixed_logits, fixed_labels)
                loss = torch.mean(loss)

                loss.backward()

                optimizer.step()

                # Calculate the accuracy
                _, fixed_pred = torch.max(fixed_logits, 1)
                fixed_correct = (fixed_labels == fixed_pred).sum().item()
                # batch_acc = fixed_correct / fixed_images.shape[0] * 100
                
                entire_batch_acc = (batch_correct + fixed_correct) / (batch_num + fixed_images.shape[0]) * 100
                acc = (entire_batch_acc + acc * i) / (i + 1)
                
                print('====== After feedback ======')
                print_metrics(epoch + 1, i + 1, len(untrusted_dataloader), loss.item(), acc)

            else:
                entire_batch_acc = batch_acc
                acc = (entire_batch_acc + acc * i) / (i + 1)
                print('This batch has no more data to train!')
                print_metrics(epoch + 1, i + 1, len(untrusted_dataloader), loss.item(), acc)

        else:
            entire_batch_acc = get_accuracy(logits, labels)
            acc = (entire_batch_acc + acc * i) / (i + 1)
            if i % print_every == 0:
                print_metrics(epoch + 1, i + 1, len(untrusted_dataloader), loss.item(), acc)


    # to save the model periodically, read for torch.save()
    if epoch % save_every == 0:
        torch.save(model.state_dict(), os.path.join(model_dir, 'Control_BinClassifier{}.pth'.format(epoch + 1)))
        if train_epoch > 0:
            with open(os.path.join(directory, 'CleanedDataset3'),'wb') as filehandler:
                pickle.dump(untrusted_dataset, filehandler)
            
print('Model is successfully trained')

[Epoch 1]
 Batch 1 / 461
 Loss = 0.96505
 Accuracy = 56.25%
[Epoch 1]
 Batch 11 / 461
 Loss = 0.81899
 Accuracy = 67.05%
[Epoch 1]
 Batch 21 / 461
 Loss = 0.61421
 Accuracy = 68.60%
[Epoch 1]
 Batch 31 / 461
 Loss = 0.65720
 Accuracy = 67.44%
[Epoch 1]
 Batch 41 / 461
 Loss = 0.84204
 Accuracy = 67.23%
[Epoch 1]
 Batch 51 / 461
 Loss = 0.66352
 Accuracy = 67.89%
[Epoch 1]
 Batch 61 / 461
 Loss = 0.56078
 Accuracy = 67.88%
[Epoch 1]
 Batch 71 / 461
 Loss = 0.59619
 Accuracy = 67.91%
[Epoch 1]
 Batch 81 / 461
 Loss = 0.60725
 Accuracy = 67.94%
[Epoch 1]
 Batch 91 / 461
 Loss = 0.59745
 Accuracy = 68.17%
[Epoch 1]
 Batch 101 / 461
 Loss = 0.48832
 Accuracy = 68.35%
[Epoch 1]
 Batch 111 / 461
 Loss = 0.58184
 Accuracy = 68.30%
[Epoch 1]
 Batch 121 / 461
 Loss = 0.55484
 Accuracy = 68.49%
[Epoch 1]
 Batch 131 / 461
 Loss = 0.59610
 Accuracy = 68.46%
[Epoch 1]
 Batch 141 / 461
 Loss = 0.55963
 Accuracy = 68.68%
[Epoch 1]
 Batch 151 / 461
 Loss = 0.55844
 Accuracy = 68.89%
[Epoch 1]
 Batch 16

In [142]:
# 3.5 Loading the cleaned dataset into the model training
with open(os.path.join(directory, 'CleanedDataset3'),'rb') as filehandler:
    untrusted_dataset = pickle.load(filehandler)

# untrusted_dataloader = torch.utils.data.DataLoader(untrusted_dataset, batch_size=32, shuffle=True)

# i = 0
# for index, images, labels in untrusted_dataloader:
#     print(str(i),'\t\t',images.shape)
#     print('\t\t',labels.shape)
#     print('\t\t',index.shape)
#     i += 1

print('Dataset loaded successfully')

Dataset loaded successfully


In [162]:
# 4. Testing phase

for j in range(10):

    # Load the model to be tested
    model_path = os.path.join(model_dir, 'Control_BinClassifier{}.pth'.format(j+1)) # Insert path of .pth file here
    model.load_state_dict(torch.load(model_path, map_location=device), )
    model = model.to(device)
    
    # Define loss function
    loss_func = nn.CrossEntropyLoss()
    acc = 0

    print('Now testing {}...'.format(model_path))

    for i, (images, labels) in enumerate(test_dataloader):
        
        images = images.to(device)
        labels = labels.to(device)

        logits = model(images)

        loss = loss_func(logits, labels)

        _, pred = torch.max(logits, 1)
        batch_acc = (labels == pred).sum().item() / images.shape[0] * 100
        acc = (batch_acc + acc * i) / (i + 1)
        
        if i == len(test_dataloader) - 1:
            print_metrics(1, i + 1, len(test_dataloader), loss.item(), acc)

print('Models are successfully tested')

Now testing /content/drive/MyDrive/Colab Notebooks/models/Control_BinClassifier1.pth...
[Epoch 1]
 Batch 79 / 79
 Loss = 0.54115
 Accuracy = 72.43%
Now testing /content/drive/MyDrive/Colab Notebooks/models/Control_BinClassifier2.pth...
[Epoch 1]
 Batch 79 / 79
 Loss = 1.16385
 Accuracy = 71.99%
Now testing /content/drive/MyDrive/Colab Notebooks/models/Control_BinClassifier3.pth...
[Epoch 1]
 Batch 79 / 79
 Loss = 0.44612
 Accuracy = 75.71%
Now testing /content/drive/MyDrive/Colab Notebooks/models/Control_BinClassifier4.pth...
[Epoch 1]
 Batch 79 / 79
 Loss = 0.24130
 Accuracy = 74.96%
Now testing /content/drive/MyDrive/Colab Notebooks/models/Control_BinClassifier5.pth...
[Epoch 1]
 Batch 79 / 79
 Loss = 0.43748
 Accuracy = 75.67%
Now testing /content/drive/MyDrive/Colab Notebooks/models/Control_BinClassifier6.pth...
[Epoch 1]
 Batch 79 / 79
 Loss = 0.34509
 Accuracy = 75.67%
Now testing /content/drive/MyDrive/Colab Notebooks/models/Control_BinClassifier7.pth...
[Epoch 1]
 Batch 79 / 79