In [None]:
%matplotlib inline

### Image Quality Assessment
-----------------------------------
#### Steps:
**1:** Read in all the images from the server.

**2:** Break the images into smaller categories. This can be accomplished via a K-Means classifier or possibly by running the images through a different CNN beforehand. These categories are then stored and used later. 

**3:** Determine labels based on the metadata stored in the pictures. For many of the 2017 pictures there is EXIF, IPTC, and XMP metadata. The differences are as follows:

* EXIF - Stored by the camera at the time of the picture. This includes GPS location, pixel count, zoom, etc.
* IPTC - Metadata added via an editor. This is generally an older bulk metadata system. While still used, it is being phased out in favor of XMP.
* XMP - Metadata that can be applied as an "xml"-like file or added in headers and footers of images. It is more clearly defined than IPTC and significantly newer. It also allows for more metadata to be stored. 

Both IPTC and XMP contain information we can use as labels. Preference tags (code 221) record which pictures had already been marked as preferred before our work began. In addition, the Reynolds Journalism School has informed us that they have color-classed about half of the images via PhotoMechanic, and that classification is stored in the IPTC and XMP data. These color classes correspond to a quality assessment on a rough scale of 0-9. Once labels have been determined, we can begin training.

**4:** Import VGG-16. This is a very successful pretrained classification model that has shown promise in previous studies attempting techniques similar to ours, most notably NIMA (Neural Image Assessment), published by researchers at Google in 2017.

**5:** Once the pretrained model has been imported we can pull the fully connected component off the top and replace it with our own. Initially we will replace it with one total classifier to determine quality but after proof of concept has been established we can supply two different classifiers (1 for technical assessment and 1 for aesthetics). These classifiers shall be constant for each category of picture while just changing the loaded weights for the system. This allows us to adjust our idea of what quality is based on the category it is in. 

**6:** After the models have provided a technical and aesthetic score we can use the Choquet Integral to aggregate the data together and get a measurement of uncertainty. This shall provide us with a rating and a range both of which can be used to rank photos in a set. 

In [None]:
#standard ML/Image Processing imports
import numpy as np
import math, pandas
import matplotlib.image as mpimg

from PIL import Image

#pytorch imports
import torch
import torchvision.models as models

from torch import nn
from torch import optim
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets, transforms


# Root directory that is scanned (recursively, one sub-folder per class) for images.
home = "../../../../../mnt/md0/mysql-dump-economists/Archives"  # narrower roots used before: /Fall, /Dump

# VGG-16 with pretrained weights; downloaded by torchvision on first use.
vgg16 = models.vgg16(pretrained=True)

# Per-channel normalization statistics passed to transforms.Normalize below
# (the values torchvision documents for its pretrained models).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline: shrink the short side to 256, take the central
# 224x224 crop, convert to a tensor, then normalize each channel.
_transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)


At this point, we have created all the global variables needed for the classifier and imported all necessary libraries. Next we need to determine the images under test. Once we have access to GPUs we can scale this up. The class below extends ImageFolder so that, while iterating over all image files inside a home directory, each sample also carries its file path.

In [None]:
class ImageFolderWithPaths(datasets.ImageFolder):
    """ImageFolder variant whose samples also carry the image's file path.

    Every item is exactly what ``torchvision.datasets.ImageFolder`` returns
    for that index, with the file path appended as one extra trailing element.
    """

    def __getitem__(self, index):
        """Return the parent's sample tuple for *index* plus the file path."""
        # Let ImageFolder do the loading and transforming.
        sample = super().__getitem__(index)
        # The first field of each ``imgs`` entry is the path on disk.
        file_path = self.imgs[index][0]
        return (*sample, file_path)

We need to split the data into training and testing sets. This is accomplished using the function below. The number of pictures used in the training/testing sets is meant to be set via the `limit_num_pictures` setting in this function. Set it to `None` if you don't want a limit. This will be accomplished once we have access to GPUs. Note that the code currently applies the limit only to the optional size-bounds scan, not to the split itself.

In [None]:
def _scan_size_bounds(dataset, limit=None):
    """Print the min/max height and width of the image tensors in *dataset*.

    Args:
        dataset: Indexable dataset whose items have the image tensor first.
        limit: Maximum number of pictures to inspect; ``None`` (or 0) scans
            the whole dataset.
    """
    total = len(dataset)
    if limit:
        total = min(total, limit)

    heights, widths = [], []
    for i in range(total):
        # Only the load itself is guarded: a single unreadable picture should
        # stop the scan with a message instead of crashing the whole run.
        try:
            pic = dataset[i]
        except Exception as e:
            print(e)
            print("error occurred on pic number {}".format(i))
            break
        shape = pic[0].size()
        print(shape)
        # ToTensor yields (C, H, W): dim 1 is the height, dim 2 the width.
        heights.append(shape[1])
        widths.append(shape[2])

    if not heights:
        print("No pictures were scanned for size bounds")
        return
    print("Max/min width: {} {}".format(max(widths), min(widths)))
    print("Max/min height: {} {}".format(max(heights), min(heights)))


def load_split_train_test(datadir, valid_size=.2, find_size_bounds=False,
                          limit_num_pictures=1000, shuffle=True):
    """Build training and testing data loaders over the images in *datadir*.

    Args:
        datadir: Root folder handed to ``ImageFolderWithPaths``.
        valid_size: Fraction (0..1) of the pictures placed in the test set.
        find_size_bounds: When True, first scan the pictures and print the
            min/max tensor dimensions found.
        limit_num_pictures: Cap on the number of pictures inspected by the
            size scan; ``None`` means no limit. It does not affect the split.
        shuffle: Shuffle the indices before splitting. ImageFolder lists its
            samples class by class, so an unshuffled split would put only the
            first folders' pictures in the test set.

    Returns:
        ``(trainloader, testloader)``, both with ``batch_size=1``; each batch
        is the ``(image, label, path)`` triple from ``ImageFolderWithPaths``.

    Raises:
        ValueError: If *valid_size* is outside the range [0, 1].
    """
    if not 0 <= valid_size <= 1:
        raise ValueError("valid_size must be between 0 and 1, got {}".format(valid_size))

    # load data and apply the transforms on contained pictures
    train_data = ImageFolderWithPaths(datadir, transform=_transform)
    test_data = ImageFolderWithPaths(datadir, transform=_transform)

    if find_size_bounds:
        _scan_size_bounds(train_data, limit_num_pictures)

    num_pictures = len(train_data)
    print("Number of pictures in subdirectories: {}".format(num_pictures))

    indices = list(range(num_pictures))
    print("Head of indices: {}".format(indices[:10]))

    split = int(np.floor(valid_size * num_pictures))
    print("Split index: {}".format(split))

    # The samplers below only randomize the order *within* each subset, so the
    # membership of the two subsets has to be randomized here.
    if shuffle:
        np.random.shuffle(indices)
        print("Head of shuffled indices: {}".format(indices[:10]))

    train_idx, test_idx = indices[split:], indices[:split]
    print("Size of training set: {}, size of test set: {}".format(len(train_idx), len(test_idx)))

    # Samplers that draw elements randomly without replacement
    train_sampler = SubsetRandomSampler(train_idx)
    test_sampler = SubsetRandomSampler(test_idx)

    # Data loaders handle batching and sampling of the two subsets
    trainloader = torch.utils.data.DataLoader(
        train_data, sampler=train_sampler, batch_size=1)
    testloader = torch.utils.data.DataLoader(
        test_data, sampler=test_sampler, batch_size=1)
    return trainloader, testloader

Now that we have all the dataset loader helper functions, we can begin processing the files. The function run_k_means_files() prepares the data for K-Means processing; otherwise we need to run it against the VGG16 model with run_vgg().

In [None]:
def run_k_means_files(limit=150, trainloader=None):
    """Flatten up to *limit* training images to 2-D and pass them to ``add_to_list_file``.

    Args:
        limit: Maximum number of batches to process; must be positive.
        trainloader: Iterable of ``(inputs, labels, paths)`` batches. When
            omitted, the training loader from ``load_split_train_test(home)``
            is used.

    Raises:
        ValueError: If *limit* is not positive.
    """
    if limit <= 0:
        raise ValueError("limit must be positive, got {}".format(limit))
    if trainloader is None:
        # The loader is not a module-level global, so build it here.
        trainloader, _ = load_split_train_test(home)

    count = 0
    print("Percent done: {}%".format(count/limit*100))
    for inputs, _labels, paths in trainloader:
        print('\n{}'.format(paths))
        print(inputs.size())
        # Keep every other entry along the first two dimensions.
        mat4d = inputs[0::2, 0::2, :, :]
        # reshape() honours the strides of the slice; the in-place resize_()
        # reinterprets the underlying storage and would pick up the wrong
        # elements for a strided view.
        mat2d = mat4d.reshape(-1, mat4d.shape[3])

        add_to_list_file(paths[0], mat2d)
        count = count + 1
        print("Percent done: {}%".format(count/limit*100))
        if count >= limit:
            break

In [None]:
def run_vgg(max_batches=None):
    """Run the training images through VGG-16 and print each batch's output.

    Args:
        max_batches: Optional cap on the number of batches evaluated;
            ``None`` processes the whole training loader.
    """
    training, _testing = load_split_train_test(home, .2)
    vgg16.eval()
    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        # The model takes image tensors, not the loader object, so feed it
        # one batch at a time.
        for batch_idx, (inputs, _labels, _paths) in enumerate(training):
            if max_batches is not None and batch_idx >= max_batches:
                break
            output = vgg16(inputs)
            print(output)

The final cell runs the selected pipeline. Comment out the one you don't want to run.

In [None]:
# Entry point of the notebook: exactly one of the two calls below should be
# active. run_vgg() is the VGG-16 pass; run_k_means_files() is the K-Means
# data preparation.
run_vgg()
# run_k_means_files()