### Classifying Images into Subgroups

In this file we shall attempt to classify images into subgroups. This is intended to help rank the images more effectively against other similar images. This would be added to the front of any ranking pipeline. Because of the labels provided we are only able to train with the AVA dataset.

In [1]:
'''
SCRIPT IMPORTS
'''
#standard ML/Image Processing imports
import numpy as np
import pandas as pd
import math, pandas
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

from PIL import Image

#pytorch imports
import torch
import torch.optim as optim
import torchvision.models as models

from torch import nn
from torch import optim
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets, transforms

# no one likes irrelevant warnings
import warnings  
warnings.filterwarnings('ignore')

# we load the pretrained model, the argument pretrained=True implies to load the ImageNet weights for the pre-trained model
vgg16 = models.vgg16(pretrained=True)

# root directory where the images are stored
data_dir = "/mnt/md0/reynolds/ava-dataset/"
label_file = "/mnt/md0/reynolds/ava-dataset/AVA_dataset/AVA.txt"
tags_file = "/mnt/md0/reynolds/ava-dataset/AVA_dataset/tags.txt"
limit_lines = 1000000

In [2]:
tag_mapping = {}
f = open(tags_file, "r")
for i, line in enumerate(f):
    if i >= limit_lines:
        break
    line_array = line.split()
    tag_mapping[int(line_array[0])] = line_array[1]
tag_mapping[0] = "Miscellaneous"
print(tag_mapping)

{1: 'Abstract', 24: 'Action', 31: 'Advertisement', 66: 'Analog', 19: 'Animals', 20: 'Architecture', 43: 'Astrophotography', 57: 'Birds', 21: 'Black', 51: 'Blur', 64: 'Camera', 16: 'Candid', 50: 'Children', 2: 'Cityscape', 34: 'Digital', 37: 'Diptych', 49: 'DPChallenge', 12: 'Emotive', 4: 'Family', 3: 'Fashion', 63: 'Fish', 38: 'Floral', 40: 'Food', 53: 'High', 45: 'History', 58: 'Horror', 5: 'Humorous', 46: 'Infrared', 65: 'Insects,', 6: 'Interior', 14: 'Landscape', 62: 'Lensbaby', 22: 'Macro', 56: 'Maternity', 44: 'Military', 59: 'Music', 15: 'Nature', 26: 'Nude', 55: 'Overlays', 33: 'Panoramic', 13: 'Performance', 32: 'Persuasive', 52: 'Photo-Impressionism', 25: 'Photojournalism', 60: 'Pinhole/Zone', 30: 'Political', 17: 'Portraiture', 27: 'Rural', 41: 'Science', 35: 'Seascapes', 47: 'Self', 7: 'Sky', 8: 'Snapshot', 9: 'Sports', 18: 'Still', 61: 'Street', 29: 'Studio', 54: 'Texture', 48: 'Textures', 36: 'Traditional', 39: 'Transportation', 23: 'Travel', 10: 'Urban', 11: 'Vintage', 28

In [3]:
pic_label_dict = {}
f = open(label_file, "r")
for i, line in enumerate(f):
    if i >= limit_lines:
        break
    line_array = line.split()
#     print(line_array)
    picture_name = line_array[1]
    # print(picture_name)
    classifications = (line_array[12:])[:-1]
#     print(classifications)
    for i in range(0, len(classifications)): 
#         classifications[i] = tag_mapping[int(classifications[i])]
        classifications[i] = int(classifications[i])
    # print(max(classifications))
    pic_label_dict[picture_name] = classifications
print(pic_label_dict)

{'953619': [1, 22], '953958': [1, 21], '954184': [0, 0], '954113': [15, 21], '953980': [22, 38], '954175': [15, 65], '953349': [16, 21], '953645': [0, 0], '953897': [7, 14], '953841': [14, 53], '953417': [22, 0], '953777': [20, 53], '953756': [0, 0], '954195': [0, 0], '953903': [21, 28], '954222': [9, 24], '953889': [0, 0], '953844': [15, 19], '954104': [0, 0], '954229': [0, 0], '953550': [14, 0], '953726': [0, 0], '954228': [0, 0], '953750': [15, 19], '954181': [15, 57], '954208': [10, 62], '953810': [21, 57], '954187': [38, 0], '953621': [7, 16], '953348': [0, 0], '953283': [0, 0], '953092': [27, 50], '953751': [15, 53], '954112': [10, 15], '954117': [0, 0], '954116': [20, 64], '953821': [21, 50], '954105': [21, 0], '953019': [12, 47], '953780': [0, 0], '954186': [65, 0], '954119': [0, 0], '954063': [19, 0], '954121': [20, 35], '954066': [1, 38], '954125': [14, 15], '954067': [0, 0], '953582': [0, 0], '954013': [2, 14], '954069': [2, 12], '954071': [17, 0], '954225': [20, 0], '954124

In [4]:
class ImageFolderWithPaths(datasets.ImageFolder):
    """Custom dataset that includes image file paths. Extends
    torchvision.datasets.ImageFolder
    """

    # override the __getitem__ method. this is the method that dataloader calls
    def __getitem__(self, index):
        # this is what ImageFolder normally returns 
        original_tuple = super(ImageFolderWithPaths, self).__getitem__(index)
        # the image file path
        path = self.imgs[index][0]
        # make a new tuple that includes original and the path
        tuple_with_path = (original_tuple + (path,))
#         print(tuple_with_path)
        return tuple_with_path

In [5]:
_transform = transforms.Compose([transforms.ToTensor()])

data = ImageFolderWithPaths(data_dir, transform=_transform)

data_loader = torch.utils.data.DataLoader(data)#, num_workers=4)

limit_num_pictures = 1000000

# Define our data transforms to get all our images the same size
_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

valid_size = 0.2 # percentage of data to use for test set

In [6]:
test_data = data
train_data = data
num_pictures = len(train_data)

# Shuffle pictures and split training set
indices = list(range(num_pictures))

split = int(np.floor(valid_size * num_pictures))
train_idx, test_idx = indices[split:], indices[:split]#rated_indices, bad_indices
#print("Size of training set: {}, size of test set: {}".format(len(train_idx), len(test_idx)))

# Define samplers that sample elements randomly without replacement
train_sampler = SubsetRandomSampler(train_idx)
test_sampler = SubsetRandomSampler(test_idx)

# Define data loaders, which allow batching and shuffling the data
train_loader = torch.utils.data.DataLoader(train_data,
               sampler=train_sampler, batch_size=1)#, num_workers=4)
test_loader = torch.utils.data.DataLoader(test_data,
               sampler=test_sampler, batch_size=1)#, num_workers=4)

# check GPU availability
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# we load the pretrained model, the argument pretrained=True implies to load the ImageNet 
#     weights for the pre-trained model
vgg16 = models.vgg16(pretrained=True)
vgg16.to(device) # loads the model onto the device (CPU or GPU)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [7]:
for param in vgg16.parameters():
    param.requires_grad = False #freeze all convolution weights
network = list(vgg16.classifier.children())[:-1] #remove fully connected layer
network.extend([nn.Linear(4096, 67)]) #add new layer of 4096->100 (rating scale with 1 decimal - similar to 1 hot encoding)
vgg16.classifier = nn.Sequential(*network)

criterion = nn.CrossEntropyLoss() # loss function
optimizer = optim.SGD(vgg16.parameters(), lr=0.4, momentum=0.9) # optimizer

In [10]:
vgg16.train() # set model to training model
num_epochs = 3 
training_loss = 0
training_accuracy = 0
for epoch in range(num_epochs):
    running_loss = 0.0
    num_correct = 0
    for i, data in enumerate(train_loader,0):
        #print(i)
        for j in range(2): #done so we can utilize both labels
            if limit_num_pictures:
                if i > limit_num_pictures:
                    break
            inputs, _, path = data
            path = path[0]
            path_array = path.split('/')
            pic_name = path_array[-1]
    #         print(pic_name)
    #         print(pic_label_dict[pic_name.split('.')[0]])
    #         label = torch.LongTensor(pic_label_dict[pic_name.split('.')[0]])
            label = pic_label_dict[pic_name.split('.')[0]][j]
#             print(tag_mapping[label])
            label = torch.LongTensor([label])
#             print('inputs shape is: {}'.format(inputs.shape))
#             print('label shape is: {}'.format(label.shape))
#             print('label is : {}'.format(label))
            optimizer.zero_grad()
            output = vgg16(inputs)
    #         print('output shape is: {}'.format(output.shape))
    #         print(output, label)
            loss = criterion(output, label)
            running_loss += loss.item()
            _, preds = torch.max(output.data, 1)
            num_correct += (preds == pic_label_dict[pic_name.split('.')[0]][j]).sum().item()
            loss.backward()
            optimizer.step()
    training_loss = running_loss/len(train_loader.dataset)
    training_accuracy = 100 * num_correct/len(train_loader.dataset)

Urban
inputs shape is: torch.Size([1, 3, 360, 540])
label shape is: torch.Size([1])
label is : tensor([10])
Architecture
inputs shape is: torch.Size([1, 3, 360, 540])
label shape is: torch.Size([1])
label is : tensor([20])
Miscellaneous
inputs shape is: torch.Size([1, 3, 640, 464])
label shape is: torch.Size([1])
label is : tensor([0])
Miscellaneous
inputs shape is: torch.Size([1, 3, 640, 464])
label shape is: torch.Size([1])
label is : tensor([0])
Landscape
inputs shape is: torch.Size([1, 3, 445, 640])
label shape is: torch.Size([1])
label is : tensor([14])
Candid
inputs shape is: torch.Size([1, 3, 445, 640])
label shape is: torch.Size([1])
label is : tensor([16])
Interior
inputs shape is: torch.Size([1, 3, 427, 640])
label shape is: torch.Size([1])
label is : tensor([6])
Science
inputs shape is: torch.Size([1, 3, 427, 640])
label shape is: torch.Size([1])
label is : tensor([41])
Portraiture
inputs shape is: torch.Size([1, 3, 600, 463])
label shape is: torch.Size([1])
label is : tenso

KeyboardInterrupt: 

In [None]:
torch.save(vgg16.state_dict(), 'models/CLASSIFICATION_Feb3_All_AVA_only_training.pt')