In [1]:
from os import listdir
from os.path import isfile, join, isdir
from PIL import Image
from scipy.misc import imread, imresize

import operator
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pickle
import numpy as np
from sklearn.utils import shuffle

%matplotlib inline


### Create dataset with patterns

In [64]:
# Mert's patterns: every story pattern is a cyclic window over one of the
# two category groups below, closed by repeating its first category.

# animals
a = ["butterfly", "cougar_body", "elephant", "llama", "panda"]

# vehicles
v = ["airplanes", "car_side", "helicopter", "Motorbikes", "stop_sign"]

# For each group and each start index, take four consecutive categories
# (wrapping around the group of five) and finish with the starting category
# again -- hence the trailing offset 0. Animal patterns come first, then
# vehicle patterns.
patterns = [
    [group[(start + offset) % 5] for offset in (0, 1, 2, 3, 0)]
    for group in (a, v)
    for start in range(5)
]

patterns

[['butterfly', 'cougar_body', 'elephant', 'llama', 'butterfly'],
 ['cougar_body', 'elephant', 'llama', 'panda', 'cougar_body'],
 ['elephant', 'llama', 'panda', 'butterfly', 'elephant'],
 ['llama', 'panda', 'butterfly', 'cougar_body', 'llama'],
 ['panda', 'butterfly', 'cougar_body', 'elephant', 'panda'],
 ['airplanes', 'car_side', 'helicopter', 'Motorbikes', 'airplanes'],
 ['car_side', 'helicopter', 'Motorbikes', 'stop_sign', 'car_side'],
 ['helicopter', 'Motorbikes', 'stop_sign', 'airplanes', 'helicopter'],
 ['Motorbikes', 'stop_sign', 'airplanes', 'car_side', 'Motorbikes'],
 ['stop_sign', 'airplanes', 'car_side', 'helicopter', 'stop_sign']]

In [60]:
path  = 'data/101_ObjectCategories/'

stories_per_pattern = 4000
negatives_ratio = 0.4
# Set to an int to make the sampling and the final shuffle reproducible;
# None keeps the previous (unseeded) behaviour.
random_seed = None

if random_seed is not None:
    # sklearn.utils.shuffle is called without random_state below, so it draws
    # from NumPy's global RNG as well; seeding it here covers every random
    # step of this cell.
    np.random.seed(random_seed)


data = []
data_labels = []
pattern_labels = []

# Map every category directory under `path` to the list of its image files,
# stored as "<category>/<file name>" (relative to `path`). Directory listings
# are sorted so that a fixed seed gives the same dataset regardless of the
# order in which the file system returns entries.
category2images = {
    category: [
        category + "/" + image
        for image in sorted(listdir(join(path, category)))
        if isfile(join(path, category, image))
    ]
    for category in sorted(listdir(path))
    if isdir(join(path, category))
}
all_categories = list(category2images.keys())


def _slot_categories(slot):
    """Return the list of categories accepted by one pattern slot.

    A slot is either a single category name, several alternatives joined
    with "/" (e.g. "llama/panda"), or "*" meaning any known category.
    """
    if slot == "*":
        return all_categories
    return slot.split("/")


#CREATE TRUE EXAMPLES
for pattern_pos, pattern in enumerate(patterns):

    # One row per story, one column per pattern slot; cells hold image paths.
    stories = np.empty((stories_per_pattern, len(pattern)), dtype=object)

    for category_pos, category in enumerate(pattern):
        if category == "*" or "/" in category:
            # Wildcard / multi-choice slot: first draw a category for each
            # story, then draw one image from that category.
            sampled_categories = np.random.choice(
                _slot_categories(category), stories_per_pattern, replace=True
            )
            # Build an ndarray (the previous plain list broke the column
            # assignment below) and store plain `str` objects.
            category_images = np.array(
                [
                    str(np.random.choice(category2images[sampled_category]))
                    for sampled_category in sampled_categories
                ],
                dtype=object,
            )
        else:
            category_images = np.random.choice(
                category2images[category], stories_per_pattern, replace=True
            )

        stories[:, category_pos] = category_images

    data.extend(stories.tolist())
    pattern_labels.extend([pattern_pos] * stories_per_pattern)

data_labels = [1] * len(data)


#CREATE NEGATIVE EXAMPLES
# A negative story keeps its first images but gets a final image drawn from a
# category that the pattern's last slot does not accept.
negatives_count = int(round(len(data) * negatives_ratio))
negative_stories_indexes = np.random.choice(len(data), negatives_count, replace=False)

# The candidate false categories depend only on the pattern, so compute them
# once per pattern instead of once per negative story.
false_categories_by_pattern = {}

for negative_story_idx in negative_stories_indexes:

    story_pattern_pos = pattern_labels[negative_story_idx]
    if story_pattern_pos not in false_categories_by_pattern:
        valid_endings = set(_slot_categories(patterns[story_pattern_pos][-1]))
        false_categories_by_pattern[story_pattern_pos] = [
            candidate for candidate in all_categories if candidate not in valid_endings
        ]
    false_categories = false_categories_by_pattern[story_pattern_pos]

    if not false_categories:
        # E.g. the pattern ends with "*": every category is a valid ending,
        # so no false ending can exist.
        raise ValueError(
            "Cannot build a negative example for pattern %s: "
            "every category is a valid ending" % patterns[story_pattern_pos]
        )

    false_ending_category = np.random.choice(false_categories)
    false_ending = str(np.random.choice(category2images[false_ending_category]))

    data[negative_story_idx][-1] = false_ending
    data_labels[negative_story_idx] = 0


# Shuffle the three parallel lists with the same permutation.
data, data_labels, pattern_labels = shuffle(data, data_labels, pattern_labels)

In [61]:
 # Preview the first ten stories: the pattern each was generated from,
 # its True/False label, and its images in order.
 for story_idx, story in enumerate(data[:10]):
    print("Story #%d" % story_idx)
    print(" Pattern: %s" % patterns[pattern_labels[story_idx]])
    # A label of 0 marks a story whose ending was replaced.
    story_kind = "True" if data_labels[story_idx] else "False"
    print(" %s story" % story_kind)
    for position, image in enumerate(story):
        print(" %d.%s" % (position, image))

    print("-----------------------------")

Story #0
 Pattern: ['stop_sign', 'airplanes', 'car_side', 'helicopter', 'stop_sign']
 False story
 0.stop_sign/image_0063.jpg
 1.airplanes/image_0734.jpg
 2.car_side/image_0099.jpg
 3.helicopter/image_0008.jpg
 4.butterfly/image_0010.jpg
-----------------------------
Story #1
 Pattern: ['Motorbikes', 'stop_sign', 'airplanes', 'car_side', 'Motorbikes']
 False story
 0.Motorbikes/image_0034.jpg
 1.stop_sign/image_0064.jpg
 2.airplanes/image_0395.jpg
 3.car_side/image_0027.jpg
 4.BACKGROUND_Google/image_0167.jpg
-----------------------------
Story #2
 Pattern: ['helicopter', 'Motorbikes', 'stop_sign', 'airplanes', 'helicopter']
 True story
 0.helicopter/image_0027.jpg
 1.Motorbikes/image_0207.jpg
 2.stop_sign/image_0045.jpg
 3.airplanes/image_0530.jpg
 4.helicopter/image_0071.jpg
-----------------------------
Story #3
 Pattern: ['helicopter', 'Motorbikes', 'stop_sign', 'airplanes', 'helicopter']
 True story
 0.helicopter/image_0008.jpg
 1.Motorbikes/image_0277.jpg
 2.stop_sign/image_0044.

In [62]:
training_size = 10000
validation_size = 1000
test_size = 1000

# Split boundaries are derived from the three sizes above (the test slice
# previously ended at a hard-coded 12000 and ignored `test_size`).
validation_end = training_size + validation_size
test_end = validation_end + test_size

# Python slicing would silently hand back short or empty splits, so fail
# loudly if the generated dataset is too small for the requested sizes.
if len(data) < test_end:
    raise ValueError(
        "Dataset has %d stories but the requested splits need %d" % (len(data), test_end)
    )


def _split(sequence):
    """Cut `sequence` into consecutive train / validation / test slices.

    Stories beyond `test_end` are not included in any split.
    """
    return {
        "train": sequence[:training_size],
        "validation": sequence[training_size:validation_end],
        "test": sequence[validation_end:test_end],
    }


# `data`, `data_labels` and `pattern_labels` are parallel lists, so slicing
# them at the same boundaries keeps stories and labels aligned.
split_dataset = {}
split_dataset["data"] = _split(data)
split_dataset["labels"] = _split(data_labels)
split_dataset["pattern_labels"] = _split(pattern_labels)
split_dataset["patterns"] = patterns

with open("data/caltech_stories_patterns", 'wb') as output_file:
    pickle.dump(split_dataset, output_file)