Unrestricted Image Description Dataset
=====================

In [33]:
import os
import sys
import subprocess
import csv
import re
import collections

# Directory that holds the raw input files (note the trailing slash).
rawDir = 'rawData/'
# Path prefix of the newline-normalised copies; a running index and '.csv'
# are appended to it when the copies are written.
tmpRawFilePrefix = rawDir + 'raw'

In [28]:
# Transforming raw files from Mac OS style (CR) to *nix style (LF) line endings.
# Each normalised copy is written to tmpRawFilePrefix + <index> + '.csv' and its
# path is collected, in order, in tmpRawFiles.
#
# The copies are written inside rawDir, so anything that looks like the output
# of an earlier run (tmpRawFilePrefix + digits + '.csv') is skipped here;
# otherwise re-running this cell would feed those copies back in as raw input
# and every record would be collected more than once.
# NOTE(review): a genuine raw file named like 'raw<digits>.csv' would be
# skipped as well -- confirm no input uses that naming.
tmpRawPattern = re.compile('^' + re.escape(tmpRawFilePrefix) + r'\d+\.csv$')
allRawFiles = sorted(
    path
    for path in (os.path.join(rawDir, name) for name in os.listdir(rawDir))
    if os.path.isfile(path) and tmpRawPattern.match(path) is None
)
tmpRawFiles = []
for i, f in enumerate(allRawFiles):
    tmpRaw = tmpRawFilePrefix + str(i) + '.csv'
    # Binary mode: only the line-ending bytes are rewritten, nothing is decoded.
    with open(f, 'rb') as fin:
        rawBytes = fin.read()
    # Collapse CRLF first so it becomes one line break instead of two; then
    # convert the remaining lone CRs. Done in Python rather than through the
    # external `tr` tool so the cell does not depend on the host platform.
    normalizedBytes = rawBytes.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    with open(tmpRaw, 'wb') as fout:
        fout.write(normalizedBytes)
    tmpRawFiles.append(tmpRaw)

In [29]:
# Collecting raw inputs into two tables.
#
# Every file in tmpRawFiles is read as CSV. The first row of each file is its
# header and must be identical across files. In every following row:
#   * 'Input.image_<n>' columns hold the URL of presented image <n>,
#   * 'Answer.category_name<k>' columns hold a free-text description,
#   * 'Answer.category<k>' columns hold 'same' or 'different'.
# Answer <k> refers to images 2k-1 and 2k; empty answers are ignored.
#
# Raises RuntimeError on a header mismatch, a row whose length differs from the
# header, an image URL that does not match urlFormat, or an identity answer
# that is neither 'same' nor 'different'.
#
# Written to run on both Python 2 and Python 3 (no print statement, no
# dict.iteritems()).
rawDesc = []  # list of tuples of (image_filename, image_category_id, description)
rawIdentity = []  # list of tuples of (image_filename_1, image_category_id_1, image_filename_2, image_category_id_2, is_same)
headers = None
urlPrefix = 'http://stanford.edu/~mrgreene/SCENE_CATEGORIES/'
# <prefix><category id>/<file name>; the name part before the extension may be
# empty. The prefix is escaped so its '.' only matches a literal dot, and the
# tail is a raw string so \d, \w and \. reach the regex engine unchanged.
urlFormat = re.compile('^' + re.escape(urlPrefix) + r'\d+/\w*\.\w+$')
for f in tmpRawFiles:
    with open(f) as fin:
        csvReader = csv.reader(fin)
        fileHeaders = next(csvReader, None)
        if fileHeaders is None:
            # Completely empty file: nothing to collect.
            continue
        if headers is None:
            headers = fileHeaders
        elif headers != fileHeaders:
            raise RuntimeError('Headers does not match!')
        for row in csvReader:
            if len(row) != len(headers):
                raise RuntimeError('Row corrupted!')
            imagePresented = {}  # image index -> (file name, category id)
            identityAnswer = {}  # answer index -> True if answered 'same'
            descAnswer = {}      # answer index -> description text
            for h, c in zip(headers, row):
                if h.startswith('Input.image_'):
                    imageId = int(h[len('Input.image_'):])
                    if urlFormat.match(c) is None:
                        print('%s %s' % (c, row))
                        raise RuntimeError('Do not understand url.')
                    # After the prefix the URL reads '<category id>/<file name>'.
                    urlParts = c[len(urlPrefix):].split('/')
                    imagePresented[imageId] = (urlParts[1], int(urlParts[0]))
                elif h.startswith('Answer.category_name'):
                    # Must be tested before 'Answer.category', which is a
                    # prefix of this column name.
                    if c != '':
                        ansId = int(h[len('Answer.category_name'):])
                        descAnswer[ansId] = c
                elif h.startswith('Answer.category'):
                    if c != '':
                        ansId = int(h[len('Answer.category'):])
                        if c != 'same' and c != 'different':
                            print('%s %s' % (c, row))
                            raise RuntimeError('Not same nor different.')
                        identityAnswer[ansId] = (c == 'same')
            # Sorted by answer index so the output order does not depend on
            # dict iteration order.
            for i, ans in sorted(identityAnswer.items()):
                rawIdentity.append(imagePresented[i*2-1] + imagePresented[i*2] + (ans,))
            for i, ans in sorted(descAnswer.items()):
                rawDesc.append(imagePresented[i*2-1] + (ans,))

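Note the special URL http://stanford.edu/~mrgreene/SCENE_CATEGORIES/141/.jpg: its file name has no stem (it is just `.jpg`), which is why the URL pattern above allows an empty name before the extension (`\w*`).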

In [31]:
# Write each collected table to its own CSV file: one header row, then one
# row per collected tuple.
outputTables = [
    ('rawDesc.csv',
     ['image_filename', 'image_category_id', 'description'],
     rawDesc),
    ('rawIdentity.csv',
     ['image_filename_1', 'image_category_id_1',
      'image_filename_2', 'image_category_id_2', 'is_same'],
     rawIdentity),
]
for outPath, columnNames, records in outputTables:
    with open(outPath, 'w') as outFile:
        tableWriter = csv.writer(outFile)
        tableWriter.writerow(columnNames)
        tableWriter.writerows(records)