# Dataset creation

In [1]:
import re
import pandas as pd
from collections import Counter
import json
import os
import shutil
from IPython.display import Image, display

First, we create the regular expressions to find the annotations of interest: 

In [20]:
# contains the ethicity attributes that appear 5 times or more in the flickr8k corpus
nonwhite_re = re.compile(r'(\ban?)? ?(african|asian|black|japanese|indian|oriental|middle[ -]eastern|dark[ -]skinned|african[ -]american) (girls?|boys?|m[ae]n|wom[ae]n|persons?|individuals?|people|guys?|dudes?|lad(y|ies)|kids?|child(ren)?|chicks?)\s')
# group 0: "an asian girl "
# group 1: "an"
# group 2: "asian"
# group 3: "girl"
white_re = re.compile(r'(\ban?)? ?\b((white|fair|pale)([\s-]skinned)?|caucasian) (girls?|boys?|m[ae]n|wom[ae]n|persons?|individuals?|people|guys?|dudes?|lad(y|ies)|kids?|child(ren)?|chicks?)\s')
# group 0: "a white boy "
# group 1: "a"
# group 2: "white"
# group 5: "boy" (because some extra groups to account for "fair-skinned" etc)
people_re = re.compile(r'(\ban?)?\s?\b(girls?|boys?|m[ae]n|wom[ae]n|persons?|individuals?|people|guys?|dudes?|lad(y|ies)|kids?|child(ren)?|chicks?)\s')
# group 0: " child " (line.split(group0))
# group 1: "" (a/an)
# group 2: "child" (just add "white")

Then we load the data (Flickr8k and COCO):

In [3]:
datadir = '/srv/data/gussodato/'

In [30]:
with open(os.path.join(datadir, 'flickr8k/Flickr8k.token.txt')) as f:
    lines = [line.lower().strip().split('\t') for line in f]

flickr_df = pd.DataFrame(lines)
img_id_col = [row[:-2] for row in flickr_df[0]]
flickr_df[2] = img_id_col
flickr_df[3] = 'flickr8k/Flicker8k_Dataset'

In [52]:
flickr_df

Unnamed: 0,0,1,2,3
0,1000268201_693b08cb0e.jpg#0,a child in a pink dress is climbing up a set o...,1000268201_693b08cb0e.jpg,flickr8k/Flicker8k_Dataset
1,1000268201_693b08cb0e.jpg#1,a girl going into a wooden building .,1000268201_693b08cb0e.jpg,flickr8k/Flicker8k_Dataset
2,1000268201_693b08cb0e.jpg#2,a little girl climbing into a wooden playhouse .,1000268201_693b08cb0e.jpg,flickr8k/Flicker8k_Dataset
3,1000268201_693b08cb0e.jpg#3,a little girl climbing the stairs to her playh...,1000268201_693b08cb0e.jpg,flickr8k/Flicker8k_Dataset
4,1000268201_693b08cb0e.jpg#4,a little girl in a pink dress going into a woo...,1000268201_693b08cb0e.jpg,flickr8k/Flicker8k_Dataset
...,...,...,...,...
40455,997722733_0cb5439472.jpg#0,a man in a pink shirt climbs a rock face,997722733_0cb5439472.jpg,flickr8k/Flicker8k_Dataset
40456,997722733_0cb5439472.jpg#1,a man is rock climbing high in the air .,997722733_0cb5439472.jpg,flickr8k/Flicker8k_Dataset
40457,997722733_0cb5439472.jpg#2,a person in a red shirt climbing up a rock fac...,997722733_0cb5439472.jpg,flickr8k/Flicker8k_Dataset
40458,997722733_0cb5439472.jpg#3,a rock climber in a red shirt .,997722733_0cb5439472.jpg,flickr8k/Flicker8k_Dataset


In [28]:
with open(os.path.join(datadir, 'coco/annotations/captions_train2017.json')) as f:
    coco_train = json.load(f)
    
coco_df = pd.DataFrame(coco_train['annotations'])
coco_df['img'] = ['0'*(12-len(str(id)))+str(id)+'.jpg' for id in coco_df['image_id']]
coco_df.drop(labels='image_id', axis=1, inplace=True)
coco_df.columns = range(len(coco_df.columns))
coco_df[1] = coco_df[1].str.lower()
coco_df[3] = 'coco/train2017'

In [29]:
coco_df

Unnamed: 0,0,1,2,3
0,37,a bicycle replica with a clock as the front wh...,000000203564.jpg,coco/train2017
1,49,a room with blue walls and a white sink and door.,000000322141.jpg,coco/train2017
2,89,a car that seems to be parked illegally behind...,000000016977.jpg,coco/train2017
3,98,a large passenger airplane flying through the ...,000000106140.jpg,coco/train2017
4,101,there is a gol plane taking off in a partly cl...,000000106140.jpg,coco/train2017
...,...,...,...,...
591748,829655,a slice of bread is covered with a sour cream ...,000000133071.jpg,coco/train2017
591749,829658,a long plate hold some fries with some sliders...,000000410182.jpg,coco/train2017
591750,829665,two women sit and pose with stuffed animals.,000000180285.jpg,coco/train2017
591751,829693,white plate with a lot of guacamole and an ext...,000000133071.jpg,coco/train2017


Now we collect the annotations that match our regular expressions and keep one for each image.

In [51]:
def filter_annotations(df, people_re, normgroup_re, testgroup_re):
    people_df = df[df[1].str.contains(people_re)]
    testgroup_df = people_df[people_df[1].str.contains(testgroup_re)].drop_duplicates(2)
    normgroup_df = people_df[people_df[1].str.contains(normgroup_re)].drop_duplicates(2)
    both_df = testgroup_df[testgroup_df[2].isin(normgroup_df[2])]
    testgroup_df = testgroup_df.drop(both_df.index)
    normgroup_df = normgroup_df.drop(both_df.index)
    seen_imgs = set([img for df in [testgroup_df, normgroup_df, both_df] for img in df[2]])
    no_attribute_df = people_df.drop(people_df[people_df[2].isin(seen_imgs)].index)
    return {'testgroup': testgroup_df, 'normgroup': normgroup_df, 'no_mention': no_attribute_df}

In [3099]:
flickr_set = filter_annotations(flickr_df, people_re, white_re, nonwhite_re)
coco_set = filter_annotations(coco_df, people_re, white_re, nonwhite_re)

  people_df = df[df[1].str.contains(people_re)]
  testgroup_df = people_df[people_df[1].str.contains(testgroup_re)].drop_duplicates(2)
  normgroup_df = people_df[people_df[1].str.contains(normgroup_re)].drop_duplicates(2)


In [3086]:
def print_group_sizes(dataset, name):
    print(name+':')
    for groupname in dataset:
        print(f'{groupname}: {len(dataset[groupname])} examples')

In [3100]:
print_group_sizes(flickr_set, 'flickr')
print()
print_group_sizes(coco_set, 'coco')

flickr:
testgroup: 238 examples
normgroup: 14 examples
no_mention: 24234 examples

coco:
testgroup: 442 examples
normgroup: 53 examples
no_mention: 209965 examples


As can be seen from these numbers, the amount of annotations that mention ethnicity or skin color is much larger for images of non-white people than for images of white people. In order to get more balanced numbers, we need to manually collect a set of images of white people from the set of images where the annotators do not mention ethnicity. To do that, I create an iterator that shows the images and yields the image id, so that I can append it to a list if the person in the image seems to be white. 

In [82]:
def Generate_img(data_dict, datadir, sample_size=400):
    samples_df = data_dict['no_mention'].drop_duplicates(2).sample(n=sample_size, axis=0)
    img_ids = [(row[3], row[2]) for index, row in samples_df.iterrows()]
    for img_id in img_ids:
        display(Image(os.path.join(datadir, img_id[0], img_id[1])))
        yield img_id[1]

In [2139]:
generate_flickr = Generate_img(flickr_set, datadir, 600)
flickr_white = []
flickr_nonwhite = []

In [3064]:
imgid = next(generate_flickr)

StopIteration: 

In [3063]:
flickr_white.append(imgid)

In [3008]:
flickr_nonwhite.append(imgid)

In [2804]:
# In case I make a mistake and need to remove the last item from the list:
flickr_white = flickr_white[:-1]

In [2841]:
flickr_nonwhite = flickr_nonwhite[:-1]

In [3065]:
# Let's see how many images were actually of white and non-white people respectively:
len(flickr_white), len(flickr_nonwhite)

(269, 47)

In [3066]:
with open('flickr_white_imgs.txt', 'w') as f:
    for img in flickr_white:
        f.write(img+'\n')

In [939]:
generate_coco = Generate_img(coco_set, datadir, 800)
coco_white = []
coco_nonwhite = []

In [2136]:
imgid = next(generate_coco)

StopIteration: 

In [2135]:
coco_white.append(imgid)

In [2088]:
coco_nonwhite.append(imgid)

In [2046]:
# Again, in case of mistakes:
coco_white = coco_white[:-1]

In [1912]:
coco_nonwhite = coco_nonwhite[:-1]

In [2137]:
len(coco_white), len(coco_nonwhite)

(297, 85)

In [2138]:
with open('coco_white_imgs.txt', 'w') as f:
    for img in coco_white:
        f.write(img+'\n')

Creating the contrasting sentence pairs:

In [3068]:
with open('flickr_white_imgs.txt', 'r') as f:
    flickr_white_imgs = [line.strip() for line in f]
with open('coco_white_imgs.txt', 'r') as f:
    coco_white_imgs = [line.strip() for line in f]
len(flickr_white_imgs), len(coco_white_imgs)

(269, 297)

In [3128]:
def make_sentence_pairs(datadict, norm_imgs):
    no_mention = datadict['no_mention']
    norm_no_mention_df = no_mention[no_mention[2].isin(norm_imgs)].drop_duplicates(2)
    norm_no_mention_df[4] = 'norm_no_mention'
    datadict['normgroup'][4] = 'norm_mention'
    test_mention_df = datadict['testgroup']
    
    # adding attribute to descriptions of white people:
    newdescs_norm = []
    for desc in norm_no_mention_df[1]:
        match = re.search(people_re, desc)
        if match[1] and match[1][-1]=='n':
            splits = desc.split(match[1])
            splits.insert(1, 'a white')
            newdesc = ''.join(splits)
            newdescs_norm.append(newdesc)
        else:
            splits = desc.split(match[2])
            splits.insert(1, 'white '+match[2])
            newdesc = ''.join(splits)
            newdescs_norm.append(newdesc)
            
    norm_mention_df = pd.DataFrame(norm_no_mention_df[:])        
    norm_mention_df[1] = newdescs_norm
    norm_mention_df[4] = "norm_mention"
    norm_mention_df = pd.concat([norm_mention_df[:], datadict['normgroup'][:]])

    # removing attributes from descriptions of white people:
    newdescs_nomention = []
    for desc in datadict['normgroup'][1]:
        match = re.search(white_re, desc)
        if match[1]:
            if match[1][-1]=='n' and match[3][0] != 'i':
                splits = desc.split(match[0])
                splits.insert(1, 'a '+match[3]+' ')
                newdesc = ''.join(splits)
                newdescs_nomention.append(newdesc)
            elif match[1]=='a' and match[3][0] == 'i':
                splits = desc.split(match[0])
                splits.insert(1, 'an '+match[3]+' ')
                newdesc = ''.join(splits)
                newdescs_nomention.append(newdesc)
            else:
                splits = desc.split(match[2]+' ')
                newdesc = ''.join(splits)
                newdescs_nomention.append(newdesc)
        else:
            splits = desc.split(match[2]+' ')
            newdesc = ''.join(splits)
            newdescs_nomention.append(newdesc)
    
    new_norm_no_mention_df = datadict['normgroup'][:]
    new_norm_no_mention_df[1] = newdescs_nomention
    new_norm_no_mention_df[4] = "norm_no_mention"
    norm_no_mention_df = pd.concat([norm_no_mention_df[:], new_norm_no_mention_df[:]])

    # removing attributes from descriptions of nonwhite people:
    newdescs_nomention = []
    for desc in test_mention_df[1]:
        match = re.search(nonwhite_re, desc)
        if match[1]:
            if match[1][-1]=='n' and match[3][0] != 'i':
                splits = desc.split(match[0])
                splits.insert(1, 'a '+match[3]+' ')
                newdesc = ''.join(splits)
                newdescs_nomention.append(newdesc)
            elif match[1]=='a' and match[3][0] == 'i':
                splits = desc.split(match[0])
                splits.insert(1, 'an '+match[3]+' ')
                newdesc = ''.join(splits)
                newdescs_nomention.append(newdesc)
            else:
                splits = desc.split(match[2]+' ')
                newdesc = ''.join(splits)
                newdescs_nomention.append(newdesc)
        else:
            splits = desc.split(match[2]+' ')
            newdesc = ''.join(splits)
            newdescs_nomention.append(newdesc)
    
    test_no_mention_df = test_mention_df[:]
    test_no_mention_df[1] = newdescs_nomention
    test_no_mention_df[4] = "test_no_mention"
    test_mention_df[4] = 'test_mention'
    
    return {'norm_no_mention': norm_no_mention_df, 
            'norm_mention': norm_mention_df, 
            'test_no_mention': test_no_mention_df, 
            'test_mention': test_mention_df}

In [3102]:
flickr_full_set = make_sentence_pairs(flickr_set, flickr_white_imgs)
coco_full_set = make_sentence_pairs(coco_set, coco_white_imgs)

Finally, let's combine the examples from Flickr8k and COCO, and save each separate test group as a csv file:

In [3120]:
def combine_dataframes(datadicts):
    '''Given a list of dicts output by the make_sentence_pairs() function, 
    returns a single dict where the dataframes corresponding to the same key of 
    the input dicts have been concatenated.
    '''
    keys = ['norm_no_mention', 'norm_mention', 'test_no_mention', 'test_mention']
    newdict = {}
    for key in keys:
        newdict[key] = pd.concat([dict[key][:] for dict in datadicts])
    return newdict

In [3121]:
full_set = combine_dataframes([flickr_full_set, coco_full_set])

In [3127]:
!mkdir datasets
for group in full_set:
    full_set[group].to_csv(f'datasets/{group}.csv', index=False)