# Data cleaning and augmentation

Preprocessing of dataset containing images from google streetview.
First delete all the images with no content and remove those from the .csv dataset.
Then do some data augmentation.

Import libraries

In [65]:
import cv2
import os
import matplotlib.pyplot as plt
import csv
import itertools
from itertools import combinations

Read list of all files in the dicectory to be cleaned

In [29]:
filesList = os.listdir("imagesgoogle_cleaned")

50f5ed9afdc9f065f0008e32.jpg


### Data cleaning
Compare each image to the template (containing "Sorry, we have no imagery here") and delete all the images with the same content (the norm between those and the template are -> 0)

In [25]:
template = cv2.imread("template.jpg")
deletedcount = 0

for fn in filesList:
    img = cv2.imread("imagesgoogle_cleaned/"+fn)
    try:
        if cv2.norm(template, img)< 10:
            print(fn)
            os.remove("imagesgoogle_cleaned/"+fn)
            deletedcount+=1
    except:
        print("Except: "+fn)
        pass
        
print("Deleted "+str(deletedcount)+" images")

Except: .DS_S
Except: .ipynb_checkpo
Deleted 0 images


Update csv file containing comparisons to get rid of comparison with deleted images

In [37]:
filesList = [f[:-4] for f in os.listdir("imagesgoogle_cleaned")]

# Read csv file and remove lines containing deleted images
with open("data/imagesgoogle.csv") as csv_file:
    newData = [row for row in csv.reader(csv_file) if row[0] in filesList and row[1] in filesList]

In [38]:
# Save the update image couples and corresponding labels
with open("data/imagesgoogle_cleaned.csv", 'w', newline = "") as myfile:
    wr = csv.writer(myfile)
    for row in newData:
        wr.writerow(row)

### Data augmentation

Increase the size of the dataset by:
    1. Using transitivity: if img[i] wins on img[j] and img[j] wins on img[k], then we assume img[i] wins on img[k]
    2. Mirroring images to increase 4x the size of the dataset.

Use transitivity to augment the dataset: if img[i] wins on img[j] and img[j] wins on img[k], then add a new entry in the dataset saying that img[i] wins on img[k]

In [42]:
with open("data/imagesgoogle_cleaned.csv") as csv_file:
    newData = [row for row in csv.reader(csv_file)]

In [68]:
def transitive_increase(dlist):
    nC = []
    tot = 0
    t1=0
    t2=0
    for i in range(0, len(dlist)):
        for j in range(i, len(dlist)):
            # 4 rules to extend the dataset through transitivity
            x,y = dlist[i], dlist[j]
            if x[1] == y[0] and x[2] == y[2]:
                nC.append([x[0],y[1],x[2]])
            elif x[0] == y[1] and x[2] == y[2]:
                nC.append([x[1],y[0], opposite[x[2]]])
            elif x[0] == y[0] and x[2] == opposite[y[2]]:
                nC.append([x[1],y[1],y[2]])
            elif x[1] == y[1] and x[2] == opposite[y[2]]:
                nC.append([x[0],y[0],x[2]])

    return nC

In [70]:
def remove_duplicates(li):
    li.sort()
    return list(li for li,_ in itertools.groupby(li))

In [74]:
data = newData + transitive_increase(newData)
data = remove_duplicates(data)

print(len(data))

# Repeat transitive increase as much as possible - CAREFUL ABOUT THIS! OUTLIERS IN DATASET MAKE IT IMPOSSIBLE TO REPEAT THIS PROCESS
'''
oldLen = 0

nC = transitive_increase(newData)
augmented_data = newData + nC 
augmented_data = [l for l in augmented_data if l[2]!="equals"] # Remove "equals" to prevent infinite loops

while(len(augmented_data) >= oldLen):
    oldLen = len(augmented_data)
    nC = remove_duplicates(transitive_increase(nC))
    augmented_data = augmented_data + nC
    print(len(augmented_data))

data = newData+nC
print("Data length with duplicates: "+str(len(data)))

data = remove_duplicates(data)

print("Data length without duplicates: "+str(len(data)))
'''

20403


'\noldLen = 0\n\nnC = transitive_increase(newData)\naugmented_data = newData + nC \naugmented_data = [l for l in augmented_data if l[2]!="equals"] # Remove "equals" to prevent infinite loops\n\nwhile(len(augmented_data) >= oldLen):\n    oldLen = len(augmented_data)\n    nC = remove_duplicates(transitive_increase(nC))\n    augmented_data = augmented_data + nC\n    print(len(augmented_data))\n\ndata = newData+nC\nprint("Data length with duplicates: "+str(len(data)))\n\ndata = remove_duplicates(data)\n\nprint("Data length without duplicates: "+str(len(data)))\n'

In [75]:
# Save the update image couples and corresponding labels
with open("data/imagesgoogle_cleaned.csv", 'w', newline = "") as myfile:
    wr = csv.writer(myfile)
    for row in data:
        wr.writerow(row)

Mirror images and add all possible combinations of unmirrored + mirrored images to csv file (NM-NM, NM-M, M-NM, M-M)

In [85]:
filesList = [f[:-4] for f in os.listdir("imagesgoogle_cleaned")]

for fn in filesList:
    try:
        img = cv2.imread("imagesgoogle_cleaned/"+fn+".jpg")
        img = cv2.flip(img, 1)
        cv2.imwrite("imagesgoogle_cleaned/"+fn+"_mirror.jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), 80])
    except:
        print("Except: "+fn)

Except: .DS_S
Except: .ipynb_checkpo


In [98]:
# Update the comparison and labeling data for mirrored images
with open("data/imagesgoogle_cleaned.csv") as csv_file:
    newData = [row for row in csv.reader(csv_file)]
    
mirror_extended = []
for d in newData:
    mirror_extended.append(d)
    mirror_extended.append([d[0]+"_mirror", d[1], d[2]])
    mirror_extended.append([d[0], d[1]+"_mirror", d[2]])
    mirror_extended.append([d[0]+"_mirror", d[1]+"_mirror", d[2]])
    
# Save the updated image couples and corresponding labels
with open("data/imagesgoogle_augmented.csv", 'w', newline = "") as myfile:
    wr = csv.writer(myfile)
    for row in mirror_extended:
        wr.writerow(row)

### Dataset balancing

Balance "right" and "left" winners by swapping some of the images.