In [36]:
from PIL import Image
import os
import hashlib
from collections import defaultdict

In [28]:
# crop images and save to directory

def crop(filename, height, width, out_dir='mountains_unsorted'):
    """Cut an image into a grid of tiles, save each tile as a JPEG, return them.

    The image is walked row by row, left to right, in steps of ``height`` and
    ``width``. Each tile is written to ``<out_dir>/<stem>_<k>.jpg`` where
    ``stem`` is the source file name without its extension and ``k`` is a
    running counter starting at 0.

    Args:
        filename: Path of the image to split.
        height: Tile height, used as the vertical step.
        width: Tile width, used as the horizontal step.
        out_dir: Directory that receives the tiles. Created if it is missing.
            Defaults to the directory the notebook has always written to.

    Returns:
        List of the cropped tile images, in the order they were saved.
    """
    # basename/splitext instead of split('/') and split('.')[0]: only the
    # final extension is dropped, so "a.b.jpg" and "a.c.jpg" no longer both
    # collapse to "a" and overwrite each other's tiles.
    stem = os.path.splitext(os.path.basename(filename))[0]

    # Saving used to fail outright when the target folder did not exist yet.
    os.makedirs(out_dir, exist_ok=True)

    im_list = []
    # Context manager so the source file handle is released once tiling is done.
    with Image.open(filename) as im:
        imgwidth, imgheight = im.size
        k = 0
        for i in range(0, imgheight, height):
            for j in range(0, imgwidth, width):
                # NOTE(review): when the image size is not a multiple of the
                # tile size, this box reaches past the right/bottom edge;
                # confirm how Pillow fills the out-of-bounds area.
                box = (j, i, j + width, i + height)
                a = im.crop(box)
                a.save(os.path.join(out_dir, '%s_%s.jpg' % (stem, k)))
                im_list.append(a)
                k += 1
    return im_list

In [29]:
# loop through file names in directory (for mountain pics)

# Folder holding the full-size source pictures to be tiled.
directory = 'recaptchapics_source/mountains/'

# Tile dimensions handed to crop().
height, width = 100, 100

# Tile every entry in the folder, passing over names that start with a dot.
for f in os.listdir(directory):
    if f.startswith('.'):
        continue
    file_path = os.path.join(directory, f)
    crop(file_path, height, width)

In [33]:
# check and remove duplicate images

def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and consumed in 4096-byte reads, so
    the full content is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()

In [34]:
# Sanity-check md5() on a single file; the cell displays the returned hex digest.
f = 'mountains_data/other/mountains1_6.jpg'
md5(f)

'ec6698a5097c0b66a47c55632ff0df60'

In [58]:
# create dict of files to see duplicates (key is hash, values are filenames)

directory = 'mountains_unsorted/'
# Maps each checksum to the names of all files whose content hashes to it.
files_d = defaultdict(list)

for f in os.listdir(directory):
    if f.startswith('.'):
        # Names starting with a dot are left out of the index.
        continue
    file_path = os.path.join(directory, f)
    checksum = md5(file_path)
    files_d[checksum].append(f)

In [59]:
# Reduce every group of identical files to just its first recorded name.
for k, v in files_d.items():
    if len(v) <= 1:
        continue
    # Rebinding an existing key does not resize the dict, so this is safe
    # while iterating over it.
    files_d[k] = v[:1]

In [60]:
# Number of distinct checksums in the index (one entry per unique file content).
len(files_d)

10170

In [62]:
# for k, v in files_d.items():
#     print(k, v)

In [63]:
# One representative filename (the first recorded) for each checksum.
files_to_keep = [names[0] for names in files_d.values()]

In [64]:
# Preview the first 12 filenames that will be kept.
files_to_keep[:12]

['mountains1137_1.jpg',
 'mountains3411_1.jpg',
 'mountains690_8.jpg',
 'mountains497_1.jpg',
 'mountains118_4.jpg',
 'mountains423_4.jpg',
 'mountains99_8.jpg',
 'mountains2217_3.jpg',
 'mountains2772_8.jpg',
 'mountains3371_8.jpg',
 'mountains1241_3.jpg',
 'mountains3042_6.jpg']

In [67]:
# remove files not in files_to_keep (duplicates)

directory = 'mountains_unsorted/'

# Membership tests against the list rescan it for every directory entry,
# which is quadratic with thousands of files. A set makes each lookup
# constant-time on average. files_to_keep itself stays a list because other
# cells slice it.
keep_names = set(files_to_keep)

for f in os.listdir(directory):
    file_path = os.path.join(directory, f)
    if f.startswith('.'):
        # Names starting with a dot were never indexed, so never delete them.
        continue
    if f not in keep_names:
        os.remove(file_path)