### Imports

In [None]:
import os
import subprocess
import sys
from collections import OrderedDict
from glob import glob
from pprint import pprint

import numpy as np
import ipyplot

from remove_duplicates import RemoveDuplicates

from _utils import foreach_dir, delete_broken_image, delete_junk_files, get_file_ext, sort_dict, print_dict

### Config

In [None]:
# Directory that holds one sub-folder per image class.
ROOT_FOLDER = './my-cool-images'
# Glob pattern matching every direct child of ROOT_FOLDER.
ROOT_FOLDER_GLOB = ROOT_FOLDER + '/*'

### Delete all junk files before proceeding

In [None]:
# Run the _utils helper over ROOT_FOLDER before any other processing step.
# NOTE(review): what counts as a "junk" file is decided inside _utils and is not
# visible here — presumably this deletes files on disk; confirm before running.
delete_junk_files(ROOT_FOLDER)

### Delete duplicate images

Search for exact duplicates and display them for visual confirmation (don't delete yet!)

In [None]:
# Build the exact-match finder over every class folder and collect its results.
# NOTE(review): the third positional argument (False) is opaque from here — confirm its meaning in remove_duplicates.py.
duplicates = RemoveDuplicates(ROOT_FOLDER_GLOB, 'exact', False)
duplicates_found = duplicates.check_duplicates()

# Render the matches for visual confirmation only when there is something to show.
if duplicates_found:
  ipyplot.plot_images(duplicates_found, max_images=None, img_width=150)

If these are all in fact duplicates, delete them!

In [None]:
# The script only deletes when run from the CLI, so re-run the exact-match
# search as a child process with --delete.
if duplicates_found:
  # Argument list with no shell: the glob pattern reaches the script verbatim even
  # if the path contains quotes or shell metacharacters. sys.executable keeps the
  # child process on the same interpreter/environment as this kernel instead of
  # whatever `python` resolves to on PATH.
  exact_delete_result = subprocess.run(
    [sys.executable, './remove_duplicates.py', '--cli', '--exact', '--delete', ROOT_FOLDER_GLOB]
  )

  # Surface failures instead of silently discarding the exit status.
  if exact_delete_result.returncode != 0:
    print(f'remove_duplicates.py exited with code {exact_delete_result.returncode}; duplicates may not have been deleted')

Look for more possible duplicates by searching for similar images this time

In [None]:
# Same search as the exact pass, but in 'similar' mode to catch near-duplicates.
# NOTE(review): the third positional argument (False) is opaque from here — confirm its meaning in remove_duplicates.py.
similar_images = RemoveDuplicates(ROOT_FOLDER_GLOB, 'similar', False)
similar_found = similar_images.check_duplicates()

# Render the candidates for visual confirmation only when there is something to show.
if similar_found:
  ipyplot.plot_images(similar_found, max_images=None, img_width=150)

If these too are duplicates, delete them as well

In [None]:
# Deletion happens through the script's CLI, so re-run the similar-image search
# as a child process with --delete.
if similar_found:
  # Argument list with no shell: the glob pattern reaches the script verbatim even
  # if the path contains quotes or shell metacharacters. sys.executable keeps the
  # child process on the same interpreter/environment as this kernel instead of
  # whatever `python` resolves to on PATH.
  similar_delete_result = subprocess.run(
    [sys.executable, './remove_duplicates.py', '--cli', '--similar', '--delete', ROOT_FOLDER_GLOB]
  )

  # Surface failures instead of silently discarding the exit status.
  if similar_delete_result.returncode != 0:
    print(f'remove_duplicates.py exited with code {similar_delete_result.returncode}; similar images may not have been deleted')

### Delete corrupt images that can't be opened

In [None]:
# Hand delete_broken_image to foreach_dir, starting at ROOT_FOLDER.
# NOTE(review): traversal depth and the effect of log=False are defined in _utils —
# presumably log=False suppresses per-file output; confirm.
foreach_dir(ROOT_FOLDER, delete_broken_image, log=False)

### Let's get some counts to see where we're at...

In [None]:
# Tallies filled in below:
#   CLASSES      - class folder name -> number of files in that folder
#   EXTENSIONS   - file extension (as returned by get_file_ext) -> number of files
#   TOTAL_IMAGES - total number of files across all class folders
CLASSES = {}
EXTENSIONS = {}
TOTAL_IMAGES = 0

for sub_folder in glob(ROOT_FOLDER_GLOB):
  # Only directories are treated as classes; loose files in the root are skipped.
  if not os.path.isdir(sub_folder):
    continue

  folder = os.path.basename(sub_folder)
  # Count regular files only, so a nested directory is not tallied as an image.
  files = [
    name for name in os.listdir(sub_folder)
    if os.path.isfile(os.path.join(sub_folder, name))
  ]
  file_count = len(files)

  CLASSES[folder] = file_count
  TOTAL_IMAGES += file_count

  for file in files:
    extension = get_file_ext(file)
    EXTENSIONS[extension] = EXTENSIONS.get(extension, 0) + 1

# Ascending counts: index 0 is the smallest class, index -1 the largest.
all_images_counts = sorted(CLASSES.values())

if all_images_counts:
  mean_image_count = np.array(all_images_counts).mean().round(0)
else:
  # No class folders found: the mean is undefined. Use NaN directly rather than
  # letting numpy emit a "Mean of empty slice" RuntimeWarning to get the same value.
  mean_image_count = np.float64('nan')

### All image counts per class (sorted from least to greatest)

In [None]:
# Build a sorted view of the per-class counts collected in CLASSES.
# NOTE(review): sort_dict's ordering is defined in _utils — presumably ascending
# by count, matching the heading above; confirm.
ALL_CLASSES = sort_dict(CLASSES)

# NOTE(review): abc_order_keys=False presumably keeps the order produced by
# sort_dict rather than re-sorting keys alphabetically; confirm in _utils.
print_dict(ALL_CLASSES, abc_order_keys=False)

### Totals, min, max and mean

In [None]:
# Summary of the counts: number of classes, total files, then the smallest,
# largest and (rounded) mean per-class count.
TOTAL_CLASSES = len(CLASSES)

print(f'Total Classes: {TOTAL_CLASSES}')
print(f'Total Images: {TOTAL_IMAGES:,}', '\n')

# all_images_counts is sorted ascending, so [0] is the minimum and [-1] the maximum.
# Guard the indexing: with no class folders the list is empty and would raise IndexError.
if all_images_counts:
  print(f'Min Images: {all_images_counts[0]}')
  print(f'Max images: {all_images_counts[-1]}')
  print(f'Average # of images: {mean_image_count}')
else:
  print('No class folders found; min/max/average are not available.')