## Initialization

In [None]:
# Mount Google Drive at /content/gdrive; the dataset paths used later in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

from google.colab import auth
import gspread
from google.auth import default

import hashlib
# from scipy.misc.pilutil import imread, imresize, imshow
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import time
import numpy as np

Mounted at /content/gdrive


## Get and Show Current Directory

In [None]:
import os
# Display the kernel's current working directory (the value reflects any
# earlier os.chdir call made in this kernel session).
os.getcwd()

'/content/gdrive/MyDrive/Colab Notebooks/Dataset/Zamioculcas'

## Collect Folders from Designated Directory

In [None]:
# Root of the dataset: one sub-directory per plant species.
root_path = '/content/gdrive/MyDrive/Colab Notebooks/Dataset'

# Raw listing of the dataset root (may contain plain files as well as folders).
folders = os.listdir(root_path)

# Keep only real sub-directories. Later cells treat every entry of `fs` as a
# folder to list, so a stray file in the root would make them fail.
# The listing order is kept as returned by os.listdir (no sorting).
fs = [
  folder
  for folder in folders
  if os.path.isdir(os.path.join(root_path, folder))
]
fs

['Dracaena_trifasciata',
 'Zamioculcas',
 'Monstera_deliciosa',
 'Dypsis_lutescens',
 'Rhapis_excelsa',
 'Epipremnum_aureum',
 'Asplenium_nidus',
 'Maranta_leuconeura',
 'Philodendron_gloriosum',
 'Pachira_aquatica',
 'Peperomia_argyreia',
 'Aglaonema_commutatum',
 'Schefflera_arboricola',
 'Hoya_carnosa',
 'Dracaena_reflexa',
 'Tradescantia_zebrina',
 'Calathea_orbifolia',
 'Nephrolepis_cordifolia_',
 'Chamaedorea_elegans',
 'Peperomia_obtusifolia']

## Identify Duplicates and Capture in a List

In [None]:
def _md5_of_file(file_path, chunk_size=1024 * 1024):
  """Return the hex MD5 digest of the file at `file_path`.

  The file is read in `chunk_size`-byte pieces so large images are never
  loaded into memory in one go.
  """
  digest = hashlib.md5()
  with open(file_path, 'rb') as f:
    for chunk in iter(lambda: f.read(chunk_size), b''):
      digest.update(chunk)
  return digest.hexdigest()


# duplicates[folder] -> list of (duplicate_index, first_seen_index) pairs.
# Both numbers are positions in os.listdir() of that folder, so the listing
# is deliberately NOT sorted here: the cells that consume `duplicates` index
# into their own os.listdir() result with these numbers.
duplicates = dict()
for folder in fs:
  # Absolute paths are used instead of os.chdir so the kernel's working
  # directory is not changed as a hidden side effect of this cell.
  path = os.path.join(root_path, folder)

  # Take ONE snapshot of the folder; the entry count and the recorded
  # indexes then refer to exactly the same listing.
  file_list = os.listdir(path)
  print(f'{folder}: {len(file_list)}')

  duplicates[folder] = list()
  hash_keys = dict()  # content hash -> index of the first file with that hash
  for index, filename in enumerate(file_list):
    file_path = os.path.join(path, filename)
    if not os.path.isfile(file_path):
      continue  # skip sub-directories and other non-regular entries

    filehash = _md5_of_file(file_path)
    if filehash in hash_keys:
      # Same content as an earlier file: record (this one, first occurrence).
      duplicates[folder].append((index, hash_keys[filehash]))
    else:
      hash_keys[filehash] = index

Dracaena_trifasciata: 403
Zamioculcas: 571
Monstera_deliciosa: 714
Dypsis_lutescens: 495
Rhapis_excelsa: 480
Epipremnum_aureum: 550
Asplenium_nidus: 635
Maranta_leuconeura: 433
Philodendron_gloriosum: 305
Pachira_aquatica: 362
Peperomia_argyreia: 281
Aglaonema_commutatum: 643
Schefflera_arboricola: 393
Hoya_carnosa: 616
Dracaena_reflexa: 337
Tradescantia_zebrina: 433
Calathea_orbifolia: 285
Nephrolepis_cordifolia_: 205
Chamaedorea_elegans: 293
Peperomia_obtusifolia: 370


In [None]:
# Show, per folder, the (duplicate_index, first_seen_index) pairs collected above.
duplicates

{'Dracaena_trifasciata': [],
 'Zamioculcas': [(424, 333)],
 'Monstera_deliciosa': [(411, 384)],
 'Dypsis_lutescens': [],
 'Rhapis_excelsa': [(275, 251), (362, 319), (417, 251)],
 'Epipremnum_aureum': [(390, 338)],
 'Asplenium_nidus': [],
 'Maranta_leuconeura': [(178, 171), (230, 208), (328, 282), (360, 321)],
 'Philodendron_gloriosum': [],
 'Pachira_aquatica': [(36, 35)],
 'Peperomia_argyreia': [],
 'Aglaonema_commutatum': [],
 'Schefflera_arboricola': [(272, 258)],
 'Hoya_carnosa': [],
 'Dracaena_reflexa': [(246, 222), (262, 252), (284, 252)],
 'Tradescantia_zebrina': [],
 'Calathea_orbifolia': [],
 'Nephrolepis_cordifolia_': [],
 'Chamaedorea_elegans': [],
 'Peperomia_obtusifolia': [(186, 64),
  (187, 155),
  (192, 177),
  (197, 159),
  (199, 163),
  (203, 110),
  (210, 175),
  (211, 60),
  (226, 184),
  (232, 202),
  (233, 194),
  (235, 153),
  (236, 206),
  (247, 189)]}

## Show Image

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [None]:
# For every folder, draw each duplicate pair side by side for visual review:
# the first-seen file on the left, the detected duplicate on the right.
for key, file_indexes in duplicates.items():
  # Absolute paths are used instead of os.chdir so this cell does not change
  # the kernel's working directory.
  path = os.path.join(root_path, key)

  # The indexes stored in `duplicates` are positions in this listing.
  file_list = os.listdir(path)
  print(f'{key}: {len(file_list)}')

  if len(file_indexes) == 0:
    print('No Duplicates!')
    continue

  for duplicate_index, original_index in file_indexes:
    fig, (ax_original, ax_duplicate) = plt.subplots(1, 2)
    try:
      ax_original.imshow(mpimg.imread(os.path.join(path, file_list[original_index])))
      ax_original.set_title(str(original_index))
      ax_original.set_xticks([])
      ax_original.set_yticks([])

      ax_duplicate.imshow(mpimg.imread(os.path.join(path, file_list[duplicate_index])))
      ax_duplicate.set_title(str(duplicate_index) + ' duplicate')
      ax_duplicate.set_xticks([])
      ax_duplicate.set_yticks([])

      plt.show()
    except (OSError, IndexError) as e:
      # Unreadable image or an index that no longer matches the listing:
      # report it instead of silently skipping, then move on to the next pair.
      print(f'{key}: could not show pair ({duplicate_index}, {original_index}): {e}')
    finally:
      # Release the figure so a long run of pairs does not accumulate memory.
      plt.close(fig)

Output hidden; open in https://colab.research.google.com to view.

## Delete Image

In [None]:
import filecmp

# Delete the duplicate file of every recorded pair, keeping the first-seen copy.
#
# `duplicates` stores positions in a directory listing, and those positions go
# stale as soon as the folder changes (for example after this cell has already
# run once). Each pair is therefore re-verified byte-for-byte before anything
# is removed, so a stale index can never delete a non-duplicate file.
for key, file_indexes in duplicates.items():
  # Absolute paths are used instead of os.chdir so this cell does not change
  # the kernel's working directory.
  path = os.path.join(root_path, key)

  # Snapshot the listing once; all indexes below refer to this snapshot.
  file_list = os.listdir(path)
  print(f'{key}: {len(file_list)}')

  for duplicate_index, original_index in file_indexes:
    if (duplicate_index == original_index
        or max(duplicate_index, original_index) >= len(file_list)):
      print(f'{key}: skipping stale pair ({duplicate_index}, {original_index})')
      continue

    duplicate_path = os.path.join(path, file_list[duplicate_index])
    original_path = os.path.join(path, file_list[original_index])

    try:
      # shallow=False forces a comparison of the file contents.
      identical = filecmp.cmp(duplicate_path, original_path, shallow=False)
    except OSError as e:
      # E.g. one of the two files was already removed earlier.
      print(f'{key}: skipping pair ({duplicate_index}, {original_index}): {e}')
      continue

    if identical:
      os.remove(duplicate_path)
    else:
      print(f'{key}: pair ({duplicate_index}, {original_index}) is no longer '
            f'identical; nothing deleted')

  # Re-list to report how many entries remain after the clean-up.
  file_list = os.listdir(path)
  print(f'{key}: {len(file_list)}')

Dracaena_trifasciata: 403
Dracaena_trifasciata: 403
Zamioculcas: 571
Zamioculcas: 570
Monstera_deliciosa: 714
Monstera_deliciosa: 713
Dypsis_lutescens: 495
Dypsis_lutescens: 495
Rhapis_excelsa: 480
Rhapis_excelsa: 477
Epipremnum_aureum: 550
Epipremnum_aureum: 549
Asplenium_nidus: 635
Asplenium_nidus: 635
Maranta_leuconeura: 433
Maranta_leuconeura: 429
Philodendron_gloriosum: 305
Philodendron_gloriosum: 305
Pachira_aquatica: 362
Pachira_aquatica: 361
Peperomia_argyreia: 281
Peperomia_argyreia: 281
Aglaonema_commutatum: 643
Aglaonema_commutatum: 643
Schefflera_arboricola: 393
Schefflera_arboricola: 392
Hoya_carnosa: 616
Hoya_carnosa: 616
Dracaena_reflexa: 337
Dracaena_reflexa: 334
Tradescantia_zebrina: 433
Tradescantia_zebrina: 433
Calathea_orbifolia: 285
Calathea_orbifolia: 285
Nephrolepis_cordifolia_: 205
Nephrolepis_cordifolia_: 205
Chamaedorea_elegans: 293
Chamaedorea_elegans: 293
Peperomia_obtusifolia: 370
Peperomia_obtusifolia: 356
