In [10]:
import imgsim
import cv2
import numpy as np
import os
from itertools import combinations
from image_duplication_check import image_distance
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from tqdm import tqdm
import copy
from image_duplication_check import load_and_vectorize_image
from IPython.display import Image, display, HTML


In [11]:
def is_valid_image(file_path):
    """Report whether a file starts with a JPEG, PNG or GIF signature.

    Only the first 10 bytes of the file are read; the image data itself is
    never decoded.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True when the leading bytes match one of the known signatures,
        False otherwise. False is also returned (after printing the error)
        when the file cannot be opened or read.
    """
    known_signatures = (
        b'\xff\xd8',            # JPEG
        b'\x89PNG\r\n\x1a\n',   # PNG
        b'GIF87a',              # GIF (1987 variant)
        b'GIF89a',              # GIF (1989 variant)
    )
    try:
        with open(file_path, 'rb') as stream:
            leading_bytes = stream.read(10)
    except Exception as e:
        # Best effort: an unreadable file is reported and treated as "not an image".
        print(f"Error reading file {file_path}: {e}")
        return False
    # bytes.startswith accepts a tuple and matches if any prefix fits.
    return leading_bytes.startswith(known_signatures)

In [None]:
# Read the target directory from standard input.
dir_path = input("Directory: ")
# Paths pasted from Windows Explorer ("Copy as path") arrive wrapped in double
# quotes; drop them and any stray whitespace so the checks below see the real path.
dir_path = dir_path.strip().strip('"')
# A backslash marks a Windows-style path: convert the separators to "/".
if "\\" in dir_path:
    dir_path = dir_path.replace("\\", "/")
    # X:/... -> /mnt/x/...
    # Only rewrite when the path really begins with a drive letter and colon;
    # a relative path such as foo/bar must not lose its first two characters.
    has_drive = (
        len(dir_path) >= 2
        and dir_path[0].isascii()
        and dir_path[0].isalpha()
        and dir_path[1] == ":"
    )
    if has_drive:
        dir_path = "/mnt/" + dir_path[0].lower() + dir_path[2:]
# Echo the resolved directory as the cell output.
dir_path

In [13]:
# Full paths of the regular files directly inside dir_path (anything that is
# not a file, e.g. a sub-directory, is filtered out).
file_path_list = list(
    filter(
        os.path.isfile,
        (os.path.join(dir_path, entry_name) for entry_name in os.listdir(dir_path)),
    )
)


In [14]:
# Keep only the files that is_valid_image accepts; report every rejected file.
image_path_list = []
for file_path in file_path_list:
    if not is_valid_image(file_path):
        print(f"Invalid file: {file_path}")
        continue
    image_path_list.append(file_path)

In [None]:
# A single vectorizer instance is passed to every task.
vtr = imgsim.Vectorizer()

# Run load_and_vectorize_image for each image on a thread pool (one worker per
# CPU as reported by os.cpu_count()) while a progress bar tracks completion.
with tqdm(total=len(image_path_list)) as progress, \
        ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    pending = []
    for image_path in image_path_list:
        task = executor.submit(load_and_vectorize_image, image_path, vtr)
        # Advance the bar by one step whenever a task finishes.
        task.add_done_callback(lambda _finished: progress.update())
        pending.append((image_path, task))
    # Collect results in submission order as (path, result) pairs; result()
    # blocks until the task is done and re-raises any exception from it.
    images = [(path, task.result()) for path, task in pending]


In [None]:
# Every unordered pair of (path, vectorized image) entries, generated lazily.
image_combinations = combinations(images, 2)
# The number of pairs among n items is n * (n - 1) / 2. Computing it
# arithmetically avoids materialising every pair in a list just to size the
# progress bar.
pair_count = len(images) * (len(images) - 1) // 2

# Compute the distance of each pair on a process pool (one worker per CPU as
# reported by os.cpu_count()) while a progress bar tracks completion.
with tqdm(total=pair_count) as progress:
    with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = []
        for (file_path_a, image_a), (file_path_b, image_b) in image_combinations:
            future = executor.submit(image_distance, image_a, image_b)
            # Advance the bar by one step whenever a comparison finishes.
            future.add_done_callback(lambda p: progress.update())
            futures.append(((file_path_a, file_path_b), future))
        # One record per pair: 'file_path' is the (path_a, path_b) tuple and
        # 'dist' is whatever image_distance returned for that pair.
        result = [{'file_path': f[0], 'dist': f[1].result()} for f in futures]


In [None]:
import matplotlib.pyplot as plt

# Show every pair whose distance is below 0.1 side by side for visual review.
for r in result:
    if not (r['dist'] < 0.1):
        continue

    file_path_a, file_path_b = r['file_path']
    image_a = cv2.imread(file_path_a)
    image_b = cv2.imread(file_path_b)
    # cv2.imread signals failure by returning None instead of raising, and
    # cv2.cvtColor would then raise and abort the whole loop. Report the pair
    # and move on so the remaining pairs are still shown.
    if image_a is None or image_b is None:
        print(f"Cannot display pair (unreadable by OpenCV): {file_path_a} / {file_path_b}")
        continue

    # OpenCV loads pixels in BGR order; matplotlib expects RGB.
    image_a = cv2.cvtColor(image_a, cv2.COLOR_BGR2RGB)
    image_b = cv2.cvtColor(image_b, cv2.COLOR_BGR2RGB)

    # One row, two columns: first image on the left, second on the right.
    fig, (ax_a, ax_b) = plt.subplots(1, 2, figsize=(10, 4))
    ax_a.imshow(image_a)
    ax_a.set_title(os.path.basename(file_path_a))
    ax_b.imshow(image_b)
    ax_b.set_title(os.path.basename(file_path_b))

    plt.show()  # render the pair
        

In [None]:
print("Delete duplicate images? (y/n)")
if (input() == "y"):
    # For each pair below the 0.1 distance threshold the second file is the one
    # to remove. The same file can be the second member of several pairs, so
    # de-duplicate (dict.fromkeys keeps first-seen order) to delete each once.
    delete_targets = list(dict.fromkeys(
        r['file_path'][1] for r in result if r['dist'] < 0.1
    ))
    for file_path_b in delete_targets:
        try:
            os.remove(file_path_b)
            print("Delete: " + file_path_b)
        except FileNotFoundError:
            # The file vanished since the directory was scanned.
            print("File not found: " + file_path_b)
        except OSError as e:
            # E.g. permission problems: report and keep going with the rest.
            print(f"Failed to delete {file_path_b}: {e}")
