#### Code to detect duplicates

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src

## Imports

In [None]:
import os
import cv2
import glob
import torch
import imagehash
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from collections import Counter
from tqdm.notebook import tqdm

In [None]:
from params import *

In [None]:
from data.preparation import prepare_new_data, prepare_extra_data

from data.dataset import CovidDataset
from data.transforms import get_transfos

# from model_zoo.models import define_model

from utils.plot import plot_sample
from utils.logger import prepare_log_folder, save_config, create_logger, update_overall_logs

## Data

In [None]:
def get_hashes(pattern='../input/train_512/*.png'):
    """
    Computes image hashes for every image matching a glob pattern.

    For each image, four hashes (average, perceptual, difference and wavelet)
    are computed, flattened and concatenated into a single 1-D array.
    The same is done for the horizontally flipped image, so that mirrored
    duplicates can be detected as well.

    Args:
        pattern (str, optional): Glob pattern selecting the images to hash.
            Defaults to '../input/train_512/*.png'.

    Returns:
        list of np arrays: Concatenated hashes of the images.
        list of np arrays: Concatenated hashes of the flipped images.
        list of str: Image file names, aligned with the two previous lists.
    """
    names = []
    hashes = []
    hashes_t = []

    funcs = [
        imagehash.average_hash,
        imagehash.phash,
        imagehash.dhash,
        imagehash.whash,
    ]

    # glob returns paths in arbitrary order, sort them to get reproducible results.
    for path in tqdm(sorted(glob.glob(pattern))):
        img_name = os.path.basename(path)

        # The context manager releases the file handle once the hashes are computed.
        with Image.open(path) as image:
            image_t = image.transpose(Image.FLIP_LEFT_RIGHT)

            names.append(img_name)
            hashes.append(np.array([f(image).hash for f in funcs]).reshape(-1))
            hashes_t.append(np.array([f(image_t).hash for f in funcs]).reshape(-1))

    return hashes, hashes_t, names

In [None]:
hashes, hashes_t, names = get_hashes()

In [None]:
# Stack the per-image hashes into (n_images, n_bits) tensors.
# Use the GPU when one is available, otherwise fall back to the CPU so that
# the notebook still runs on machines without CUDA.
hash_device = "cuda" if torch.cuda.is_available() else "cpu"

hashes = torch.Tensor(np.array(hashes).astype(int)).to(hash_device)
hashes_t = torch.Tensor(np.array(hashes_t).astype(int)).to(hash_device)

In [None]:
# Similarity between two images = fraction of hash values that are equal.
# For reference, a mean absolute difference could be used instead:
#   (hashes - hashes[i]).abs().mean(-1)

# Image vs. image.
sims = np.array([
    (row == hashes).float().mean(-1).cpu().numpy()
    for row in hashes
])
# Subtract 1 on the diagonal so that an image is never matched with itself.
sims -= np.eye(len(sims))

# Image vs. horizontally flipped image.
sims_t = np.array([
    (row == hashes_t).float().mean(-1).cpu().numpy()
    for row in hashes
])
# Same for the flipped version: an image is not a duplicate of its own flip.
sims_t -= np.eye(len(sims_t))

In [None]:
# Two images are considered duplicates when their similarity (entries of
# sims / sims_t) is strictly greater than this value.
THRESHOLD = 0.95

In [None]:
# Greedy clustering of duplicates.
# Images are visited in order. An image that was not already assigned to a
# cluster becomes the seed of a new one, which gathers every image whose
# similarity with the seed (direct or flipped) exceeds THRESHOLD.
#
# Outputs :
#   clusts         : list of clusters, each one is [seed, match_1, match_2, ...]
#   transpositions : same layout, True when the match was found on the flipped image
#   found          : unique names of all the images belonging to a cluster
found = []
found_set = set()  # Mirrors `found`, for constant-time membership tests
clusts = []
transpositions = []

for i, name in enumerate(tqdm(names)):
    if name in found_set:
        continue

    direct = sims[i] > THRESHOLD
    # A direct match takes precedence over a flipped one.
    flipped = (sims_t[i] > THRESHOLD) & ~direct
    matches = np.flatnonzero(direct | flipped)

    if len(matches) == 0:
        continue

    clust = [name] + [names[j] for j in matches]
    transposed = [False] + [bool(flipped[j]) for j in matches]

    clusts.append(clust)
    transpositions.append(transposed)

    # Register the matches first, then the seed. Each name is stored only once
    # so that len(found) is the actual number of images involved in duplicates.
    for member in clust[1:] + clust[:1]:
        if member not in found_set:
            found_set.add(member)
            found.append(member)

In [None]:
# Summary of the clustering : number of entries in `found` and number of clusters.
print(f'found {len(found)} duplicates in {len(clusts)} clusters')

In [None]:
# Save the clustering results. np.save does not create missing folders,
# so make sure the output directory exists first.
os.makedirs("../output", exist_ok=True)

# Clusters have varying lengths, hence the object dtype.
np.save("../output/clusts.npy", np.array(clusts, dtype=object))
np.save("../output/found.npy", np.array(found))
np.save("../output/transpositions.npy", np.array(transpositions, dtype=object))

In [None]:
# Display every cluster of duplicates, three images per row.
root = DATA_PATH + f"train_{SIZE}/"

N_COLS = 3

for clust, tran in zip(clusts, transpositions):
    print(f'Clust {clust}')
    print(f'Trans {tran}')

    # Ceil division : no extra empty row when len(clust) is a multiple of N_COLS.
    n_rows = (len(clust) + N_COLS - 1) // N_COLS

    plt.figure(figsize=(15, n_rows * 5))
    for i, n in enumerate(clust):
        plt.subplot(n_rows, N_COLS, i + 1)
        img = cv2.imread(root + n)
        # OpenCV loads images as BGR whereas matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.imshow(img)
        plt.axis(False)

    plt.show()

# Merging

In [None]:
from utils.boxes import *
from data.preparation import prepare_dataframe
from utils.plot import plot_sample

In [None]:
# Work on a copy of the dataframe returned by prepare_dataframe.
df = prepare_dataframe().copy()

In [None]:
# Reload the clustering results saved by the duplicate detection step above.
# clusts and transpositions were saved as object arrays, hence allow_pickle=True.
# NOTE: allow_pickle=True unpickles the file content, only load files produced by this notebook.
clusts = np.load("../output/clusts.npy", allow_pickle=True)
found = np.load("../output/found.npy")
transpositions = np.load("../output/transpositions.npy", allow_pickle=True)

In [None]:
# Update the dataframe using the clusters of duplicates and their flip flags.
# NOTE(review): handle_duplicates is not imported explicitly in this notebook,
# presumably it comes from `from utils.boxes import *` - TODO confirm.
df = handle_duplicates(df, clusts, transpositions, plot=True)