In [1]:
import numpy as np
import os
from typing import List
import cv2
import scipy.ndimage

# Functions

In [2]:
def folderToPaths(folder: str):
    """Return the names of all entries in ``folder``, sorted in ascending order.

    Args:
        folder: Directory to list.

    Returns:
        A list with the bare entry names reported by ``os.listdir`` (they are
        not joined with ``folder``), sorted with the default string ordering.
    """
    return sorted(os.listdir(folder))


def filterTuples(l: list,
                 fileFormatDiscriminator: str = 'png',
                 pos1: int = -5,
                 pos2: int = -6,
                 idFile1: str = '1',
                 idFile2: str = '2'):
    """Split a list of file names into unpaired files and (first, second) pairs.

    Every name carries a one-character identifier. It is read at index
    ``pos1`` when ``fileFormatDiscriminator`` occurs anywhere in the name and
    at index ``pos2`` otherwise. A name whose identifier equals ``idFile1`` is
    paired with the entry directly after it when that entry's identifier
    equals ``idFile2``; otherwise it is reported as a singleton. Names with
    any other identifier are never reported on their own, only as the second
    half of a pair.

    Every entry is classified, including the last one, so a trailing unpaired
    first file is reported even when it follows a complete pair or is the
    only entry in the list.

    Args:
        l: File names, ordered so that the two files of a pair are adjacent
            (first file, then second file).
        fileFormatDiscriminator: Substring that selects ``pos1`` over ``pos2``.
        pos1: Identifier index for names containing the discriminator.
        pos2: Identifier index for all other names.
        idFile1: Identifier character of the first file of a pair.
        idFile2: Identifier character of the second file of a pair.

    Returns:
        ``(singletons, tuples)``: the unpaired first files and the list of
        ``(first, second)`` name pairs, both in input order.

    Raises:
        IndexError: If a name is too short to be indexed at its identifier
            position.
    """
    def fileId(name):
        # The identifier position depends on whether the discriminator is in the name.
        return name[pos1 if fileFormatDiscriminator in name else pos2]

    tuples = []
    singletons = []
    lastIdx = len(l) - 1
    for idx, name in enumerate(l):
        if fileId(name) != idFile1:
            # Second files are picked up by the first file preceding them.
            continue

        if idx < lastIdx and fileId(l[idx + 1]) == idFile2:
            tuples.append((name, l[idx + 1]))
        else:
            # No matching second file follows (or this is the last entry).
            singletons.append(name)

    return singletons, tuples


def checkTuples(tuples: List[tuple], patientIDLength: int = 6):
    """Assert that the two file names of every pair share the same prefix.

    Args:
        tuples: Pairs of file names; only the first two items of each pair
            are inspected.
        patientIDLength: Number of leading characters that must be identical
            in both names of a pair.

    Raises:
        AssertionError: If the prefixes of any pair differ.
    """
    for t in tuples:
        assert t[0][:patientIDLength] == t[1][:patientIDLength], \
            "Patient ID mismatch in pair: {!r} / {!r}".format(t[0], t[1])


def checkLength(tuples: List[tuple], singletons: list, targetLen: int):
    """Assert that the unique pairs and unique singletons add up to ``targetLen``.

    Each distinct pair counts as two files and each distinct singleton as one;
    duplicates inside either collection are counted once.

    Raises:
        AssertionError: If the computed total differs from ``targetLen``.
    """
    filesInPairs = 2 * len(set(tuples))
    filesAlone = len(set(singletons))
    assert filesInPairs + filesAlone == targetLen


def detectNoisyImagesPaths(path1, path2, folderPath):
    """Return the path of the image whose filtered response sums higher.

    Both images are read as grayscale, filtered with a Laplacian-of-Gaussian
    (``sigma = 1``) and reduced to the sum of the filter output. The image
    with the larger sum is the one reported (treated as the noisier one, going
    by the function name).

    Args:
        path1: File name of the first image, relative to ``folderPath``.
        path2: File name of the second image, relative to ``folderPath``.
        folderPath: Folder that contains both images.

    Returns:
        ``path1`` or ``path2``, whichever has the larger sum, or ``None``
        when both sums are equal.

    Raises:
        FileNotFoundError: If either image cannot be read.
    """
    sums = []
    for path in (path1, path2):
        fullPath = os.path.join(folderPath, path)
        im = cv2.imread(fullPath, flags = cv2.IMREAD_GRAYSCALE)
        if im is None:
            # cv2.imread reports a missing or undecodable file by returning None.
            raise FileNotFoundError("Could not read image: {}".format(fullPath))

        # NOTE(review): gaussian_laplace keeps the input dtype by default
        # (presumably uint8 here), so negative responses are not representable
        # and the sum below depends on that. Confirm the intent before
        # switching this filter to a float dtype.
        sums.append(np.sum(scipy.ndimage.gaussian_laplace(im, sigma = 1)))

    sum1, sum2 = sums
    if sum1 > sum2:
        return path1
    if sum2 > sum1:
        return path2
    return None


def detectBlurredImagesPaths(path1, path2, folderPath):
    """Return the path of the image with the lower Laplacian-of-Gaussian variance.

    Both images are read as grayscale, converted to float64, filtered with a
    Laplacian-of-Gaussian (``sigma = 1``) and reduced to the variance of the
    filter output. The image with the smaller variance is the one reported
    (treated as the blurrier one, going by the function name).

    Args:
        path1: File name of the first image, relative to ``folderPath``.
        path2: File name of the second image, relative to ``folderPath``.
        folderPath: Folder that contains both images.

    Returns:
        ``path1`` or ``path2``, whichever has the smaller variance, or
        ``None`` when both variances are equal.

    Raises:
        FileNotFoundError: If either image cannot be read.
    """
    variances = []
    for path in (path1, path2):
        fullPath = os.path.join(folderPath, path)
        im = cv2.imread(fullPath, flags = cv2.IMREAD_GRAYSCALE)
        if im is None:
            # cv2.imread reports a missing or undecodable file by returning None.
            raise FileNotFoundError("Could not read image: {}".format(fullPath))

        # gaussian_laplace returns the input dtype by default; filtering an
        # integer image (presumably uint8 here) cannot represent the negative
        # half of the response, which would corrupt the variance. Work in float.
        response = scipy.ndimage.gaussian_laplace(im.astype(np.float64), sigma = 1)
        variances.append(np.var(response))

    var1, var2 = variances
    if var1 < var2:
        return path1
    if var2 < var1:
        return path2
    return None


def getNoisyImagesPaths(tuples, folderPath):
    """Collect, for every pair, the file picked by ``detectNoisyImagesPaths``.

    Args:
        tuples: Pairs of file names; the first two items of each pair are
            compared.
        folderPath: Folder that contains the files.

    Returns:
        The picked file names in pair order. Pairs for which the detector
        returns ``None`` contribute nothing.
    """
    picks = (
        detectNoisyImagesPaths(path1 = pair[0], path2 = pair[1], folderPath = folderPath)
        for pair in tuples
    )
    return [picked for picked in picks if picked is not None]


def getBlurredImagesPaths(tuples, folderPath):
    """Collect, for every pair, the file picked by ``detectBlurredImagesPaths``.

    Args:
        tuples: Pairs of file names; the first two items of each pair are
            compared.
        folderPath: Folder that contains the files.

    Returns:
        The picked file names in pair order. Pairs for which the detector
        returns ``None`` contribute nothing.
    """
    picks = (
        detectBlurredImagesPaths(path1 = pair[0], path2 = pair[1], folderPath = folderPath)
        for pair in tuples
    )
    return [picked for picked in picks if picked is not None]

In [3]:
def __sharpenImage(ima):
    """Sharpen ``ima`` by filtering it with a fixed 3x3 kernel.

    The kernel weighs the centre pixel with 9 and each of its eight
    neighbours with -1, so the weights sum to 1.

    Args:
        ima: Image array accepted by ``cv2.filter2D``.

    Returns:
        The filtered image as returned by ``cv2.filter2D``.
    """
    kernel = np.array([[-1, -1, -1],
                       [-1, 9, -1],
                       [-1, -1, -1]])

    # -1 is filter2D's ddepth argument (same depth as the source, per the
    # OpenCV documentation).
    return cv2.filter2D(ima, -1, kernel)

In [4]:
# Working folders, given relative to the notebook's working directory.
f = 'data/train_all_no_duplicates'  # folder whose entries are listed below
duplicates_folder, noisy_folder, temp_folder = (
    'data/train_duplicates_to_remove',
    'data/train_noisy',
    'data/temp',
)

# Sorted listing of `f`, then split into unpaired files and (first, second) pairs.
paths = folderToPaths(f)
singletons, tuples = filterTuples(l = paths)

In [5]:
# Sanity check: both file names of every pair must start with the same prefix.
checkTuples(tuples)

In [6]:
# Sanity check: unique pairs (two files each) plus unique singletons must add
# up to 15470, the hard-coded expected total.
checkLength(tuples = tuples, singletons = singletons, targetLen = 15470)

# Handle duplicate images

In [7]:
# For every pair, keep the file name that detectNoisyImagesPaths picks
# (pairs with equal scores contribute nothing).
noisyImagesPaths = getNoisyImagesPaths(tuples = tuples, folderPath = f)

In [None]:
# For every pair, keep the file name that detectBlurredImagesPaths picks
# (pairs with equal scores contribute nothing).
blurredImagesPaths = getBlurredImagesPaths(tuples = tuples, folderPath = f)

In [None]:
# First six characters of every flagged file name (presumably the patient ID,
# cf. checkTuples -- TODO confirm).
noisyImagesToRemoveIDs = [fileName[:6] for fileName in noisyImagesPaths]

In [None]:
# Only the files picked by the noisy check are scheduled for removal.
# This binds a second name to the same list object; no copy is made.
imagesToRemove = noisyImagesPaths

In [None]:
# Membership is tested against a set so the filter stays linear in the number
# of files instead of scanning the removal list once per path.
imagesToRemoveSet = set(imagesToRemove)
imagesToKeep = [x for x in paths if x not in imagesToRemoveSet]

### 15,470 -> 12,086 images after duplicate removal

In [None]:
# os.rename does not create missing directories, so make sure the target
# folder exists before moving anything into it.
os.makedirs(duplicates_folder, exist_ok = True)

# Move every file scheduled for removal out of the source folder, keeping its name.
for imPath in imagesToRemove:
    os.rename(os.path.join(f, imPath), os.path.join(duplicates_folder, imPath))