In [1]:
import cv2
import numpy as np
import pandas as pd
import os
pd.set_option('display.max_rows', None)


In [2]:
# Location of the exported labeling checklist.
# NOTE(review): this is an absolute, user-specific path, so the notebook only runs as-is on the machine it was written on.
label_path = "C:\\Users\\tfurr\\Downloads\\Labeling Checklist - Sheet1 (1).csv"

# Load the whole sheet with pandas' default parsing (no dtype or index options are given).
df = pd.read_csv(label_path)

In [3]:
# Preview the first rows to confirm the sheet loaded with the expected columns.
df.head()

Unnamed: 0,id,manifestid,documentid,Full,uploaddatetimejsonmetadata,Folder,File,LABEL,SIGN OUT,ALLIGATOR,CATEGORY 1,CATEGORY 2,CATEGORY 3,POOR QUALITY,NOTES,onpremfilepathjsonmetadata,Unnamed: 16,Unnamed: 17
0,EXAMPLE,EXAMPLE,EXAMPLE,EXAMPLE,EXAMPLE,EXAMPLE,EXAMPLE,Vogt,4/27,True,dunnage,strap,airbag,True,...,EXAMPLE,PROGRESS,
1,EXAMPLE,EXAMPLE,EXAMPLE,EXAMPLE,EXAMPLE,EXAMPLE,EXAMPLE,Vogt,4/27,False,restack,return to level,pass,False,...,EXAMPLE,697,<-- Count
2,180426,43417541,83328644,TRUE,4/7/2023 7:25,328,644-0.jpg,Vogt,4/29,False,pass,,,False,,\\\\offtffs01\\Transflo\\Storage\\LHTRAILER\\C...,13.94%,<-- % Complete
3,180429,43418071,83328656,FALSE,4/7/2023 7:26,328,656-0.jpg,Vogt,4/29,False,dunnage,,,False,,\\\\offtffs01\\Transflo\\Storage\\LHTRAILER\\C...,,
4,180421,43416911,83328720,FALSE,4/7/2023 7:24,328,720-0.jpg,Vogt,4/29,False,pass,,,False,,\\\\offtffs01\\Transflo\\Storage\\LHTRAILER\\C...,,


In [4]:
# Narrow the sheet to the columns used for duplicate detection.
data1 = df.loc[:, ['id', 'manifestid', 'Folder', 'File', 'SIGN OUT', 'CATEGORY 1']]

# A blank SIGN OUT means the row has not been labeled yet, so those rows are
# discarded. dropna returns a new frame, which leaves data1 untouched.
data = data1.dropna(subset=['SIGN OUT'])
data.shape

(699, 6)

In [5]:

# The sheet begins with illustrative rows whose id is the literal text 'EXAMPLE'.
# Remove them by value rather than by position: a positional slice (data[2:])
# throws away two more real rows every time this cell is re-run, whereas this
# filter is a no-op on the second run.
# The explicit copy makes `data` an independent frame, so the column assignment
# in a later cell does not write into a slice of another frame.
data = data[data['id'] != 'EXAMPLE'].copy()
data.shape

(697, 6)

In [6]:
# Inspect the first 20 remaining rows after the example rows were removed.
data.head(20)

Unnamed: 0,id,manifestid,Folder,File,SIGN OUT,CATEGORY 1
2,180426,43417541,328,644-0.jpg,4/29,pass
3,180429,43418071,328,656-0.jpg,4/29,dunnage
4,180421,43416911,328,720-0.jpg,4/29,pass
5,180420,43416841,328,779-0.jpg,4/29,airbag
6,180422,43416961,328,780-0.jpg,4/29,pass
7,180423,43416961,328,781-0.jpg,4/29,pass
8,180424,43416961,328,782-0.jpg,4/29,airbag
9,180425,43416961,328,783-0.jpg,4/29,return to level
10,180427,43417881,328,784-0.jpg,4/29,pass
11,180428,43418021,328,786-0.jpg,4/29,strap


In [7]:
# We need to see if we have labeled a picture as a duplicate. This makes it so that CATEGORY 1 is empty except for ones that have Duplicate in them


def update_category(row):
    """Keep a row's CATEGORY 1 text only when it marks the image as a duplicate.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'CATEGORY 1' entry.

    Returns
    -------
    str
        The original CATEGORY 1 text if it contains the word "duplicate" in
        any letter case; otherwise the empty string. Missing values (NaN/None)
        and values that are not text also give the empty string.
    """
    label = row['CATEGORY 1']

    # Non-text cells (NaN, None, stray numbers) cannot carry a duplicate note;
    # checking the type first also avoids a TypeError from the `in` test below.
    if not isinstance(label, str):
        return ''

    # Compare case-insensitively so hand-typed variants such as 'duplicate'
    # are recognised alongside 'Duplicate' and 'DUPLICATE'.
    return label if 'duplicate' in label.lower() else ''
    
# Run update_category on every row (axis=1) and overwrite CATEGORY 1 with the result,
# so the original category labels are no longer available from `data` after this cell.
data['CATEGORY 1'] = data.apply(update_category, axis=1)

In [8]:
# Show the frame to eyeball the rewritten CATEGORY 1 column.
# NOTE(review): display.max_rows is set to None in the first cell, so this renders every row.
data

Unnamed: 0,id,manifestid,Folder,File,SIGN OUT,CATEGORY 1
2,180426,43417541,328,644-0.jpg,4/29,
3,180429,43418071,328,656-0.jpg,4/29,
4,180421,43416911,328,720-0.jpg,4/29,
5,180420,43416841,328,779-0.jpg,4/29,
6,180422,43416961,328,780-0.jpg,4/29,
7,180423,43416961,328,781-0.jpg,4/29,
8,180424,43416961,328,782-0.jpg,4/29,
9,180425,43416961,328,783-0.jpg,4/29,
10,180427,43417881,328,784-0.jpg,4/29,
11,180428,43418021,328,786-0.jpg,4/29,


In [9]:
# Sanity check: per-column count of cells that hold the empty string.
data.eq('').sum()

id              0
manifestid      0
Folder          0
File            0
SIGN OUT        0
CATEGORY 1    575
dtype: int64

In [10]:
def compare_images(image_path1, image_path2, target_size=(1900, 2600)):
    """Score how alike two image files are using normalized cross-correlation.

    Both files are read as grayscale. If their dimensions differ, each one is
    resized to ``target_size``. Both are then blurred with a 5x5 Gaussian
    kernel and compared with ``cv2.matchTemplate`` (``TM_CCOEFF_NORMED``).

    Parameters
    ----------
    image_path1, image_path2 : str
        Paths of the two image files to compare.
    target_size : tuple of int, default (1900, 2600)
        Size passed to ``cv2.resize`` when the two images differ in shape.
        Per the OpenCV documentation this is interpreted as (width, height).

    Returns
    -------
    float
        The maximum value of the match result. Returns 0.0, after printing a
        message, if either file could not be read.
    """
    img1 = cv2.imread(image_path1, cv2.IMREAD_GRAYSCALE)
    img2 = cv2.imread(image_path2, cv2.IMREAD_GRAYSCALE)

    # cv2.imread signals an unreadable or missing file by returning None.
    if img1 is None or img2 is None:
        print(f"Error: Failed to read one or both of the images: '{image_path1}', '{image_path2}'")
        return 0.0

    # Images of different sizes are both brought to target_size. After this the
    # shapes are always equal (either they already matched, or both were
    # resized to the same size), so no further dimension check is needed.
    if img1.shape != img2.shape:
        img1 = cv2.resize(img1, target_size)
        img2 = cv2.resize(img2, target_size)

    # Apply Gaussian Blur to reduce noise in each photo
    img1 = cv2.GaussianBlur(img1, (5, 5), 0)
    img2 = cv2.GaussianBlur(img2, (5, 5), 0)

    # Template matching with equally sized image and template; minMaxLoc
    # returns (min_val, max_val, min_loc, max_loc) and the maximum is the score.
    result = cv2.matchTemplate(img1, img2, cv2.TM_CCOEFF_NORMED)
    _, best_score, _, _ = cv2.minMaxLoc(result)

    return best_score


In [11]:
#compare_images("C:\\Users\\tfurr\\OneDrive\\Documents\\Photos_all\\342\\342\\687-0.jpg", "C:\\Users\\tfurr\\OneDrive\\Documents\\Photos_all\\342\\342\\688-0.jpg")

In [12]:
def check_duplicates(base_image_path, sim_score, frame=None):
    '''
    Flag likely duplicate photos among rows that share a manifestid.

    base_image_path: the path to where the images are located in your folder
    sim_score: The minimum similarity score threshold. Above this and the two photos will be labeled as duplicates
    frame: optional DataFrame with 'manifestid', 'Folder' and 'File' columns.
        When omitted, the notebook-level `data` frame is used.

    Returns a NEW DataFrame: a copy of the input plus a 'Test' column holding
    '' for unflagged rows and "Duplicate of <file>" for rows whose image scored
    above sim_score against an earlier image of the same manifest.

    The input frame is never modified. Each call therefore yields an
    independent result, and running the function with several thresholds does
    not overwrite the output of earlier calls.
    '''
    source = data if frame is None else frame

    result = source.copy()
    result['Test'] = ''  # Create a new column 'Test'

    # Folder values are compared as text so the lookup works whether pandas
    # parsed the column as strings or as numbers.
    folder_text = result['Folder'].astype(str)

    # Group rows by manifestid; only photos of the same manifest are compared.
    for _, group in source.groupby('manifestid'):
        if len(group) <= 1:
            continue  # Skip groups with only one image

        # Keep folder and file next to the built path so they never have to be
        # parsed back out of the backslash-separated path string.
        images = [
            (
                str(row['Folder']),
                row['File'],
                f"{base_image_path}\\{row['Folder']}\\{row['Folder']}\\{row['File']}",
            )
            for _, row in group.iterrows()
        ]

        # Compare every unordered pair once; the later image of a matching
        # pair is the one that gets labeled.
        for i, (_, first_file, first_path) in enumerate(images):
            for second_folder, second_file, second_path in images[i + 1:]:
                similarity = compare_images(first_path, second_path)

                if similarity > sim_score:
                    # Update 'Test' column with the duplicate information for the second image only
                    is_second = (folder_text == second_folder) & (result['File'] == second_file)
                    result.loc[is_second, 'Test'] = f"Duplicate of {first_file}"
    return result

In [None]:
# Run the duplicate check at two similarity thresholds.
# Each result is snapshotted with .copy() so c80 and c52 can never refer to the
# same underlying frame; otherwise the second run would overwrite the first
# run's 'Test' column and both cross-tabs would be identical.
photos_dir = "C:\\Users\\tfurr\\OneDrive\\Documents\\Photos_all"
c80 = check_duplicates(photos_dir, 0.8).copy()
c52 = check_duplicates(photos_dir, 0.52).copy()

In [None]:
# Show the result of the 0.8-threshold run.
# NOTE(review): display.max_rows is set to None in the first cell, so this renders every row.
c80

In [None]:
# If cell value is not empty it returns True and returns False otherwise. This is so we can see those that are labeled as duplicates

def is_empty(cell_value):
    """Report whether a cell holds anything other than the empty string.

    Note that, despite the name, the result is True for a NON-empty cell and
    False only when the cell equals ''.
    """
    return bool(cell_value != '')




In [None]:
def create_cross_tab(data):
    """Cross-tabulate the 'CATEGORY 1' flags against the 'Test' flags.

    Each column is first mapped through is_empty, which gives True for a
    non-empty cell. Rows of the returned table correspond to 'CATEGORY 1' and
    columns to 'Test'.
    """
    category_flag = data['CATEGORY 1'].apply(is_empty)
    test_flag = data['Test'].apply(is_empty)
    return pd.crosstab(category_flag, test_flag)

In [None]:
# Cross-tab of CATEGORY 1 flags vs. 'Test' flags for the 0.52-threshold run.
ct52 = create_cross_tab(c52)

In [None]:
# Cross-tab of CATEGORY 1 flags vs. 'Test' flags for the 0.8-threshold run.
ct80 = create_cross_tab(c80)

In [None]:
# Display the 0.52-threshold cross-tab (rows: CATEGORY 1 non-empty, columns: Test non-empty).
ct52

In [None]:
# Display the 0.8-threshold cross-tab (rows: CATEGORY 1 non-empty, columns: Test non-empty).
ct80