In [1]:
from hashlib import md5
from pathlib import Path
from tqdm.notebook import tqdm
import pickle

In [37]:
# NAS directory that is searched for JPEGs, plus the pickle file that caches
# the MD5 digest of each NAS file between runs.
NAS_PATH = Path("~/data/fishsense_process_work/preprocess_laser_jpeg").expanduser()
NAS_CHECKSUM_DICT_PATH = Path("~/data/nas_checksum_dict.pkl").expanduser()

# Google Drive folder holding the JPEGs that need to be matched to NAS files.
GDRIVE_PATH = Path("~/data/gdrive/E4E Projects/FishSense/Shared with Collaborators/REEF/FishSpecies").expanduser()

# Pickle file that caches the Google Drive -> NAS mapping between runs.
MAP_DICT_PATH = Path("~/data/map_dict.pkl").expanduser()

# CSV export of the final mapping, written relative to the working directory.
MAP_CSV = Path("./species_map.csv")

# Sanity check: show which of the configured locations currently exist.
tuple(
    location.exists()
    for location in (NAS_PATH, NAS_CHECKSUM_DICT_PATH, GDRIVE_PATH, MAP_DICT_PATH, MAP_CSV)
)

(True, True, True, True, False)

In [3]:
# Collect the *.JPG files that sit directly inside the Google Drive folder
# (the pattern is not recursive), then show how many were found.
gdrive_jpgs = [*GDRIVE_PATH.glob("*.JPG")]

len(gdrive_jpgs)

3000

In [4]:
# Collect the *.JPG files that sit directly inside the NAS directory
# (the pattern is not recursive), then show how many were found.
nas_jpgs = [*NAS_PATH.glob("*.JPG")]

len(nas_jpgs)

40091

In [12]:
def search_nas_by_checksum(checksum: str) -> Path | None:
    """Return the NAS JPEG whose MD5 hex digest equals ``checksum``.

    Digests computed by earlier calls are read from the pickle cache at
    ``NAS_CHECKSUM_DICT_PATH`` (a ``{path: md5 hex digest}`` dict, re-read on
    every call). On a cache miss, the files in ``nas_jpgs`` that are not yet
    cached are hashed one by one and the growing cache is written back to disk
    as the scan proceeds, so an interrupted scan does not lose its work.

    Args:
        checksum: MD5 hex digest of the file content to look for.

    Returns:
        The path of the matching NAS file, or ``None`` when no file in
        ``nas_jpgs`` has that digest.
    """
    # NOTE(security): pickle.load can execute arbitrary code from the file it
    # reads; NAS_CHECKSUM_DICT_PATH must only point at a cache this notebook wrote.
    if NAS_CHECKSUM_DICT_PATH.exists():
        with open(NAS_CHECKSUM_DICT_PATH, "rb") as f:
            nas_checksums = pickle.load(f)
    else:
        nas_checksums = {}

    # Fast path: the digest was already recorded by an earlier scan.
    checksum_to_path = {v: k for k, v in nas_checksums.items()}
    if checksum in checksum_to_path:
        return checksum_to_path[checksum]

    # Stays None unless a file with the requested digest is actually found.
    # Returning the loop variable instead would report the last scanned file
    # as a (false) match and fail outright when nas_jpgs is empty.
    match = None
    cache_dirty = False  # True while the in-memory cache has unsaved entries

    for i, p in enumerate(tqdm(nas_jpgs)):
        if p in nas_checksums:
            continue

        nas_checksum = md5(p.read_bytes()).hexdigest()
        nas_checksums[p] = nas_checksum
        cache_dirty = True

        if nas_checksum == checksum:
            match = p
            break

        # Periodic checkpoint so a crash or interrupt loses little work.
        if i % 10 == 0:
            with open(NAS_CHECKSUM_DICT_PATH, "wb") as f:
                pickle.dump(nas_checksums, f)
            cache_dirty = False

    # Persist whatever was hashed since the last checkpoint; skip the write
    # entirely when this call added nothing to the cache.
    if cache_dirty:
        with open(NAS_CHECKSUM_DICT_PATH, "wb") as f:
            pickle.dump(nas_checksums, f)

    return match

In [27]:
# Resume from the cached Google Drive -> NAS mapping when one exists on disk;
# otherwise start with an empty mapping.
# NOTE(security): unpickling can execute arbitrary code; MAP_DICT_PATH must
# only point at a cache file written by this notebook.
map_dict = pickle.loads(MAP_DICT_PATH.read_bytes()) if MAP_DICT_PATH.exists() else {}

In [28]:
# Match every Google Drive JPEG to a NAS file by the MD5 digest of its bytes.
# Files already present in map_dict (from an earlier run) are left alone.
for i, p in enumerate(tqdm(gdrive_jpgs)):
    if p not in map_dict:
        tqdm.write(f"Processing {p.name} ({i+1}/{len(gdrive_jpgs)})")

        # Identify the image by its content rather than by its file name.
        checksum = md5(p.read_bytes()).hexdigest()
        nas_path = map_dict[p] = search_nas_by_checksum(checksum)

        tqdm.write(f"Mapped to {nas_path.name if nas_path else 'NOT FOUND'}")

        # Periodic checkpoint so an interrupted run can be resumed.
        if not i % 10:
            with MAP_DICT_PATH.open("wb") as f:
                pickle.dump(map_dict, f)

# Final save so the mappings added after the last checkpoint are kept.
with MAP_DICT_PATH.open("wb") as f:
    pickle.dump(map_dict, f)

  0%|          | 0/3000 [00:00<?, ?it/s]

Processing 98_20d2cda4cdbd092a6700c4aa4a203fbf.JPG (2992/3000)
Mapped to e200079e00ed7d6373044549e707ebb3.JPG
Processing 99_e200079e00ed7d6373044549e707ebb3.JPG (2993/3000)
Mapped to 7ae84c5c16b767b8f65ca559c373ed65.JPG
Processing 100_7ae84c5c16b767b8f65ca559c373ed65.JPG (2994/3000)
Mapped to 6313f95dc12c1dba9c5fdf4ff2338193.JPG
Processing 101_6313f95dc12c1dba9c5fdf4ff2338193.JPG (2995/3000)
Mapped to 95486b0e06655b35245d62e5241859b7.JPG
Processing 102_95486b0e06655b35245d62e5241859b7.JPG (2996/3000)
Mapped to d110412cbcbe415518dc324502d758e9.JPG
Processing 103_d110412cbcbe415518dc324502d758e9.JPG (2997/3000)
Mapped to 66913809f7599623d11e2bbfd697c001.JPG
Processing 104_66913809f7599623d11e2bbfd697c001.JPG (2998/3000)
Mapped to 0a0f2151ac4f7a814d1bc62987eea562.JPG
Processing 3_0ea63f5f073b0c6c4749c86ddd666925.JPG (2999/3000)
Mapped to f18aebcf2cfe3a390c3407d05085d0de.JPG
Processing 4_f18aebcf2cfe3a390c3407d05085d0de.JPG (3000/3000)
Mapped to cf1ea9b270e1c3a0894ab38bae4206b4.JPG


In [34]:
# Reduce the path -> path mapping to checksum -> checksum pairs.
# Stage 1: keep file names only. Entries whose NAS match is None (no file
# found) are skipped; calling .name on them would raise AttributeError.
matched_names = {k.name: v.name for k, v in map_dict.items() if v is not None}

# Stage 2: drop everything up to the first underscore from the Google Drive
# name (names look like "98_<checksum>.JPG" in the output above), keeping the
# token that follows it.
names_by_checksum = {k.split('_')[1]: v for k, v in matched_names.items()}

# Stage 3: keep only pairs whose names disagree, and strip the extension so
# both sides are bare checksums.
filtered_map_dict = {
    k.split('.')[0]: v.split('.')[0]
    for k, v in names_by_checksum.items()
    if k != v
}

filtered_map_dict

{'8425323c429395cca532d409b5aa3b54': '4ae8c7448e81014842009e0d0453273f',
 '4ae8c7448e81014842009e0d0453273f': 'ee747067df21bd56a197e4d7df50063c',
 'ee747067df21bd56a197e4d7df50063c': 'a26d7ce1b2eec2e59b70aa0dcb3231b1',
 'a26d7ce1b2eec2e59b70aa0dcb3231b1': 'c82857290465d8837d62d456cf902baa',
 'c82857290465d8837d62d456cf902baa': '726e0e790528188a7f2e931b9ee4a367',
 '726e0e790528188a7f2e931b9ee4a367': 'ee70151082fce6d51ad244a664dc61b5',
 'ee70151082fce6d51ad244a664dc61b5': 'dfbf48a2ef307c3b95f1ba3a1cdd2eae',
 'dfbf48a2ef307c3b95f1ba3a1cdd2eae': '3e94e4e399718a9a8a69061509a76372',
 '3e94e4e399718a9a8a69061509a76372': '65fdfc8838c39dfe933a3080382b206e',
 '65fdfc8838c39dfe933a3080382b206e': 'e4360b0eee9415486b652c686ec5d00f',
 'a9ceb6fe3169d04a574681a724317b8d': '95525d52c14e3439bb87b2a3792d36ed',
 '95525d52c14e3439bb87b2a3792d36ed': 'a1c104cc113fb475142c3895a236056f',
 'a1c104cc113fb475142c3895a236056f': '0bfabfec6b05948d2cec2733611a6026',
 '0bfabfec6b05948d2cec2733611a6026': '12e88514650ff

In [39]:
# Export the checksum pairs as a two-column CSV: a header row followed by one
# "gdrive,nas" line per entry of filtered_map_dict.
with MAP_CSV.open("w") as f:
    f.write("gdrive_checksum,nas_checksum\n")
    f.writelines(f"{key},{value}\n" for key, value in filtered_map_dict.items())