In [4]:
import urllib.parse
from pathlib import Path

import psycopg
from label_studio_sdk.client import LabelStudio
from label_studio_sdk.projects.client_ext import ProjectExt
from label_studio_sdk.types import BaseUser
from psycopg.rows import dict_row
from tqdm.auto import tqdm

from fishsense_data_processing_spider.backend import (get_file_checksum,
                                                      get_project_export)
from fishsense_data_processing_spider.config import PG_CONN_STR, settings

In [10]:
data_root = Path('./mnt/REEF/data')

In [None]:
export = get_project_export(
    project_id=10,
    label_studio_api_key=settings.label_studio.api_key,
    label_studio_host=settings.label_studio.host
)

In [6]:
images = {task['id']: Path(urllib.parse.parse_qs(urllib.parse.urlparse(task['data']['img']).query)['d'][0]).relative_to(
    'fs_png_labeling_project_laser/REEF/data').with_suffix('.ORF').as_posix() for task in export}
images

{33685: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010122.ORF',
 33713: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010150.ORF',
 33759: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010196.ORF',
 33771: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010208.ORF',
 33772: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010209.ORF',
 33815: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010252.ORF',
 33863: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010300.ORF',
 34014: '2023.08.03.FishSense.FSL-01D/H slate dive 1/P8030003.ORF',
 34125: '2023-09-07 REEF Data Dump/082923_Pool Calibration/082329_Slate_FSL05/P8230001.ORF',
 34196: '2023-09-07 REEF Data Dump/082923_Pool Calibration/082923_Slate_FSL02/P8290016.ORF',
 34202: '2023-09-07 REEF Data Dump/082923_Pool Calibration/082929_Slate_FSL04/P8290017.ORF',
 34235: '2023-09-07 REEF Data Dump/082923_Pool Calibration/082929_Slate_FSL06/P8290023.ORF',
 34399: '2023-09-07 REEF Data Dump/082923_Pool Calibration/0829

In [11]:
duplicate_task_ids = {}
bad_paths = []
with psycopg.connect(PG_CONN_STR) as con, con.cursor() as cur:
    for task_id, task_file in tqdm(images.items()):
        cur.execute(
            'SELECT * FROM images WHERE images.path = %(path)s LIMIT 1;', {'path': task_file})
        result = cur.fetchall()
        if len(result) == 1:
            # this is the primary
            cur.execute('UPDATE images SET laser_task_id = %(task_id)s WHERE images.path = %(path)s',
                        {
                            'task_id': task_id,
                            'path': task_file
                        })
            con.commit()
            continue
        # This is a duplicate file
        full_image_path = data_root.joinpath(task_file)
        if not full_image_path.exists():
            bad_paths.append((full_image_path, task_id))
            continue
        checksum = get_file_checksum(full_image_path)
        if checksum not in duplicate_task_ids:
            duplicate_task_ids[checksum] = [task_id]
        else:
            duplicate_task_ids[checksum].append(task_id)

  0%|          | 0/64614 [00:00<?, ?it/s]

In [12]:
bad_paths

[(PosixPath('mnt/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator Anchor_FSL04/P6200052.ORF'),
  100354),
 (PosixPath('mnt/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator Anchor_FSL04/P6200058.ORF'),
  100372),
 (PosixPath('mnt/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator Anchor_FSL04/P6200062.ORF'),
  100384),
 (PosixPath('mnt/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator Anchor_FSL04/P6200077.ORF'),
  100429),
 (PosixPath('mnt/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator Anchor_FSL04/P6200047.ORF'),
  100339),
 (PosixPath('mnt/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator Anchor_FSL04/P6200066.ORF'),
  100396),
 (PosixPath('mnt/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator Anchor_FSL04/P6200068.ORF'),
  100402),
 (PosixPath('mnt/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator CenterDrift_FSL04/P6200008.ORF'),
  100222),
 (PosixPath('mnt/REEF/data/2024.06.20.REEF/06_2024_