In [27]:
import contextlib
import sqlite3
import urllib.parse
from pathlib import Path

from label_studio_sdk.client import LabelStudio
from label_studio_sdk.projects.client_ext import ProjectExt
from label_studio_sdk.types import BaseUser
from tqdm.autonotebook import tqdm

from backend import get_file_checksum, get_project_export
from config import settings

In [23]:
data_root = Path(r'\\e4e-nas.ucsd.edu\fishsense_data\REEF\data')
data_db = Path(r'\\e4e-nas.ucsd.edu\fishsense_data\REEF\processing.db')

In [2]:
export = get_project_export(
    project_id=settings.label_studio.laser_project_id,
    label_studio_api_key=settings.label_studio.api_key,
    label_studio_host=settings.label_studio.host
)

In [26]:
images = {task['id']:Path(urllib.parse.parse_qs(urllib.parse.urlparse(task['data']['img']).query)['d'][0]).relative_to('fs_png_labeling_project_laser/REEF/data').with_suffix('.ORF').as_posix() for task in export}
images

{33564: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010001.ORF',
 33565: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010002.ORF',
 33566: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010003.ORF',
 33567: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010004.ORF',
 33568: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010005.ORF',
 33569: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010006.ORF',
 33570: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010007.ORF',
 33571: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010008.ORF',
 33572: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010009.ORF',
 33573: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010010.ORF',
 33574: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010011.ORF',
 33575: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010012.ORF',
 33576: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010013.ORF',
 33577: '2023-09-07 REEF Data Dump/080123_FSL-01 Photos/P8010014.ORF',
 33578

In [39]:
duplicate_task_ids = {}
bad_paths = []
with contextlib.closing(sqlite3.connect(data_db)) as con, contextlib.closing(con.cursor()) as cur:
    for task_id, task_file in tqdm(images.items()):
        cur.execute('SELECT * FROM images WHERE images.path = :path LIMIT 1;', {'path': task_file})
        result = cur.fetchall()
        if len(result) == 1:
            # this is the primary
            cur.execute('UPDATE images SET laser_task_id = :task_id WHERE images.path = :path',
                        {
                            'task_id': task_id,
                            'path': task_file
                        })
            con.commit()
            continue
        # This is a duplicate file
        full_image_path = data_root.joinpath(task_file)
        if not full_image_path.exists():
            bad_paths.append((full_image_path, task_id))
            continue
        checksum = get_file_checksum(full_image_path)
        if checksum not in duplicate_task_ids:
            duplicate_task_ids[checksum] = [task_id]
        else:
            duplicate_task_ids[checksum].append(task_id)


100%|██████████| 64614/64614 [12:03<00:00, 89.36it/s] 


In [40]:
bad_paths

[(WindowsPath('//e4e-nas.ucsd.edu/fishsense_data/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator CenterDrift_FSL04/P6200004.ORF'),
  100210),
 (WindowsPath('//e4e-nas.ucsd.edu/fishsense_data/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator CenterDrift_FSL04/P6200005.ORF'),
  100213),
 (WindowsPath('//e4e-nas.ucsd.edu/fishsense_data/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator CenterDrift_FSL04/P6200006.ORF'),
  100216),
 (WindowsPath('//e4e-nas.ucsd.edu/fishsense_data/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator CenterDrift_FSL04/P6200007.ORF'),
  100219),
 (WindowsPath('//e4e-nas.ucsd.edu/fishsense_data/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator CenterDrift_FSL04/P6200008.ORF'),
  100222),
 (WindowsPath('//e4e-nas.ucsd.edu/fishsense_data/REEF/data/2024.06.20.REEF/06_2024_Alligator/062024_Alligator CenterDrift_FSL04/P6200009.ORF'),
  100225),
 (WindowsPath('//e4e-nas.ucsd.edu/fishsense_data/REEF/data/2024.06.20.