In [None]:
import datetime as dt
import os
from datetime import datetime

import requests
import yaml
from PIL import Image
from PIL.ExifTags import GPSTAGS, TAGS

import seabeepy as sb
from seabeepy.config import SETTINGS

# Process seabirds data uploaded from the field

A very rough notebook to detect raw data uploaded to `seabirds/fielduploads`, restructure it, and copy it to `seabirds/2024`.

**Needs improving**, but it's a start.

The script iterates over all folders in `seabirds/fielduploads`. For each folder:

 1. Checks if the number of uploaded data files (as listed by `get_file_list`) is equal to the `nfiles` attribute in the folder's `config.seabee.yaml`. If it is, the folder is ready for processing.
 
 2. Checks the folder for the presence of a file named `copied.txt`, which signifies that the folder has already been copied (and therefore should not be processed again). If `copied.txt` does *not* exist, the folder is processed.
 
 3. Folder names with **three** parts (e.g. `DJI_202401131645_002`) are considered to be "non-mission" folders. The data are copied without modification to `seabirds/2024/nonmission` and the config. file renamed to `rawconfig.seabee.yaml` (to stop the platform from trying to process it).
 
 4. Folder names with **four** parts (e.g. `DJI_202401131645_002_test`) are mission folders. If the `grouping`, `area` or `datetime` attributes are not already specified in the config. file (i.e. if any of them are set equal to `''`), the script will attempt to patch the config. file based on information read from image EXIF data. The `grouping` attribute is set as `'{county}-{municipality}'`; the `area` attribute is set as the nearest place name from the Kartverket API; and the `datetime` attribute is set based on the timestamp of the first image. If any of the three attributes is still empty after patching, the folder is skipped. Otherwise, a mission folder is created at `seabirds/2024/{grouping}_{area}_{datetime}` (lower-cased, with Norwegian characters replaced). The patched/updated config. file is copied to this folder, and any files with the extensions `.jpg`, `.jpeg`, `.nav`, `.obs`, `.bin` or `.mrk` are copied to an `images` subfolder. **Any other files will be ignored**.
 
 5. An empty text file named `copied.txt` is created in the raw upload folder on `seabirds/fielduploads` to signify the folder has been processed (so it will be skipped in step 2 when the script runs again).

In [None]:
# Functions copied from https://github.com/SeaBee-no/seabirds/blob/main/fielduploadstofolderTest.py


def get_exif_data(image):
    """Return the image's embedded EXIF metadata as a dict keyed by tag name.

    Numeric tag IDs are translated with ``TAGS``; IDs missing from that table
    are kept unchanged as keys. An image without EXIF data gives an empty dict.
    """
    raw = image._getexif()
    if not raw:
        return {}
    return {TAGS.get(tag_id, tag_id): tag_value for tag_id, tag_value in raw.items()}


def get_gps_info(exif_data):
    """Pull a signed (latitude, longitude) pair out of decoded EXIF data.

    Args
        exif_data: Dict. EXIF tags keyed by name (see get_exif_data).

    Returns
        Tuple (lat, lon) in decimal degrees, or None if there is no 'GPSInfo'
        entry or it lacks any of the latitude/longitude value or reference tags.
    """
    if "GPSInfo" not in exif_data:
        return None

    raw_gps = exif_data["GPSInfo"]
    gps = {GPSTAGS.get(tag_id, tag_id): raw_gps[tag_id] for tag_id in raw_gps.keys()}

    required = ("GPSLatitude", "GPSLongitude", "GPSLatitudeRef", "GPSLongitudeRef")
    if any(tag not in gps for tag in required):
        return None

    latitude = convert_to_degrees(gps["GPSLatitude"])
    longitude = convert_to_degrees(gps["GPSLongitude"])

    # Any reference other than "N" / "E" flips the sign of the coordinate
    if gps["GPSLatitudeRef"] != "N":
        latitude = -latitude
    if gps["GPSLongitudeRef"] != "E":
        longitude = -longitude

    return latitude, longitude


def convert_to_degrees(value):
    """Turn a (degrees, minutes, seconds) sequence of rationals into decimal degrees.

    Each of the first three items of ``value`` must expose ``numerator`` and
    ``denominator`` attributes.
    """

    def _as_float(rational):
        return float(rational.numerator) / float(rational.denominator)

    degrees = _as_float(value[0])
    minutes = _as_float(value[1])
    seconds = _as_float(value[2])
    return degrees + (minutes / 60.0) + (seconds / 3600.0)


def get_average_gps_and_first_timestamp(folder_path):
    """Summarise the position and start time of the JPEG images under a folder.

    Walks ``folder_path`` recursively and reads EXIF data from every file whose
    name ends in '.jpg' or '.jpeg' (case-insensitive). Images that cannot be
    processed are reported with a printed message and skipped.

    Args
        folder_path: Str. Folder to search for images.

    Returns
        Tuple (avg_latitude, avg_longitude, formatted_timestamp).
            avg_latitude, avg_longitude: Float or None. Mean of the GPS
                positions found, or None if no image yielded a position.
            formatted_timestamp: Str or None. 'DateTimeOriginal' (falling back
                to 'DateTime') of the first image, in sorted traversal order,
                that has one, reformatted as 'YYYYMMDDHHMM'. None if no
                timestamp was found or it could not be parsed.
    """
    latitudes = []
    longitudes = []
    first_timestamp = None

    for root, dirs, files in os.walk(folder_path):
        # Sorting 'dirs' in place fixes the order in which os.walk descends, and
        # sorting the file names fixes the order within each folder, so the
        # "first" image is the same on every run
        dirs.sort()
        for file in sorted(files):
            # Require the dot so names that merely end in "jpg" are not opened
            if not file.lower().endswith((".jpg", ".jpeg")):
                continue
            file_path = os.path.join(root, file)
            try:
                with Image.open(file_path) as img:
                    exif_data = get_exif_data(img)
                    gps_info = get_gps_info(exif_data)
                    if gps_info:
                        latitudes.append(gps_info[0])
                        longitudes.append(gps_info[1])
                    if not first_timestamp:
                        first_timestamp = exif_data.get(
                            "DateTimeOriginal"
                        ) or exif_data.get("DateTime")
            except Exception as e:
                # Deliberately broad: one unreadable image must not stop the scan
                print(f"Error processing {file_path}: {e}")

    if latitudes and longitudes:
        avg_latitude = sum(latitudes) / len(latitudes)
        avg_longitude = sum(longitudes) / len(longitudes)
    else:
        avg_latitude = None
        avg_longitude = None

    formatted_timestamp = None
    if first_timestamp:
        try:
            date_time_obj = datetime.strptime(first_timestamp, "%Y:%m:%d %H:%M:%S")
            formatted_timestamp = date_time_obj.strftime("%Y%m%d%H%M")
        except (ValueError, TypeError) as e:
            # ValueError: unexpected format; TypeError: tag value is not a string
            print(f"Error formatting timestamp: {e}")

    return avg_latitude, avg_longitude, formatted_timestamp


def get_place_name(lat, lon, radius=1000, timeout=30):
    """Use Kartverket's API to get place name based on latitude and longitude.

    Args
        lat: Float. Latitude in decimal degrees.
        lon: Float. longitude in decimal degrees.
        radius: Int. Search radius for place names in metres.
        timeout: Float. Seconds to wait for the API before giving up.

    Returns
        Str or None. Nearest place name. None if the request fails, the API
        does not answer with HTTP 200, or the response contains no names.
    """
    params = {
        "nord": lat,
        "ost": lon,
        "koordsys": 4326,
        "radius": int(radius),
        "treffPerSide": 500,
    }
    try:
        # Without a timeout a stalled connection would block the script forever
        response = requests.get(
            "https://ws.geonorge.no/stedsnavn/v1/punkt",
            params=params,
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"Place name request failed: {e}")
        return None

    if response.status_code != 200:
        return None

    data = response.json()
    names = data.get("navn") if isinstance(data, dict) else None
    if not names:
        return None

    closest = min(names, key=lambda x: x["meterFraPunkt"])
    return closest["stedsnavn"][0]["skrivemåte"]


def get_municipality_and_county(lat, lon, timeout=30):
    """Use Kartverket's API to get municipality and county based on latitude and longitude.

    Args
        lat: Float. Latitude in decimal degrees.
        lon: Float. Longitude in decimal degrees.
        timeout: Float. Seconds to wait for the API before giving up.

    Returns
        Tuple (municipality, county). Each is the value of 'kommunenavn' /
        'fylkesnavn' in the response, or 'Unknown' if that key is absent.
        (None, None) if the request fails or the API does not answer with
        HTTP 200.
    """
    params = {"nord": lat, "ost": lon, "koordsys": 4326}
    try:
        # Without a timeout a stalled connection would block the script forever
        response = requests.get(
            "https://ws.geonorge.no/kommuneinfo/v1/punkt",
            params=params,
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"Municipality request failed: {e}")
        return None, None

    if response.status_code != 200:
        return None, None

    data = response.json()
    municipality = data.get("kommunenavn", "Unknown")
    county = data.get("fylkesnavn", "Unknown")
    return municipality, county


def get_file_list(folder_path):
    """Get a list of file paths for transfer to the images folder.

    Only the top level of ``folder_path`` is searched. A file is included when
    its name ends (case-insensitively) with one of the extensions .jpg, .jpeg,
    .nav, .obs, .bin or .mrk.

    Args
        folder_path: Str. Folder to search.

    Returns
        List of str. ``folder_path`` joined with each matching file name, sorted
        by file name.
    """
    # Every suffix includes the dot, so e.g. "notjpg" is not mistaken for a JPEG
    extensions = (".jpg", ".jpeg", ".nav", ".obs", ".bin", ".mrk")
    return [
        os.path.join(folder_path, file)
        for file in sorted(os.listdir(folder_path))
        if file.lower().endswith(extensions)
    ]


def check_image_count(folder_path):
    """Check if the number of files matches the count in config.seabee.yaml.

    Reads ``nfiles`` from ``config.seabee.yaml`` in ``folder_path`` and compares
    it with the number of files returned by get_file_list. A message giving the
    reason is printed whenever the check fails.

    Args
        folder_path: Str. Upload folder to check.

    Returns
        Bool. True only if the config file exists, contains a mapping that
        specifies ``nfiles``, and the counts are equal.
    """
    yaml_file = os.path.join(folder_path, "config.seabee.yaml")
    if not os.path.exists(yaml_file):
        print(f"config.seabee.yaml not found in {folder_path}")
        return False

    # Binary mode lets PyYAML detect the encoding itself rather than relying on
    # the locale default
    with open(yaml_file, "rb") as file:
        config = yaml.safe_load(file)

    # An empty file parses to None, and a bare list or scalar has no .get()
    if not isinstance(config, dict):
        print(f"No settings could be read from {yaml_file}")
        return False

    expected_count = config.get("nfiles")
    if expected_count is None:
        print(f"nfiles not specified in {yaml_file}")
        return False

    actual_count = len(get_file_list(folder_path))
    if actual_count == expected_count:
        return True

    print(
        f"Image count mismatch in {folder_path}: expected {expected_count}, found {actual_count}"
    )
    return False


def is_already_copied(folder_path):
    """Report whether the folder contains the 'copied.txt' marker file."""
    marker = os.path.join(folder_path, "copied.txt")
    return os.path.exists(marker)


def mark_copied(folder_path, minio_client):
    """Create an empty text file named 'copied.txt' in the folder to signify
    that it has already been copied.

    The marker path is translated to a bucket and object path and created with
    ``minio_client.touch``.
    """
    marker = os.path.join(folder_path, "copied.txt")
    bucket_name, object_key = sb.storage._jhub_path_to_minio(marker)
    minio_client.touch(os.path.join(bucket_name, object_key))


def write_config(path, data):
    """Write ``data`` to ``path`` as block-style YAML.

    The file is opened as UTF-8 explicitly: ``allow_unicode=True`` makes the
    dump emit non-ASCII characters (e.g. æ, ø, å in place names) unescaped, so
    relying on the locale's default encoding could fail or mis-encode them.

    Args
        path: Str. File to create or overwrite.
        data: Object to serialise (the config dict in this notebook).
    """
    with open(path, "w", encoding="utf-8") as yaml_file:
        yaml.dump(data, yaml_file, default_flow_style=False, allow_unicode=True)

In [None]:
# Login to MinIO with the credentials held in the seabeepy SETTINGS object.
# The resulting client is passed to every copy, delete and 'touch' call below.
minio_client = sb.storage.minio_login(
    user=SETTINGS.MINIO_ACCESS_ID, password=SETTINGS.MINIO_SECRET_KEY
)

In [None]:
# Set source and destination folders
#   field_dir: raw uploads from the field; each entry is treated as one upload folder
#   dst_dir:   root under which restructured mission and non-mission data are written
#   temp_dir:  scratch folder where a patched config is written before being copied
field_dir = r"/home/notebook/shared-seabee-ns9879k/seabirds/fielduploads"
dst_dir = r"/home/notebook/shared-seabee-ns9879k/seabirds/2024"
temp_dir = r"/home/notebook/temp"

In [None]:
# Process data
# Each entry in 'field_dir' is handled independently: skipped if already marked
# as copied or if its file count does not match the config; otherwise copied as
# a non-mission (3-part name) or mission (4-part name) folder and then marked.
folders = os.listdir(field_dir)
for folder in folders:
    folder_path = os.path.join(field_dir, folder)
    if is_already_copied(folder_path):
        continue  # Folder already processed

    print(f"\n################\nProcessing: {folder}")
    if not check_image_count(folder_path):
        continue  # Skip folders where image count does not match the config

    name_parts = folder.split("_")
    if len(name_parts) == 3:
        print("Non-mission folder.")
        dst = os.path.join(dst_dir, "nonmission")
        sb.storage.copy_folder(folder_path, dst, minio_client, containing_folder=True)

        # Rename config. so it's not processed
        config_path = os.path.join(dst, folder, "config.seabee.yaml")
        new_config_path = os.path.join(dst, folder, "rawconfig.seabee.yaml")
        sb.storage.copy_file(config_path, new_config_path, minio_client)
        sb.storage.delete_file(config_path, minio_client)

        mark_copied(folder_path, minio_client)
        print(f"Copied to '{dst}' and marked as 'copied'.")

    elif len(name_parts) == 4:
        print("Mission folder.")
        # Patch user-supplied data if necessary
        data = sb.ortho.parse_config(folder_path)
        avg_lat, avg_lon, first_timestamp = get_average_gps_and_first_timestamp(
            folder_path
        )
        # Compare with None so that a coordinate of exactly 0.0 still counts
        have_gps = avg_lat is not None and avg_lon is not None

        # The "missing GPS/timestamp" messages are nested under the "attribute
        # is empty" test so they are only printed when patching was needed
        if data["grouping"] == "":
            if have_gps:
                municipality, county = get_municipality_and_county(avg_lat, avg_lon)
                if municipality and county:
                    print(f"Municipality: {municipality}, County: {county}.")
                    data["grouping"] = f"{county}-{municipality}"
                else:
                    print("Unable to retrieve municipality and county.")
            else:
                print("No GPS data found in the images.")

        if data["area"] == "":
            if have_gps:
                place_name = get_place_name(avg_lat, avg_lon, radius=1000)
                if place_name:
                    print(f"Place Name: {place_name}")
                    data["area"] = place_name
                else:
                    print("Unable to retrieve place name.")
            else:
                print("No GPS data found in the images.")

        if data["datetime"] == "":
            if first_timestamp:
                print(f"First timestamp: {first_timestamp}.")
                data["datetime"] = first_timestamp
            else:
                print("Could not identify first timestamp.")

        if data["grouping"] == "" or data["area"] == "" or data["datetime"] == "":
            # Not marked as copied, so the folder is retried on the next run
            print("Incomplete mission metadata. Skipping.")
            continue

        # Copy updated config
        mission_name = sb.ortho.replace_norwegian_chars(
            f"{data['grouping']}_{data['area']}_{data['datetime']}"
        ).lower()
        os.makedirs(temp_dir, exist_ok=True)  # Scratch folder may not exist yet
        temp_path = os.path.join(temp_dir, "config.seabee.yaml")
        config_path = os.path.join(dst_dir, mission_name, "config.seabee.yaml")
        write_config(temp_path, data)
        try:
            sb.storage.copy_file(temp_path, config_path, minio_client)
        finally:
            # Remove the scratch copy even if the upload fails
            os.remove(temp_path)

        # Copy images
        image_list = get_file_list(folder_path)
        for image_path in image_list:
            image_name = os.path.basename(image_path)
            image_dst = os.path.join(dst_dir, mission_name, "images", image_name)
            sb.storage.copy_file(image_path, image_dst, minio_client)
        mark_copied(folder_path, minio_client)
        print(f"Copied to '{mission_name}' and marked as 'copied'.")
    else:
        print(f"Cannot parse folder name  '{folder}'.")