# Process DNG from GDrive and Upload to Nextcloud (CONFIGURE BEFORE RUNNING)

In [None]:
# ==============================================================================
# --- 1. SETUP AND AUTHENTICATION FOR GOOGLE COLAB ---
# ==============================================================================

# Install necessary libraries and download ExifTool
# pip's stdout is redirected to /dev/null to keep the cell output short.
!pip install pyexiftool webdavclient3 tqdm pandas rawpy opencv-python-headless numpy google-api-python-client google-auth-httplib2 google-auth-oauthlib > /dev/null
# Fetch and unpack the ExifTool 13.34 archive, then move its contents into the
# working directory: the code below invokes the tool as "./exiftool".
!wget -q -O Image-ExifTool-13.34.tar.gz https://sourceforge.net/projects/exiftool/files/Image-ExifTool-13.34.tar.gz/download
!tar -xzf Image-ExifTool-13.34.tar.gz
!mv -f Image-ExifTool-13.34/* .

import os
import logging
import requests
import time
import rawpy
import cv2
import json
import numpy as np
import pandas as pd
import exiftool
import subprocess
from io import BytesIO
from datetime import datetime, timezone
from webdav3 import client as wc

# --- Third-party libraries ---
from google.colab import auth
from google.auth import default
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload, MediaIoBaseUpload
from tqdm.notebook import tqdm

# --- Colab Specific Setup ---
# Authenticate the notebook user; main() later obtains credentials via google.auth.default().
auth.authenticate_user()
from google.colab import userdata

# ==============================================================================
# --- 2. CONFIGURATION ---
# ==============================================================================
# All environment-specific values are read through google.colab.userdata, so none
# of them is hard-coded in the notebook.

# --- Google Drive Settings ---
# Root folder that is scanned recursively for .dng files, and the folder holding the master log CSV.
SOURCE_ROOT_FOLDER_ID = userdata.get("SOURCE_ROOT_FOLDER_ID")
LOG_FOLDER_ID = userdata.get("LOG_FOLDER_ID")

# --- Nextcloud Settings (Destination) ---
NEXTCLOUD_HOSTNAME = userdata.get("NEXTCLOUD_HOSTNAME")
NEXTCLOUD_USERNAME = userdata.get("NEXTCLOUD_USERNAME")
NEXTCLOUD_PASSWORD = userdata.get("NEXTCLOUD_PASSWORD")
NEXTCLOUD_ROOT_PATH = userdata.get("NEXTCLOUD_ROOT_PATH")
# Sub-folder of NEXTCLOUD_ROOT_PATH that receives images whose land lookup failed.
FAILED_API_FOLDER_NAME = "FAILED"

# --- TBE API Settings ---
# Endpoint queried with lat/lng; the key is sent in the "Key" request header.
TBE_API_URL = userdata.get("TBE_API_URL")
TBE_API_KEY = userdata.get("TBE_API_KEY")

# --- State Management ---
# Name of the log CSV, both locally and inside LOG_FOLDER_ID on Drive.
MASTER_LOG_CSV = "master_log.csv"
# 11 h 45 min: main() stops picking up new files once this much time has elapsed.
MAX_RUNTIME_SECONDS = 11 * 3600 + 45 * 60


# Keywords matched against the lower-cased Drive path to choose the destination
# sub-folder "monitoring <keyword>"; the first match in list order wins and
# "monitoring tanaman" is used when none matches.
MONITORING_KEYWORDS = [
    "tikus",
    "tanaman",
    "ngengat",
    "ulat",
    "penggerek"
]
# Accepted size window (in MiB) for the compressed JPEG.
MIN_JPEG_SIZE_MB = 1.0
MAX_JPEG_SIZE_MB = 5.0

# --- Required Metadata Keys ---
# A DNG missing any of these ExifTool keys is logged as MISSING_METADATA_FAIL and not uploaded.
REQUIRED_METADATA_KEYS = [
    "EXIF:GPSLatitude",
    "EXIF:GPSLatitudeRef",
    "EXIF:GPSLongitude",
    "EXIF:GPSLongitudeRef",
    "EXIF:GPSAltitude",
    "XMP:AbsoluteAltitude",
    "XMP:RelativeAltitude",
    "XMP:GimbalYawDegree",
    "XMP:GimbalPitchDegree",
    "XMP:GimbalRollDegree"
]

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# NOTE(review): the explicit setLevel is presumably needed because basicConfig does
# nothing when the root logger already has handlers (as may be the case in Colab) — confirm.
logging.getLogger().setLevel(logging.INFO)


# ==============================================================================
# --- 3. HELPER FUNCTIONS (Fungsi Bantuan) ---
# ==============================================================================

def extract_all_metadata_with_exiftool(dng_byte_stream, temp_dng_path="temp.dng"):
    """Read every metadata tag of an in-memory DNG through ExifTool.

    The stream is spilled to ``temp_dng_path`` because ExifTool works on files;
    the scratch file is always removed afterwards.

    Args:
        dng_byte_stream: BytesIO-like object exposing ``getvalue()``.
        temp_dng_path: Scratch file location used for the ExifTool call.

    Returns:
        ``(latitude, longitude, metadata)``. Latitude/longitude are negated when
        the matching ``...Ref`` tag contains 'S' / 'W'; both are ``None`` when
        the GPS tags are absent. On any error the failure is logged and whatever
        was collected so far is returned (``(None, None, {})`` at worst).
    """
    latitude = longitude = None
    metadata = {}
    try:
        with open(temp_dng_path, "wb") as scratch:
            scratch.write(dng_byte_stream.getvalue())

        with exiftool.ExifToolHelper(executable="./exiftool") as helper:
            metadata = helper.get_metadata(temp_dng_path)[0]

        has_position = "EXIF:GPSLatitude" in metadata and "EXIF:GPSLongitude" in metadata
        if has_position:
            latitude = metadata["EXIF:GPSLatitude"]
            longitude = metadata["EXIF:GPSLongitude"]
            # The hemisphere lives in the separate *Ref tags, so the sign is applied here.
            if 'S' in metadata.get('EXIF:GPSLatitudeRef', 'N'):
                latitude = -latitude
            if 'W' in metadata.get('EXIF:GPSLongitudeRef', 'E'):
                longitude = -longitude
    except Exception as e:
        logging.error(f"Error extracting metadata with ExifTool: {e}")
    finally:
        if os.path.exists(temp_dng_path):
            os.remove(temp_dng_path)
    return latitude, longitude, metadata

def get_nextcloud_client():
    """Build the WebDAV client for the configured Nextcloud account.

    Returns:
        A ``webdav3`` client with ``verify`` set to ``True``, or ``None`` (after
        logging the error) when the client could not be created.
    """
    webdav_settings = {
        'webdav_hostname': NEXTCLOUD_HOSTNAME,
        'webdav_login': NEXTCLOUD_USERNAME,
        'webdav_password': NEXTCLOUD_PASSWORD,
    }
    try:
        nextcloud = wc.Client(webdav_settings)
        nextcloud.verify = True
    except Exception as e:
        logging.error(f"Failed to initialize Nextcloud client: {e}")
        return None
    return nextcloud

def ensure_nextcloud_folder(client, remote_path):
    """Create ``remote_path`` on Nextcloud unless it already exists.

    Args:
        client: WebDAV client exposing ``check(path)`` and ``mkdir(path)``.
        remote_path: Remote folder path to guarantee.

    Returns:
        ``True`` when the folder exists afterwards, ``False`` when the check or
        the creation raised (the error is logged).
    """
    try:
        already_present = client.check(remote_path)
        if not already_present:
            client.mkdir(remote_path)
    except Exception as e:
        logging.error(f"Failed to ensure Nextcloud folder '{remote_path}': {e}")
        return False
    return True

def upload_to_nextcloud(jpeg_bytes, remote_path, timeout=(10, 300)):
    """Upload an in-memory JPEG to Nextcloud with an HTTP PUT.

    Args:
        jpeg_bytes: Seekable binary stream holding the JPEG; it is rewound first.
        remote_path: Destination path, appended to ``NEXTCLOUD_HOSTNAME``.
        timeout: ``(connect, read)`` timeout in seconds passed to ``requests``.
            Without a timeout a stalled connection would block the whole run
            indefinitely.

    Returns:
        ``True`` on a successful (2xx) upload, ``False`` otherwise; failures are
        logged and never raised.
    """
    try:
        jpeg_bytes.seek(0)
        # Strip the slashes on both sides of the join so a trailing slash in the
        # configured hostname cannot produce "//" in the URL.
        full_url = f"{NEXTCLOUD_HOSTNAME.rstrip('/')}/{remote_path.lstrip('/')}"
        response = requests.put(
            full_url,
            data=jpeg_bytes,
            auth=(NEXTCLOUD_USERNAME, NEXTCLOUD_PASSWORD),
            headers={'Content-Type': 'image/jpeg'},
            timeout=timeout,
        )
        response.raise_for_status()
        return True
    except Exception as e:
        logging.error(f"Failed to upload to Nextcloud path '{remote_path}': {e}")
        return False

def find_log_file_on_drive(service, folder_id, filename):
    """Look up the Drive file id of ``filename`` inside ``folder_id``.

    Args:
        service: Drive v3 service object.
        folder_id: Id of the parent folder to search.
        filename: Exact file name to look for (non-trashed files only).

    Returns:
        The id of the first match, or ``None`` when the file does not exist or
        the API call failed.
    """
    query = f"'{folder_id}' in parents and name = '{filename}' and trashed = false"
    try:
        response = service.files().list(q=query, spaces='drive', fields='files(id)').execute()
    except HttpError as e:
        logging.warning(f"Could not find log file {filename} in Drive: {e}")
        return None

    matches = response.get('files') or []
    if not matches:
        # A missing log is the normal first-run situation, so it is handled as a
        # regular outcome rather than through an IndexError.
        logging.info(f"Log file {filename} does not exist in Drive folder yet.")
        return None
    return matches[0].get('id')

def download_log_from_drive(service, file_id, local_path):
    """Download a Drive file into ``local_path``.

    Args:
        service: Drive v3 service object.
        file_id: Id of the file to fetch; a falsy id means "nothing to download".
        local_path: Where the downloaded bytes are written.

    Returns:
        ``True`` when the file was written, ``False`` when ``file_id`` is falsy
        or the Drive API reported an error (logged).
    """
    if not file_id:
        return False
    try:
        buffer = BytesIO()
        fetcher = MediaIoBaseDownload(buffer, service.files().get_media(fileId=file_id))
        finished = False
        while not finished:
            _progress, finished = fetcher.next_chunk()
        # The complete payload is buffered in memory and then written in one go.
        with open(local_path, 'wb') as out:
            out.write(buffer.getvalue())
        logging.info(f"Successfully downloaded log file from Drive to '{local_path}'.")
        return True
    except HttpError as error:
        logging.error(f"Could not download log file: {error}")
        return False

def update_log_on_drive(service, file_id, folder_id, local_path):
    """Push the local log CSV to Drive, creating the file on first use.

    Args:
        service: Drive v3 service object.
        file_id: Id of the existing Drive file, or a falsy value to create one.
        folder_id: Parent folder used when the file has to be created.
        local_path: Local CSV to upload; its base name becomes the Drive name.

    Returns:
        The Drive file id to use next time: the newly created id, or the
        ``file_id`` that was passed in (also when the upload failed, which is
        logged).
    """
    media = MediaFileUpload(local_path, mimetype='text/csv', resumable=True)
    try:
        if not file_id:
            new_file_body = {'name': os.path.basename(local_path), 'parents': [folder_id]}
            created = service.files().create(body=new_file_body, media_body=media, fields='id').execute()
            return created.get('id')
        service.files().update(fileId=file_id, media_body=media).execute()
    except HttpError as error:
        logging.error(f"Could not upload log file to Drive: {error}")
    return file_id

def list_files_with_retry(service, query, page_token):
    """Run one Drive ``files.list`` page request, retrying transient 5xx errors.

    Up to five attempts are made with a doubling delay (1 s, 2 s, 4 s, ...).
    Non-transient ``HttpError``s are re-raised immediately.

    Args:
        service: Drive v3 service object.
        query: Drive search query string.
        page_token: Token of the page to fetch, or ``None`` for the first page.

    Returns:
        The API response dict, or ``None`` once all attempts failed.
    """
    max_retries = 5
    transient_statuses = (500, 502, 503, 504)
    backoff = 1
    attempts_left = max_retries
    while attempts_left > 0:
        attempts_left -= 1
        try:
            listing = service.files().list(
                q=query,
                pageSize=1000,
                fields="nextPageToken, files(id, name, mimeType)",
                pageToken=page_token,
            )
            return listing.execute()
        except HttpError as error:
            if error.resp.status not in transient_statuses:
                raise
            logging.warning(f"API call failed with transient error {error.resp.status}. Retrying in {backoff}s...")
            time.sleep(backoff)
            backoff *= 2
    logging.error(f"API call failed after {max_retries} retries for query: {query}")
    return None

def traverse_drive(service, folder_id):
    """Walk a Drive folder tree breadth-first and yield every ``.dng`` file.

    Args:
        service: Drive v3 service object.
        folder_id: Id of the folder to start from.

    Yields:
        ``(item, path_without_extension)`` where ``item`` is the Drive file dict
        (``id``, ``name``, ``mimeType``) and the path starts with the name of
        the root folder, using ``/`` as separator.

    A folder whose listing keeps failing is logged and skipped; the walk
    continues with the remaining folders.
    """
    from collections import deque

    # The root folder's real name is used as the first path component ('ROOT' as
    # fallback). The service that was passed in is queried directly rather than
    # building a second client from its private ``_http`` attribute.
    folder_info = service.files().get(fileId=folder_id, fields='name').execute()
    source_folder_name = folder_info.get('name', 'ROOT')

    # deque gives O(1) pops from the front of the queue.
    folders_to_scan = deque([(folder_id, source_folder_name)])
    while folders_to_scan:
        current_id, current_path = folders_to_scan.popleft()
        query = f"'{current_id}' in parents and trashed = false"
        page_token = None
        while True:
            results = list_files_with_retry(service, query, page_token)
            if not results:
                logging.error(f"Could not list files in folder ID {current_id} after retries. Skipping folder.")
                break
            for item in results.get('files', []):
                item_path = os.path.join(current_path, item['name']).replace("\\", "/")
                if item['mimeType'] == 'application/vnd.google-apps.folder':
                    folders_to_scan.append((item['id'], item_path))
                elif item['name'].lower().endswith('.dng'):
                    yield item, os.path.splitext(item_path)[0]
            page_token = results.get('nextPageToken')
            if not page_token:
                break

# ==============================================================================
# --- 4. CORE PROCESSING LOGIC (Logika Inti) ---
# ==============================================================================

def compress_dng_to_jpeg_bytes(dng_bytes):
    """Develop a DNG and JPEG-encode it into the configured size window.

    Compression only; EXIF metadata is not handled here.

    The quality is searched coarsely from 95 down to 15 in steps of 5. When one
    step is above ``MAX_JPEG_SIZE_MB`` and a later one falls below
    ``MIN_JPEG_SIZE_MB``, the qualities strictly between those two steps are
    tried one by one.

    Args:
        dng_bytes: Seekable binary stream containing the DNG file.

    Returns:
        ``BytesIO`` with the encoded JPEG, or ``None`` when no tested quality
        produced a size inside the window or any step raised (logged).

    NOTE(review): when the highest coarse quality already yields a file below
    the minimum size, every remaining quality is still tried and the function
    ends up returning ``None`` — confirm that rejecting such images is intended.
    """
    bytes_per_mb = 1024 * 1024

    def _encode(image, quality):
        # Returns (ok, buffer, size_mb); size_mb is None when encoding failed.
        ok, buffer = cv2.imencode('.jpeg', image, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
        return ok, buffer, (len(buffer) / bytes_per_mb if ok else None)

    def _in_window(size_mb):
        return MIN_JPEG_SIZE_MB <= size_mb <= MAX_JPEG_SIZE_MB

    try:
        dng_bytes.seek(0)
        with rawpy.imread(dng_bytes) as raw:
            rgb_pixels = raw.postprocess(use_camera_wb=True, output_bps=8)

        # OpenCV's encoder expects BGR channel order.
        bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

        # Lowest coarse quality seen so far that was still too large (-1 = none yet).
        oversized_quality = -1
        for quality in range(95, 10, -5):
            ok, buffer, size_mb = _encode(bgr_pixels, quality)
            if not ok:
                continue

            if _in_window(size_mb):
                logging.info(f"  > Compression success (coarse): Quality {quality} -> {size_mb:.2f} MB.")
                return BytesIO(buffer)

            if size_mb > MAX_JPEG_SIZE_MB:
                oversized_quality = quality
                continue

            # From here on the result is below the minimum size.
            if oversized_quality == -1:
                continue

            logging.info(f"  > Coarse search overshot. Starting fine search between {quality}-{oversized_quality}...")
            for fine_quality in range(oversized_quality - 1, quality, -1):
                ok, buffer, size_mb = _encode(bgr_pixels, fine_quality)
                if not ok:
                    continue
                if _in_window(size_mb):
                    logging.info(f"  > Compression success (fine): Quality {fine_quality} -> {size_mb:.2f} MB.")
                    return BytesIO(buffer)
            logging.warning(f"  > Fine search failed.")
            return None

        logging.warning("  > Could not achieve target size.")
        return None
    except Exception as e:
        logging.error(f"  > Failed during DNG compression: {e}", exc_info=True)
        return None

def inject_metadata_with_exiftool(dng_bytes, jpeg_bytes, temp_dng="temp.dng", temp_jpeg="temp.jpeg"):
    """Copy all metadata from the DNG into the JPEG using the ExifTool CLI.

    Both streams are written to scratch files, ``./exiftool -tagsFromFile`` is
    run against them, and the rewritten JPEG is read back. The scratch files
    are removed in every case.

    Args:
        dng_bytes: BytesIO-like stream of the source DNG (``getvalue()``).
        jpeg_bytes: BytesIO-like stream of the JPEG that receives the tags.
        temp_dng: Scratch path for the DNG.
        temp_jpeg: Scratch path for the JPEG.

    Returns:
        ``BytesIO`` with the tagged JPEG, or ``None`` on failure (logged).
    """
    try:
        for scratch_path, stream in ((temp_dng, dng_bytes), (temp_jpeg, jpeg_bytes)):
            with open(scratch_path, "wb") as fh:
                fh.write(stream.getvalue())

        exiftool_cmd = [
            "./exiftool",
            "-tagsFromFile", temp_dng,
            "-all:all",
            "-overwrite_original",
            temp_jpeg,
        ]
        subprocess.run(exiftool_cmd, check=True, capture_output=True, text=True)

        with open(temp_jpeg, "rb") as fh:
            return BytesIO(fh.read())
    except subprocess.CalledProcessError as e:
        logging.error(f"ExifTool failed to inject metadata: {e.stderr}")
        return None
    except Exception as e:
        logging.error(f"An error occurred during metadata injection: {e}")
        return None
    finally:
        for scratch_path in (temp_dng, temp_jpeg):
            if os.path.exists(scratch_path):
                os.remove(scratch_path)

def _coerce_day_count(value, default=0):
    """Convert an API day counter to int; null, empty or unparsable values become ``default``."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def process_and_upload_file(gdrive_service, nextcloud_client, file_info, original_gdrive_path):
    """Run the full pipeline for one DNG file and return its log row.

    Steps: download from Drive, read metadata, compress to JPEG, copy the
    metadata into the JPEG, query the TBE API with the GPS position and upload
    the JPEG to the matching Nextcloud folder. When the API call fails or
    returns no land, the JPEG goes to the ``FAILED_API_FOLDER_NAME`` folder.

    Args:
        gdrive_service: Drive v3 service object.
        nextcloud_client: WebDAV client used to create destination folders.
        file_info: Drive file dict with at least ``id`` and ``name``.
        original_gdrive_path: Drive path of the file without extension; also
            used (with ``/`` replaced by ``_``) as the uploaded file name.

    Returns:
        dict: One row for the master log. ``status`` is one of DOWNLOAD_FAIL,
        MISSING_METADATA_FAIL, NO_METADATA, COMPRESS_FAIL, METADATA_INJECT_FAIL,
        NEGATIVE_HST_HSS_SKIP, SUCCESS, UPLOAD_FAIL, NEXTCLOUD_FOLDER_FAIL,
        API_FAIL_UPLOADED or API_FAIL_UPLOAD_FAIL.
    """
    dng_id, original_filename = file_info['id'], file_info['name']
    log_data = {
        "dng_id": dng_id,
        "original_gdrive_path": original_gdrive_path,
        "original_filename": original_filename,
        "latitude": None,
        "longitude": None,
        "api_land_name": None,
        "api_land_id": None,
        "api_hst": None,
        "api_hss": None,
        "adjusted_hst": None,
        "adjusted_hss": None,
        "gdrive_uploaded_date": None,
        "hst_hss_negative": False,
        "api_response_json": None,
        "final_nextcloud_path": None,
        "status": "UNKNOWN_ERROR",
        "timestamp": datetime.now().isoformat(),
        "exif_data_json": None
    }

    try:
        # Get Google Drive uploaded date
        file_metadata = gdrive_service.files().get(fileId=dng_id, fields='createdTime').execute()
        gdrive_uploaded_date = file_metadata.get('createdTime', None)
        log_data["gdrive_uploaded_date"] = gdrive_uploaded_date
        request = gdrive_service.files().get_media(fileId=dng_id)
        dng_bytes = BytesIO(request.execute())
    except HttpError as e:
        logging.error(f"Failed to download {original_filename}: {e}")
        log_data["status"] = "DOWNLOAD_FAIL"
        return log_data

    lat, lon, all_exif_data = extract_all_metadata_with_exiftool(dng_bytes)
    missing_keys = [key for key in REQUIRED_METADATA_KEYS if key not in all_exif_data]
    if missing_keys:
        logging.warning(f"Skipping {original_filename} due to missing required metadata: {missing_keys}")
        log_data["status"] = "MISSING_METADATA_FAIL"
        log_data["exif_data_json"] = json.dumps(all_exif_data)
        return log_data
    # NOTE(review): this truthiness test also rejects a latitude or longitude of
    # exactly 0 — presumably intended to filter "no GPS fix" 0/0 coordinates; confirm.
    if not all([lat, lon]):
        logging.warning(f"Skipping {original_filename} due to missing GPS metadata.")
        log_data["status"] = "NO_METADATA"
        return log_data
    log_data.update({
        "latitude": lat,
        "longitude": lon,
        "exif_data_json": json.dumps(all_exif_data)
    })

    compressed_jpeg_bytes = compress_dng_to_jpeg_bytes(dng_bytes)
    if not compressed_jpeg_bytes:
        logging.error(f"Failed to compress {original_filename}. Skipping.")
        log_data["status"] = "COMPRESS_FAIL"
        return log_data

    final_jpeg_bytes = inject_metadata_with_exiftool(dng_bytes, compressed_jpeg_bytes)
    if not final_jpeg_bytes:
        logging.error(f"Failed to inject metadata into {original_filename}. Skipping.")
        log_data["status"] = "METADATA_INJECT_FAIL"
        return log_data

    new_filename_part = original_gdrive_path.replace('/', '_')
    try:
        params, headers = {'lat': lat, 'lng': lon}, {'Key': TBE_API_KEY}
        response = requests.get(TBE_API_URL, params=params, headers=headers, timeout=20)
        # Parse the body once; it is logged before the HTTP status is checked so
        # that JSON error bodies end up in the log row as well.
        payload = response.json()
        log_data["api_response_json"] = json.dumps(payload)
        response.raise_for_status()

        # Guard against a non-dict payload or a null "data" member; either one
        # falls through to the FAILED-folder fallback below.
        api_data = payload.get('data') if isinstance(payload, dict) else None
        if isinstance(api_data, dict) and payload.get('status') is True and api_data.get('land_id'):
            land_name = api_data.get('land_name', None)
            land_id = api_data['land_id']
            # A null or empty counter is treated as 0 rather than raising, which
            # would abort the whole run.
            hst = _coerce_day_count(api_data.get('land_hst', '0'))
            hss = _coerce_day_count(api_data.get('land_hss', '0'))
            log_data["api_land_name"] = land_name
            log_data["api_land_id"] = land_id
            log_data["api_hst"] = hst
            log_data["api_hss"] = hss
            # Calculate days since upload
            days_since_upload = 0
            if gdrive_uploaded_date:
                try:
                    uploaded_dt = datetime.strptime(gdrive_uploaded_date, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
                    now_dt = datetime.now(timezone.utc)
                    days_since_upload = (now_dt - uploaded_dt).days
                except Exception as ex:
                    logging.warning(f"Could not parse gdrive_uploaded_date: {gdrive_uploaded_date}, error: {ex}")
            # Adjust hst/hss only if not zero
            adjusted_hst = hst - days_since_upload if hst != 0 else 0
            adjusted_hss = hss - days_since_upload if hss != 0 else 0
            log_data["adjusted_hst"] = adjusted_hst
            log_data["adjusted_hss"] = adjusted_hss
            if adjusted_hst < 0 or adjusted_hss < 0:
                log_data["hst_hss_negative"] = True
                logging.warning(f"Adjusted hst/hss is negative for file {original_filename}: adjusted_hst={adjusted_hst}, adjusted_hss={adjusted_hss}")
                log_data["status"] = "NEGATIVE_HST_HSS_SKIP"
                return log_data
            # Use adjusted value for foldering
            day_val = str(adjusted_hst) if adjusted_hst != 0 else str(adjusted_hss)
            main_folder_path = f"{NEXTCLOUD_ROOT_PATH}/{land_id}-{day_val}"
            if ensure_nextcloud_folder(nextcloud_client, main_folder_path):
                for keyword in MONITORING_KEYWORDS:
                    ensure_nextcloud_folder(nextcloud_client, f"{main_folder_path}/monitoring {keyword}")
                # First keyword found in the Drive path selects the sub-folder.
                target_subfolder = next((f"monitoring {kw}" for kw in MONITORING_KEYWORDS if kw in original_gdrive_path.lower()), "monitoring tanaman")
                final_path = f"{main_folder_path}/{target_subfolder}/{new_filename_part}.jpeg"
                log_data["final_nextcloud_path"] = final_path
                log_data["status"] = "SUCCESS" if upload_to_nextcloud(final_jpeg_bytes, final_path) else "UPLOAD_FAIL"
            else:
                log_data["status"] = "NEXTCLOUD_FOLDER_FAIL"
            return log_data
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose decode
        # error is not a RequestException.
        logging.error(f"API request failed for {original_filename}: {e}")
        log_data["api_response_json"] = json.dumps({"error": str(e)})

    # Fallback for API failures
    logging.warning(f"API call failed for {original_filename}. Uploading to FAILED folder.")
    log_data["api_land_id"] = "API_FAIL"
    failure_path = f"{NEXTCLOUD_ROOT_PATH}/{FAILED_API_FOLDER_NAME}"
    ensure_nextcloud_folder(nextcloud_client, failure_path)
    final_path = f"{failure_path}/{new_filename_part}.jpeg"
    log_data["final_nextcloud_path"] = final_path
    log_data["status"] = "API_FAIL_UPLOADED" if upload_to_nextcloud(final_jpeg_bytes, final_path) else "API_FAIL_UPLOAD_FAIL"
    return log_data

# ==============================================================================
# --- 5. MAIN EXECUTION (Eksekusi Utama) ---
# ==============================================================================
def main():
    """Process every pending DNG file and keep the master log in sync.

    The master log CSV is fetched from Drive, the Drive tree is scanned, and
    files are processed in this order: files whose logged status is not final
    first, then files that are not in the log at all. After each file the log
    is saved locally and pushed back to Drive, so an interrupted run can be
    resumed. No new file is started once ``MAX_RUNTIME_SECONDS`` has elapsed.
    """
    started_at = time.monotonic()
    creds, _ = default()
    gdrive_service = build('drive', 'v3', credentials=creds)
    nextcloud_client = get_nextcloud_client()
    if not nextcloud_client:
        logging.critical("Could not connect to Nextcloud. Aborting.")
        return

    log_file_id = find_log_file_on_drive(gdrive_service, LOG_FOLDER_ID, MASTER_LOG_CSV)
    download_log_from_drive(gdrive_service, log_file_id, MASTER_LOG_CSV)

    # Load the master log, or start with an empty frame carrying the log columns.
    log_columns = [
        "dng_id", "original_gdrive_path", "original_filename", "latitude", "longitude",
        "api_land_name", "api_land_id", "api_hst", "api_hss", "adjusted_hst", "adjusted_hss",
        "gdrive_uploaded_date", "hst_hss_negative", "api_response_json", "final_nextcloud_path",
        "status", "timestamp", "exif_data_json"
    ]
    have_local_log = os.path.exists(MASTER_LOG_CSV) and os.path.getsize(MASTER_LOG_CSV) > 0
    if have_local_log:
        master_log_df = pd.read_csv(MASTER_LOG_CSV)
    else:
        master_log_df = pd.DataFrame(columns=log_columns)

    # Report how many rows exist per status before doing any work.
    status_counts = master_log_df['status'].value_counts()
    logging.info("Status counts in master log before processing:")
    for status, count in status_counts.items():
        logging.info(f"  {status}: {count}")

    # Rows with one of these statuses are final and never processed again.
    skip_statuses = ["SUCCESS", "API_FAIL_UPLOADED", "NEGATIVE_HST_HSS_SKIP"]

    # Every DNG currently on Drive, plus an id -> task index for quick lookup.
    all_tasks = list(traverse_drive(gdrive_service, SOURCE_ROOT_FOLDER_ID))
    all_tasks_dict = {info['id']: (info, path) for info, path in all_tasks}

    # Logged files that still need another attempt and still exist on Drive.
    needs_retry = ~master_log_df['status'].isin(skip_statuses)
    to_reprocess_ids = set(master_log_df.loc[needs_retry, 'dng_id'].astype(str))
    reprocess_tasks = [all_tasks_dict[dng_id] for dng_id in to_reprocess_ids if dng_id in all_tasks_dict]

    # Files that have no log row at all.
    processed_dng_ids = set(master_log_df['dng_id'].astype(str))
    new_tasks = [task for task in all_tasks if task[0]['id'] not in processed_dng_ids]

    # Retries run first, then new files.
    tasks_to_run = reprocess_tasks + new_tasks
    logging.info(f"Found {len(all_tasks)} total DNG files. {len(tasks_to_run)} files to process (including reprocess).")

    for file_info, original_gdrive_path in tqdm(tasks_to_run, desc="Processing DNG Files"):
        elapsed = time.monotonic() - started_at
        if elapsed > MAX_RUNTIME_SECONDS:
            logging.warning("Runtime limit reached. Stopping script.")
            break

        result_log = process_and_upload_file(gdrive_service, nextcloud_client, file_info, original_gdrive_path)

        # Replace any earlier row for this file with the fresh result.
        other_rows = master_log_df[master_log_df['dng_id'] != result_log['dng_id']]
        master_log_df = pd.concat([other_rows, pd.DataFrame([result_log])], ignore_index=True)
        master_log_df.to_csv(MASTER_LOG_CSV, index=False)

        log_file_id = update_log_on_drive(gdrive_service, log_file_id, LOG_FOLDER_ID, MASTER_LOG_CSV)

    logging.info("✅ Processing complete.")

# Run the pipeline when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    main()



tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.provenance'
tar: Ignoring unknown extended header keyword 'LIBARCH

INFO:root:Successfully downloaded log file from Drive to 'master_log.csv'.
INFO:root:Status counts in master log before processing:
INFO:root:  SUCCESS: 3361
INFO:root:  API_FAIL_UPLOADED: 1045
INFO:root:  NEGATIVE_HST_HSS_SKIP: 151
INFO:root:  COMPRESS_FAIL: 4
INFO:root:Found 4561 total DNG files. 4 files to process (including reprocess).


Processing DNG Files:   0%|          | 0/4 [00:00<?, ?it/s]

ERROR:root:Failed to compress DJI_20250721183238_0066_D.DNG. Skipping.
  master_log_df = pd.concat([master_log_df, new_row_df], ignore_index=True)
ERROR:root:Failed to compress DJI_20250721183334_0069_D.DNG. Skipping.
ERROR:root:Failed to compress DJI_20250721183311_0068_D.DNG. Skipping.
ERROR:root:Failed to compress DJI_20250715112301_0015_D.DNG. Skipping.
INFO:root:✅ Processing complete.


# Data Enhance

In [1]:
import os
import zipfile
import shutil
import random
import yaml
import cv2
from pathlib import Path
from PIL import Image, ImageEnhance

# ======== CONFIGURATION (Colab compatible) ========
# The input archives, data.yaml and the output directory are chosen depending on
# whether the google.colab package can be imported.
import sys
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # Install required packages
    !pip install pillow pyyaml opencv-python --quiet
    from google.colab import files
    images_zip = "/content/images.zip"
    labels_zip = "/content/labels.zip"
    data_yaml = "/content/data.yaml"
    output_dir = "/content/balanced_dataset"
    splits = ["train", "val", "test"]
    # Fixed seed so the random augmentation choices are repeatable.
    random.seed(42)
    print("Please upload images.zip, labels.zip, and data.yaml if not already present in /content.")
    # Prompt for an upload for each input file that is not present yet.
    for fname in [images_zip, labels_zip, data_yaml]:
        if not os.path.exists(fname):
            print(f"Upload {os.path.basename(fname)}:")
            files.upload()
else:
    # Local (non-Colab) run: fixed paths on the author's Windows machine.
    images_zip = "D:\\TBE\\AI\\trainerV0.10\\enhanced data\\images.zip"
    labels_zip = "D:\\TBE\\AI\\trainerV0.10\\enhanced data\\labels.zip"
    data_yaml = "D:\\TBE\\AI\\trainerV0.10\\enhanced data\\data.yaml"
    output_dir = "D:\\TBE\\AI\\trainerV0.10\\enhanced data\\balanced_dataset"
    splits = ["train", "val", "test"]
    # Fixed seed so the random augmentation choices are repeatable.
    random.seed(42)
# =============================

# --- Utilitas path ---
def ensure_dir(p: Path):
    """Create directory ``p`` including missing parents; do nothing if it exists."""
    os.makedirs(p, exist_ok=True)

def rel_from_labels_root(label_path: Path, labels_root: Path) -> Path:
    """Return ``label_path`` relative to ``labels_root``.

    If the relative path itself starts with a ``labels`` directory (any letter
    case) — e.g. because the zip contained a nested ``labels/`` folder — that
    first component is dropped. Examples of results: ``train/sub/xxx.txt`` or
    ``xxx.txt``.
    """
    relative = label_path.relative_to(labels_root)
    pieces = relative.parts
    starts_with_labels_dir = bool(pieces) and pieces[0].lower() == "labels"
    return Path(*pieces[1:]) if starts_with_labels_dir else relative

def find_image_for_label(label_path: Path, labels_root: Path, images_root: Path):
    """Find the image that belongs to a label file.

    The label's path relative to ``labels_root`` is looked up below
    ``images_root`` and below ``images_root / "images"`` (for zips that contain
    a nested ``images/`` folder), trying each known image extension.

    Args:
        label_path: Path of the ``.txt`` label file.
        labels_root: Root directory the label path is made relative to.
        images_root: Root directory that holds the images.

    Returns:
        Path of the first existing candidate, or ``None`` when no image exists.
    """
    rel = rel_from_labels_root(label_path, labels_root)      # train/xxx.txt
    # Only the final ".txt" is removed. The image name is built by appending the
    # extension to this stem: using with_suffix() here would also replace any
    # earlier dotted part of names such as "img_jpg.rf.1a2b.txt".
    image_stem = rel.stem
    candidate_roots = [images_root, images_root / "images"]
    extensions = [".jpg", ".jpeg", ".png", ".JPG", ".PNG", ".JPEG"]
    for base in candidate_roots:
        for ext in extensions:
            cand = base / rel.parent / f"{image_stem}{ext}"
            if cand.exists():
                return cand
    return None

def split_of_label(label_rel: Path):
    """Return the first directory of a relative label path (train/val/test), or None if the path has no directory."""
    pieces = label_rel.parts
    if len(pieces) < 2:
        return None
    return pieces[0]

# ==== Fungsi augmentasi photometric (aman untuk label) ====
def augment_image_photometric(img_path: Path) -> Image.Image:
    """Load an image as RGB and randomly jitter its brightness and contrast.

    Each adjustment is applied with probability 0.9; brightness is scaled by a
    factor in [0.8, 1.25] and contrast by a factor in [0.85, 1.2]. Pixel
    positions are untouched, so bounding-box labels stay valid.
    """
    picture = Image.open(str(img_path)).convert("RGB")
    jitter_plan = (
        (ImageEnhance.Brightness, 0.8, 1.25),
        (ImageEnhance.Contrast, 0.85, 1.2),
    )
    for enhancer_cls, low, high in jitter_plan:
        if random.random() < 0.9:
            picture = enhancer_cls(picture).enhance(random.uniform(low, high))
    return picture

# ==== IO Label ====
def read_yolo_labels(lbl_path: Path):
    """Parse a YOLO label file into ``(class_id, cx, cy, w, h)`` tuples.

    Lines with fewer than five whitespace-separated fields are ignored; fields
    beyond the fifth are dropped. A missing file yields an empty list.
    """
    if not lbl_path.exists():
        return []
    parsed = []
    with open(lbl_path, "r") as handle:
        for raw_line in handle:
            tokens = raw_line.strip().split()
            if len(tokens) < 5:
                continue
            class_id = int(tokens[0])
            cx, cy, w, h = (float(tok) for tok in tokens[1:5])
            parsed.append((class_id, cx, cy, w, h))
    return parsed

def write_yolo_labels(lbl_path: Path, labels):
    """Write ``(class_id, cx, cy, w, h)`` tuples as YOLO lines, overwriting the file.

    Coordinates are written with six decimal places, one box per line.
    """
    with open(lbl_path, "w") as handle:
        handle.writelines(
            f"{cid} {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}\n"
            for (cid, cx, cy, w, h) in labels
        )

# ==== Copy-paste objek ke file baru ====
def copy_paste_object_to_new_file(src_img_path: Path, src_lbl_path: Path,
                                  tgt_img_path: Path, tgt_lbl_path: Path,
                                  out_img_path: Path, out_lbl_path: Path,
                                  target_cid: int):
    """Paste one object of class ``target_cid`` from a source image onto a target image.

    The output label file starts as a copy of the target's labels. One randomly
    chosen box of ``target_cid`` is cropped from the source image, rescaled by a
    random factor in [0.6, 1.2], pasted at a random position of a copy of the
    target image (overwriting the pixels there) and appended to the output
    labels. The target's existing boxes are not adjusted for the pasted patch.

    When the source has no box of that class, or the chosen box is degenerate,
    the target image is written unchanged with only the target's labels.

    Args:
        src_img_path: Image the object is cropped from.
        src_lbl_path: YOLO label file of the source image.
        tgt_img_path: Image the object is pasted onto.
        tgt_lbl_path: YOLO label file of the target image.
        out_img_path: Where the resulting image is written.
        out_lbl_path: Where the resulting label file is written.
        target_cid: Class id of the object to copy.

    Returns:
        ``False`` when either image could not be loaded (``cv2.imread`` returned
        ``None``; nothing is written in that case), otherwise ``True``.
    """
    # Read source & target
    img_src = cv2.imread(str(src_img_path))
    img_tgt = cv2.imread(str(tgt_img_path))
    if img_src is None or img_tgt is None:
        return False

    h_src, w_src = img_src.shape[:2]
    h_tgt, w_tgt = img_tgt.shape[:2]

    # Copy the target labels into the output file first
    tgt_labels = read_yolo_labels(tgt_lbl_path)
    write_yolo_labels(out_lbl_path, tgt_labels)

    # Collect every bbox of the target class from the source
    src_labels = read_yolo_labels(src_lbl_path)
    src_boxes = [(cid, cx, cy, w, h) for (cid, cx, cy, w, h) in src_labels if cid == target_cid]
    if not src_boxes:
        # No object of this class in the source
        cv2.imwrite(str(out_img_path), img_tgt)  # still copy the target image
        return True

    # Pick one bbox to paste
    cid, cx, cy, w, h = random.choice(src_boxes)
    # Convert normalized center/size to pixel corners, clamped to the source image
    x1 = int((cx - w/2) * w_src); y1 = int((cy - h/2) * h_src)
    x2 = int((cx + w/2) * w_src); y2 = int((cy + h/2) * h_src)
    x1 = max(0, min(x1, w_src - 1)); y1 = max(0, min(y1, h_src - 1))
    x2 = max(1, min(x2, w_src));     y2 = max(1, min(y2, h_src))
    if x2 <= x1 or y2 <= y1:
        cv2.imwrite(str(out_img_path), img_tgt)
        return True

    crop = img_src[y1:y2, x1:x2]
    if crop.size == 0:
        cv2.imwrite(str(out_img_path), img_tgt)
        return True

    # Random scale & position
    scale = random.uniform(0.6, 1.2)
    new_w = max(1, int(crop.shape[1] * scale))
    new_h = max(1, int(crop.shape[0] * scale))
    crop = cv2.resize(crop, (new_w, new_h))
    # Shrink the patch (each axis independently) when it does not fit into the target
    if new_w >= w_tgt or new_h >= h_tgt:
        new_w = min(new_w, w_tgt - 1); new_h = min(new_h, h_tgt - 1)
        crop = cv2.resize(crop, (new_w, new_h))
    tx = random.randint(0, w_tgt - new_w)
    ty = random.randint(0, h_tgt - new_h)

    # Paste onto a copy of the target
    img_out = img_tgt.copy()
    img_out[ty:ty+new_h, tx:tx+new_w] = crop

    # Write the new bbox (normalized to the target image size)
    new_cx = (tx + new_w/2) / w_tgt
    new_cy = (ty + new_h/2) / h_tgt
    new_bw = new_w / w_tgt
    new_bh = new_h / h_tgt

    with open(out_lbl_path, "a") as f:
        f.write(f"{cid} {new_cx:.6f} {new_cy:.6f} {new_bw:.6f} {new_bh:.6f}\n")

    # Save the output image
    cv2.imwrite(str(out_img_path), img_out)
    return True

# ====== 1) Extract the zip archives ======
images_root = Path("images")
labels_root = Path("labels")
# Start from empty extraction directories so files of an earlier run cannot leak in.
shutil.rmtree(images_root, ignore_errors=True)
shutil.rmtree(labels_root, ignore_errors=True)
ensure_dir(images_root); ensure_dir(labels_root)

with zipfile.ZipFile(images_zip, "r") as z: z.extractall(images_root)
with zipfile.ZipFile(labels_zip, "r") as z: z.extractall(labels_root)

# ====== 2) Read data.yaml ======
with open(data_yaml, "r") as f:
    cfg = yaml.safe_load(f)
names = cfg.get("names", [])
num_classes = len(names)

# ====== 3) Count annotations per class ======
label_counts = {i: 0 for i in range(num_classes)}
all_label_files = [p for p in labels_root.rglob("*.txt")]
for lp in all_label_files:
    for (cid, *_rest) in read_yolo_labels(lp):
        # A class id that is not listed in data.yaml is counted as well rather
        # than raising KeyError; the reporting code falls back to the bare id
        # for such classes.
        label_counts[cid] = label_counts.get(cid, 0) + 1

print("Jumlah anotasi per kelas:")
for cid in sorted(label_counts):
    print(f"{cid} ({names[cid] if cid < len(names) else cid}): {label_counts.get(cid,0)}")

# The largest class count is the target every other class is filled up to.
max_count = max(label_counts.values()) if label_counts else 0

# ====== 4) Siapkan output (salin dasar) ======
out_img_root = Path(output_dir) / "images"
out_lbl_root = Path(output_dir) / "labels"
if Path(output_dir).exists():
    shutil.rmtree(output_dir)
for sub in ["images", "labels"]:
    for sp in splits:
        ensure_dir(Path(output_dir) / sub / sp)

# salin seluruh isi asal → output, tetap menjaga struktur (kalau ada nested 'images/' atau 'labels/' akan ikut)
# tapi yang dibutuhkan YOLO adalah .../images/train,.../labels/train
# Jadi kita coba salin konten split jika ada; jika tidak, salin semuanya ke root split 'train'
for sp in splits:
    # cari direktori split di labels (wajib untuk mapping)
    cand_lbl_dirs = [labels_root / sp, labels_root / "labels" / sp]
    lbl_dir = next((d for d in cand_lbl_dirs if d.exists()), None)
    cand_img_dirs = [images_root / sp, images_root / "images" / sp]
    img_dir = next((d for d in cand_img_dirs if d.exists()), None)
    if lbl_dir and img_dir:
        shutil.copytree(img_dir, out_img_root / sp, dirs_exist_ok=True)
        shutil.copytree(lbl_dir, out_lbl_root / sp, dirs_exist_ok=True)

# If no split structure was copied, put every file into the 'train' split.
# Path.glob() returns a generator, and a generator object is always truthy, so
# each split has to be probed for an actual first match; testing the generator
# itself would make this fallback unreachable.
has_split_labels = any(
    next((out_lbl_root / sp).glob("**/*.txt"), None) is not None
    for sp in splits
)
if not has_split_labels:
    ensure_dir(out_img_root / "train"); ensure_dir(out_lbl_root / "train")
    # Copy all images. Files are flattened by name, so two files with the same
    # name in different subfolders overwrite each other.
    image_suffixes = (".jpg", ".jpeg", ".png")
    for p in images_root.rglob("*.*"):
        if p.suffix.lower() in image_suffixes:
            shutil.copy2(p, out_img_root / "train" / p.name)
    # Copy all labels.
    for p in labels_root.rglob("*.txt"):
        shutil.copy2(p, out_lbl_root / "train" / p.name)

# Re-scan the label files from OUTPUT (so split naming stays consistent).
out_all_lbl_files = list(out_lbl_root.rglob("*.txt"))

# Index once: class id -> label files that contain at least one box of that
# class. Each file is opened a single time and closed deterministically; blank
# or malformed lines are skipped instead of raising.
files_by_class = {}
for lp in out_all_lbl_files:
    class_ids_in_file = set()
    with open(lp, "r") as label_fh:
        for line in label_fh:
            fields = line.split()
            if not fields:
                continue
            try:
                class_ids_in_file.add(int(fields[0]))
            except ValueError:
                continue
    for file_cid in class_ids_in_file:
        files_by_class.setdefault(file_cid, []).append(lp)

# ====== 5) Oversample + augmentation ======
# NOTE(review): one new image is written per missing annotation, i.e.
# (max_count - count) files per minority class. With a large imbalance this can
# fill the disk; consider capping the number of generated samples.
for cid, count in label_counts.items():
    if count == 0:
        print(f"⚠ Lewati kelas {cid} ({names[cid] if cid < len(names) else cid}), tidak ada data sumber.")
        continue
    if count >= max_count:
        continue

    # Label files containing this class (taken from OUTPUT so split paths hold).
    files_for_cid = files_by_class.get(cid, [])
    if not files_for_cid:
        print(f"⚠ Tidak menemukan file untuk kelas {cid}, lewati.")
        continue

    needed = max_count - count
    print(f"Menambah kelas {cid} ({names[cid] if cid < len(names) else cid}): {needed} sampel tambahan")

    for i in range(needed):
        src_lbl = random.choice(files_for_cid)
        # Determine the split and the source image path.
        src_rel = rel_from_labels_root(src_lbl, out_lbl_root)
        sp = split_of_label(src_rel) or "train"
        src_img = find_image_for_label(src_lbl, out_lbl_root, out_img_root)
        if src_img is None or not src_img.exists():
            continue

        # Build the new output names.
        base = src_img.stem
        out_img_dir = out_img_root / sp
        out_lbl_dir = out_lbl_root / sp
        ensure_dir(out_img_dir); ensure_dir(out_lbl_dir)
        out_img = out_img_dir / f"{base}_aug_{cid}_{i}.jpg"
        out_lbl = out_lbl_dir / f"{base}_aug_{cid}_{i}.txt"

        # 50%: try copy-paste onto a random target image, saved as a new file.
        ok = False
        if random.random() >= 0.5:
            tgt_lbl = random.choice(out_all_lbl_files)
            tgt_img = find_image_for_label(tgt_lbl, out_lbl_root, out_img_root)
            if tgt_img is not None and tgt_img.exists():
                ok = copy_paste_object_to_new_file(src_img, src_lbl, tgt_img, tgt_lbl, out_img, out_lbl, cid)

        # Other 50%, or whenever copy-paste was not possible / failed:
        # photometric augmentation. The source labels can be copied unchanged
        # because a photometric change does not move the boxes.
        if not ok:
            img_aug = augment_image_photometric(src_img)
            img_aug.save(out_img, quality=90)
            shutil.copy2(src_lbl, out_lbl)

print(f"\n✅ Selesai. Dataset seimbang + augmentasi tersimpan di: {output_dir}")


Please upload images.zip, labels.zip, and data.yaml if not already present in /content.
Jumlah anotasi per kelas:
0 (rumpun_padi_sehat): 3927
1 (rumpun_padi_tidak_sehat): 212
2 (tanaman_liar): 93
Menambah kelas 1 (rumpun_padi_tidak_sehat): 3715 sampel tambahan
Menambah kelas 2 (tanaman_liar): 3834 sampel tambahan


OSError: [Errno 28] No space left on device

# Train YOLO using Ultralytics

In [1]:
# Show the GPU attached to this runtime before training.
!nvidia-smi

Fri Sep 12 14:14:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# ==============================================================================
# 1. INSTALASI DAN IMPORT LIBRARY
# ==============================================================================
print("🚀 Menginstal library yang dibutuhkan...")
!pip install ultralytics webdavclient3 -q

import os
import random
import yaml
from getpass import getpass
from pathlib import Path
from tqdm.notebook import tqdm
from google.colab import userdata
import webdav3.client as wc

print("\n✅ Library berhasil diinstal dan diimpor.")

# ==============================================================================
# 2. NEXTCLOUD CONNECTION SETUP
# ==============================================================================
print("\n☁️ Menghubungkan ke Nextcloud...")
try:
    # Credentials come from the Colab secret store.
    NEXTCLOUD_HOSTNAME = userdata.get("NEXTCLOUD_HOSTNAME")
    NEXTCLOUD_USERNAME = userdata.get("NEXTCLOUD_USERNAME")
    NEXTCLOUD_PASSWORD = userdata.get("NEXTCLOUD_PASSWORD")

    options = dict(
        webdav_hostname=NEXTCLOUD_HOSTNAME,
        webdav_login=NEXTCLOUD_USERNAME,
        webdav_password=NEXTCLOUD_PASSWORD,
    )
    client = wc.Client(options)
    client.verify = True  # Set to False if you have SSL certificate issues

    # Listing the root directory doubles as a connection test.
    client.list("/")
    print("✅ Koneksi ke Nextcloud berhasil!")
except Exception as exc:
    print(f"❌ Gagal terhubung ke Nextcloud. Error: {exc}")
    # Stop the cell here: nothing below can work without a connection.
    raise SystemExit("Koneksi Nextcloud gagal, periksa kembali secret Anda.")

🚀 Menginstal library yang dibutuhkan...
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for webdavclient3 (setup.py) ... [?25l[?25hdone

✅ Library berhasil diinstal dan diimpor.

☁️ Menghubungkan ke Nextcloud...
✅ Koneksi ke Nextcloud berhasil!


In [3]:
# ==============================================================================
# CHECK THE ORIGINAL FOLDER STRUCTURE ON NEXTCLOUD
# ==============================================================================
# Change this path if your base path is different. It is defined once, so the
# banner below always names the folder that is actually listed.
base_folder_path = "/tbe/nanda/fotocompressed/images/train/"
print(f"📂 Mengecek isi dari folder '{base_folder_path}' di Nextcloud...")

try:
    # List every file and folder inside the base path.
    folder_contents = client.list(base_folder_path)

    if not folder_contents:
        print("   ⚠️ Folder kosong atau tidak dapat diakses.")
    else:
        print("   ✅ Isi folder ditemukan:")
        for item in folder_contents:
            # Strip the base path from the item name, if present, for readability.
            clean_item_name = item.replace(base_folder_path, "", 1).lstrip("/")
            print(f"     - {clean_item_name}")

except Exception as e:
    print(f"   ❌ Gagal mengecek isi folder. Error: {e}")

📂 Mengecek isi dari folder '/tbe/nanda/fotocompressed/images/train/' di Nextcloud...
   ✅ Isi folder ditemukan:
     - train/
     - .DS_Store
     - DJI_Photo_14 Agustus 2025 - Kamurang_DJI_20250814105332_0475_D.jpeg
     - DJI_Photo_14 Agustus 2025 - Kamurang_DJI_20250814110021_0500_D.jpeg
     - DJI_Photo_23 Juli Pasir Tanjung_Monitoring Tanaman_DJI_20250723151414_0012_D.jpeg
     - DJI_Photo_27 Agustus Pasir Tanjung_DJI_20250827140859_0321_D.jpeg
     - DJI_Photo_27 Agustus Pasir Tanjung_DJI_20250827140913_0322_D.jpeg
     - DJI_Photo_27 Agustus Pasir Tanjung_DJI_20250827140922_0323_D.jpeg
     - DJI_Photo_27 Agustus Pasir Tanjung_DJI_20250827140927_0324_D.jpeg
     - DJI_Photo_27 Agustus Pasir Tanjung_DJI_20250827140934_0325_D.jpeg
     - DJI_Photo_27 Agustus Pasir Tanjung_DJI_20250827140941_0326_D.jpeg
     - DJI_Photo_27 Agustus Pasir Tanjung_DJI_20250827140948_0327_D.jpeg
     - DJI_Photo_27 Agustus Pasir Tanjung_DJI_20250827141000_0329_D.jpeg
     - DJI_Photo_27 Agustus Pasir 

In [4]:
# ==============================================================================
# FUNGSI SINKRONISASI (VERSI FINAL DENGAN LOGIKA STRING)
# ==============================================================================
def sync_nextcloud_folder(client, remote_path, local_path):
    """Download every file directly inside a remote Nextcloud folder to a local folder.

    Args:
        client: WebDAV client exposing ``check``, ``list`` and ``download_sync``.
        remote_path: Remote folder path on the server.
        local_path: Local destination folder (``str`` or ``Path``); created if missing.

    Returns:
        None. Progress and problems are reported with ``print``; any exception
        raised while syncing is caught and reported (best effort), not re-raised.

    Files that already exist locally are skipped. Each file is first downloaded
    to a temporary ``<name>.part`` file and renamed only after the transfer
    finished, so an interrupted download never leaves a truncated file that a
    later run would mistake for a complete one.
    """
    local_path = Path(local_path)
    local_path.mkdir(parents=True, exist_ok=True)

    print(f"   Mulai sinkronisasi dari: {remote_path}")
    try:
        # Make sure the remote directory really exists first.
        if not client.check(remote_path):
            print(f"   ❌ GAGAL: Folder remote '{remote_path}' tidak dapat ditemukan di server.")
            return

        all_items_in_remote = client.list(remote_path)

        # Directories are recognised by their trailing '/', which avoids an
        # extra client.is_dir() request per entry.
        files_to_download = [f for f in all_items_in_remote if not f.endswith('/')]

        if not files_to_download:
            print(f"   ⚠️ Peringatan: Tidak ada file yang ditemukan di dalam {remote_path}")
            return

        remote_dir = remote_path.rstrip('/')
        for remote_file_path in tqdm(files_to_download, desc=f"  -> Downloading {Path(remote_path).name}"):
            # Listed names are joined onto the folder path to get the server path.
            full_remote_path = f"{remote_dir}/{remote_file_path}"

            file_name = os.path.basename(remote_file_path)
            local_file_path = local_path / file_name
            if local_file_path.exists():
                continue

            partial_path = local_path / f"{file_name}.part"
            try:
                client.download_sync(remote_path=full_remote_path, local_path=str(partial_path))
                partial_path.replace(local_file_path)
            finally:
                # No-op after a successful rename; removes the leftover on failure.
                partial_path.unlink(missing_ok=True)

        print(f"   ✅ Selesai: {len(files_to_download)} file disinkronkan ke {local_path}")
    except Exception as e:
        print(f"   ❌ Gagal saat sinkronisasi {remote_path}. Error: {e}")


# ==============================================================================
# PATH LOCATIONS ON NEXTCLOUD AND IN COLAB
# ==============================================================================
print("\n📂 Mempersiapkan struktur folder dan mengunduh dataset...")

# Remote side (Nextcloud)
remote_base = "/tbe/nanda/fotocompressed"
remote_paths = {
    "images_train": f"{remote_base}/images/train",
    "labels_train": f"{remote_base}/labels/train",
    "images_val": f"{remote_base}/images/val",
    "labels_val": f"{remote_base}/labels/val",
    "classes_file": f"{remote_base}/keseragaman_rumpun_padi.txt"
}

# Local side (Colab). The four dataset entries are Path objects; the class file
# entry is a plain string.
local_base = Path("/content/dataset_padi")
local_paths = {
    "images_train": local_base / "images/train",
    "labels_train": local_base / "labels/train",
    "images_val": local_base / "images/val",
    "labels_val": local_base / "labels/val",
    "classes_file": "/content/classes.txt"
}

# ==============================================================================
# DOWNLOAD
# ==============================================================================
# Mirror each dataset folder, in the same order as the tables above.
for dataset_key in ("images_train", "labels_train", "images_val", "labels_val"):
    sync_nextcloud_folder(client, remote_paths[dataset_key], local_paths[dataset_key])

# The class list is a single file, so it is fetched directly.
try:
    client.download_sync(
        remote_path=remote_paths["classes_file"],
        local_path=str(local_paths["classes_file"]),
    )
    print(f"   ✅ File class berhasil diunduh ke {local_paths['classes_file']}")
except Exception as exc:
    print(f"   ❌ Gagal mengunduh file class. Error: {exc}")


print("\n✅ Semua data training dan validasi telah diunduh dan distrukturkan.")


📂 Mempersiapkan struktur folder dan mengunduh dataset...
   Mulai sinkronisasi dari: /tbe/nanda/fotocompressed/images/train


  -> Downloading train:   0%|          | 0/33 [00:00<?, ?it/s]

   ✅ Selesai: 33 file disinkronkan ke /content/dataset_padi/images/train
   Mulai sinkronisasi dari: /tbe/nanda/fotocompressed/labels/train


  -> Downloading train:   0%|          | 0/33 [00:00<?, ?it/s]

   ✅ Selesai: 33 file disinkronkan ke /content/dataset_padi/labels/train
   Mulai sinkronisasi dari: /tbe/nanda/fotocompressed/images/val


  -> Downloading val:   0%|          | 0/10 [00:00<?, ?it/s]

   ✅ Selesai: 10 file disinkronkan ke /content/dataset_padi/images/val
   Mulai sinkronisasi dari: /tbe/nanda/fotocompressed/labels/val


  -> Downloading val:   0%|          | 0/10 [00:00<?, ?it/s]

   ✅ Selesai: 10 file disinkronkan ke /content/dataset_padi/labels/val
   ✅ File class berhasil diunduh ke /content/classes.txt

✅ Semua data training dan validasi telah diunduh dan distrukturkan.


In [5]:
# ==============================================================================
# FUNGSI UNTUK MENGANALISIS DISTRIBUSI KELAS
# ==============================================================================
from collections import defaultdict
import pandas as pd

def analyze_class_distribution(label_dir_path):
    """Count annotated instances per class across the label files of one directory.

    Args:
        label_dir_path: Directory holding ``*.txt`` label files, given as a
            ``str`` or a ``Path``. Only files directly inside it are read.

    Returns:
        A ``defaultdict(int)`` mapping class id (the first integer on each
        label line) to the number of lines carrying that id. It is empty when
        the directory does not exist, in which case a warning is printed.

    Blank lines and lines whose first field is not an integer are ignored.
    """
    # Local import keeps the function usable on its own in a fresh cell.
    from pathlib import Path

    label_dir = Path(label_dir_path)
    class_counts = defaultdict(int)
    if not label_dir.exists():
        print(f"⚠️ Peringatan: Direktori tidak ditemukan: {label_dir_path}")
        return class_counts

    for label_file in label_dir.glob('*.txt'):
        with open(label_file, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank line.
                    continue
                try:
                    # The class id is the first number on each line.
                    class_counts[int(fields[0])] += 1
                except ValueError:
                    # Malformed line: skip it.
                    continue
    return class_counts

# ==============================================================================
# RUN THE ANALYSIS AND SHOW THE RESULT
# ==============================================================================
print("\n📊 Menganalisis distribusi kelas pada dataset lokal...")

try:
    # Class names come from the class file downloaded earlier, one per line.
    with open(local_paths["classes_file"], 'r') as f:
        stripped_lines = (line.strip() for line in f)
        class_names = [name for name in stripped_lines if name]
    class_name_map = dict(enumerate(class_names))

    # Per-class counts for the train and val label folders.
    train_counts = analyze_class_distribution(local_paths["labels_train"])
    val_counts = analyze_class_distribution(local_paths["labels_val"])

    # Every class id seen in either split, in ascending order.
    all_class_ids = sorted(set(train_counts) | set(val_counts))

    # One column per key, one row per class id, ready for pandas.
    distribution_data = {
        'Class ID': list(all_class_ids),
        'Class Name': [class_name_map.get(cid, 'Unknown') for cid in all_class_ids],
        'Training Set Count': [train_counts.get(cid, 0) for cid in all_class_ids],
        'Validation Set Count': [val_counts.get(cid, 0) for cid in all_class_ids]
    }

    # Render the table.
    df = pd.DataFrame(distribution_data)
    print("\n✅ Analisis Selesai. Berikut adalah distribusinya:")
    print("--------------------------------------------------")
    print(df.to_string(index=False))
    print("--------------------------------------------------")

except Exception as exc:
    print(f"❌ Gagal menganalisis distribusi kelas. Error: {exc}")


📊 Menganalisis distribusi kelas pada dataset lokal...

✅ Analisis Selesai. Berikut adalah distribusinya:
--------------------------------------------------
 Class ID              Class Name  Training Set Count  Validation Set Count
        0       rumpun_padi_sehat                3123                   804
        1 rumpun_padi_tidak_sehat                 209                     3
        2            tanaman_liar                  82                    11
--------------------------------------------------


In [6]:
# ==============================================================================
# READ THE CLASS NAMES FROM FILE
# ==============================================================================
print("📝 Membuat file konfigurasi dataset (data.yaml)...")

try:
    with open(local_paths["classes_file"], 'r') as f:
        stripped_lines = (line.strip() for line in f)
        class_names = [name for name in stripped_lines if name]

    num_classes = len(class_names)
    print(f"   Jumlah kelas terdeteksi: {num_classes}")
    print(f"   Nama kelas: {class_names}")

    # ==============================================================================
    # BUILD THE YAML CONTENT
    # ==============================================================================
    # Key order is kept in the written file because sort_keys=False.
    dataset_config = dict(
        train=str(local_paths["images_train"].resolve()),
        val=str(local_paths["images_val"].resolve()),
        nc=num_classes,
        names=class_names,
    )

    config_path = local_base / "data.yaml"
    with open(config_path, 'w') as config_file:
        yaml.dump(dataset_config, config_file, default_flow_style=False, sort_keys=False)

    print(f"\n✅ File konfigurasi berhasil dibuat di: {config_path}")

    # Echo the file back for a visual check.
    print("\nIsi file data.yaml:")
    print("---------------------")
    print(config_path.read_text())

except Exception as exc:
    print(f"❌ Gagal membuat file konfigurasi. Error: {exc}")

📝 Membuat file konfigurasi dataset (data.yaml)...
   Jumlah kelas terdeteksi: 3
   Nama kelas: ['rumpun_padi_sehat', 'rumpun_padi_tidak_sehat', 'tanaman_liar']

✅ File konfigurasi berhasil dibuat di: /content/dataset_padi/data.yaml

Isi file data.yaml:
---------------------
train: /content/dataset_padi/images/train
val: /content/dataset_padi/images/val
nc: 3
names:
- rumpun_padi_sehat
- rumpun_padi_tidak_sehat
- tanaman_liar



In [7]:
# ==============================================================================
# MENJALANKAN TRAINING SINGKAT
# ==============================================================================
print("\n🏋️‍♂️ Memulai baseline training (sanity check)...")
print("Model: yolo11s.pt")
print("Epochs: 50")
print("Image Size: 640x640")
print("--------------------------------------------------")

!yolo task=detect mode=train model=yolo11s.pt data={local_base}/data.yaml epochs=50 imgsz=640 project=runs/padi_detection name=baseline_sanity_check

print("\n✅ Sanity check selesai!")
print("Jika tidak ada error, berarti konfigurasi Anda sudah benar.")
print("Hasil training dapat dilihat di folder 'runs/padi_detection/baseline_sanity_check'")


🏋️‍♂️ Memulai baseline training (sanity check)...
Model: yolo11s.pt
Epochs: 5
Image Size: 640x640
--------------------------------------------------
Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11s.pt to 'yolo11s.pt': 100% ━━━━━━━━━━━━ 18.4MB 87.4MB/s 0.2s
Ultralytics 8.3.199 🚀 Python-3.12.11 torch-2.8.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/content/dataset_padi/dat

In [10]:
# Validate the best checkpoint of the baseline run against the dataset in data.yaml.
# NOTE(review): max_det=2000 presumably raises the per-image detection limit because
# these images contain many objects each — confirm the value is still needed.
!yolo val model=/content/runs/padi_detection/baseline_sanity_check/weights/best.pt data={local_base}/data.yaml max_det=2000

Ultralytics 8.3.199 🚀 Python-3.12.11 torch-2.8.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
YOLO11s summary (fused): 100 layers, 9,413,961 parameters, 0 gradients, 21.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1652.5±270.1 MB/s, size: 5036.0 KB)
[K[34m[1mval: [0mScanning /content/dataset_padi/labels/val.cache... 9 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 9/9 164.8Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 1/1 0.3it/s 3.3s
                   all          9        818      0.757      0.121     0.0843     0.0315
     rumpun_padi_sehat          9        804       0.27      0.363      0.229     0.0866
rumpun_padi_tidak_sehat          1          3          1          0     0.0238    0.00784
          tanaman_liar          3         11          1          0          0          0
Speed: 0.2ms preprocess, 18.8ms inference, 0.0ms loss, 22.2ms postprocess per image
Results saved to [

In [8]:
# ==============================================================================
# 1. COLLECT THE TRAIN/VAL FILE NAMES TO AVOID DATA LEAKAGE
# ==============================================================================
print("🧪 Mempersiapkan test set...")
# File stems (name without extension) of everything in the train and val image folders.
train_val_files = {
    entry.stem
    for folder in (local_paths["images_train"], local_paths["images_val"])
    for entry in folder.glob('*')
}

print(f"   Ditemukan {len(train_val_files)} file unik di set train/val.")

# ==============================================================================
# 2. MENCARI DAN MEMILIH FILE TEST DARI FOLDER /tbe/ato/FAILED
# ==============================================================================
# Langsung menargetkan folder FAILED
remote_test_folder = "/tbe/ato/FAILED"
print(f"   Mencari kandidat file test di Nextcloud path: {remote_test_folder}")

try:
    # Ambil daftar semua file gambar langsung dari folder target
    images_in_folder = [f for f in client.list(remote_test_folder) if any(f.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png'])]

    if not images_in_folder:
        raise FileNotFoundError(f"Tidak ada file gambar yang ditemukan di {remote_test_folder}")

    print(f"   Ditemukan {len(images_in_folder)} total gambar di folder FAILED.")

    # Lakukan filter untuk menghindari data leakage
    valid_test_files = [img_path for img_path in images_in_folder if Path(img_path).stem not in train_val_files]

    if not valid_test_files:
        raise FileNotFoundError("Tidak ada gambar yang valid untuk test (semua gambar sudah ada di train/val set).")

    print(f"   Ditemukan {len(valid_test_files)} kandidat file test yang valid setelah filtering.")

    # Tentukan jumlah gambar yang akan diambil (maksimal 10)
    num_to_sample = min(10, len(valid_test_files))
    if len(valid_test_files) < 10:
        print(f"   ⚠️ Peringatan: Hanya ditemukan {len(valid_test_files)} file valid. Mengambil semua file yang ada.")

    # Ambil sampel acak dari daftar file yang valid
    selected_test_files = random.sample(valid_test_files, num_to_sample)
    print(f"   Memilih {len(selected_test_files)} file secara acak untuk diuji.")

    # ==============================================================================
    # 3. DOWNLOAD FILE TEST YANG SUDAH DIPILIH
    # ==============================================================================
    local_test_path = local_base / "images/test"
    local_test_path.mkdir(exist_ok=True, parents=True)

    for remote_file in tqdm(selected_test_files, desc="   -> Downloading test images"):
        client.download_sync(remote_path=remote_file, local_path=str(local_test_path / Path(remote_file).name))
    print(f"   ✅ {len(selected_test_files)} file test berhasil diunduh ke {local_test_path}")

    # ==============================================================================
    # 4. JALANKAN PREDIKSI/INFERENCE
    # ==============================================================================
    print("\n🔍 Menjalankan inference pada test set...")
    best_model_path = "runs/padi_detection/baseline_sanity_check/weights/best.pt"

    !yolo task=detect mode=predict model={best_model_path} source={local_test_path}

    print("\n🎉 Inference selesai!")
    print(f"Hasil prediksi dengan bounding box disimpan di dalam folder 'runs/padi_detection/'")

except Exception as e:
    print(f"❌ Gagal mempersiapkan test set atau menjalankan inference. Error: {e}")

🧪 Mempersiapkan test set...
   Ditemukan 42 file unik di set train/val.
   Mencari kandidat file test di Nextcloud path: /tbe/ato/FAILED
   Ditemukan 1045 total gambar di folder FAILED.
   Ditemukan 1045 kandidat file test yang valid setelah filtering.
   Memilih 10 file secara acak untuk diuji.


   -> Downloading test images:   0%|          | 0/10 [00:00<?, ?it/s]

❌ Gagal mempersiapkan test set atau menjalankan inference. Error: Remote resource: /DJI_Photo_6 Agustus Linggarsari_Monitoring Tanaman_DJI_20250806143753_0936_D.jpeg not found
