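# Downloading from metadata and directly uploading to Roboflow

This notebook reads a GBIF metadata file from Google Drive, filters the records by license, downloads the image behind each record's URL, and uploads the images to a Roboflow project without keeping a permanent local copy.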



### Step 1: Install and import

In [49]:
#!pip install roboflow
import csv
import requests
import time
import tempfile
import os
import random
from io import BytesIO
from roboflow import Roboflow
from google.colab import drive
from concurrent.futures import ThreadPoolExecutor, as_completed

### Step 2: Mount Google Drive in Colab

In [21]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


### Step 3: Configuration

In [31]:
# Tab-delimited GBIF metadata export stored on the mounted Google Drive.
METADATA_FILE = "/content/drive/MyDrive/hornet_project/data/metadata/hornets_metadata.csv"

# The Roboflow API key is a secret and must not be stored in the notebook.
# Provide it through the environment before running this cell, for example:
#   os.environ["ROBOFLOW_API_KEY"] = getpass.getpass("Roboflow API key: ")
# NOTE(review): a key was previously committed in this cell; it should be
# rotated in the Roboflow account settings.
ROBOFLOW_API_KEY = os.environ.get("ROBOFLOW_API_KEY", "")
if not ROBOFLOW_API_KEY:
    print("WARNING: ROBOFLOW_API_KEY environment variable is not set; "
          "set it before connecting to Roboflow.")
WORKSPACE_ID = "hornets-kfaxc"
PROJECT_ID = "project-hornet-detection-bbpor"
BATCH_NAME = "hornet_upload_batch"

# License types to allow.
# Each entry is matched as a substring of the lower-cased `license` field.
ALLOWED_LICENSES = {
    "cc0",
    "by/4.0", "by/3.0", "by/2.5", "by/2.0",
    "by-sa/4.0", "by-sa/3.0", "by-sa/2.5",
    "by-nc/4.0", "by-nc/3.0", "by-nc/2.5",
    "by-nc-sa/4.0", "by-nc-sa/3.0", "by-nc-sa/2.5",
}


### Step 4: Define functions

In [40]:
# ==================================================
# DATA LOADING AND FILTERING
# ==================================================
def load_metadata(csv_file):
    """Load a tab-delimited GBIF Darwin Core metadata file.

    Args:
        csv_file: Path to the metadata file. The first row must be the
            header; fields are separated by tab characters.

    Returns:
        A list with one dict per data row, keyed by the header names.
        If the file does not exist, an error is printed and an empty list
        is returned.
    """
    try:
        # newline='' hands line-ending handling to the csv module (as its
        # documentation requires); otherwise line breaks embedded in quoted
        # fields are silently rewritten by universal-newline translation.
        with open(csv_file, 'r', encoding='utf-8', newline='') as f:
            records = list(csv.DictReader(f, delimiter='\t'))
    except FileNotFoundError:
        print(f"❌ Error: {csv_file} not found.")
        return []

    print(f"Loaded {len(records)} records from {csv_file}")
    return records

def filter_by_license(records, allowed=None):
    """Keep records whose license is on the allow-list or is missing.

    A record is kept when its lower-cased 'license' value contains any of
    the allowed substrings, or when the value is empty / absent.

    NOTE(review): records without any license are kept as well; confirm
    that this is intended, since a missing license is not a usage grant.

    Args:
        records: Iterable of dict-like rows (e.g. from csv.DictReader).
        allowed: Optional iterable of lower-case substrings to look for in
            the license value. Defaults to the module-level
            ALLOWED_LICENSES.

    Returns:
        A list of the kept records, in their original order.
    """
    if allowed is None:
        allowed = ALLOWED_LICENSES
    # Materialise once so a generator argument survives the per-record scan.
    allowed = tuple(allowed)

    filtered = []
    for r in records:
        # DictReader pads short rows with None, so a present key may still
        # hold None; treat that the same as an empty license.
        license_url = (r.get('license') or '').lower()
        if license_url == '' or any(p in license_url for p in allowed):
            filtered.append(r)

    print(f"Filtered to {len(filtered)} permissively licensed records")
    return filtered

def get_image_url(record):
    """Return the record's image URL with surrounding whitespace removed.

    The URL is read from the 'identifier' entry. An empty string is
    returned when the entry is missing, None, or blank.
    """
    # `or ''` covers a key that is present but holds None (short CSV rows).
    return (record.get('identifier') or '').strip()

# ==================================================
# ROBOFLOW UPLOAD
# ==================================================
def init_roboflow():
    """Connect to Roboflow and return the project configured for uploads.

    Uses ROBOFLOW_API_KEY to create the client, then looks up PROJECT_ID
    inside WORKSPACE_ID. Progress is reported on stdout.
    """
    print("\nConnecting to Roboflow...")

    client = Roboflow(api_key=ROBOFLOW_API_KEY)
    workspace = client.workspace(WORKSPACE_ID)
    target_project = workspace.project(PROJECT_ID)

    print(f"✅ Connected to: {WORKSPACE_ID}/{PROJECT_ID}")
    return target_project

def upload_image_from_url(project, url, idx, total):
    """Download one image, stage it in a temp file, upload it, and clean up.

    Args:
        project: Roboflow project object exposing ``upload(...)``.
        url: Address of the image to download.
        idx: 1-based position of this image in the run; used in progress
            messages and passed to the upload as ``sequence_number``.
        total: Number of images in the run; used in progress messages and
            passed to the upload as ``sequence_size``.

    Returns:
        True when the image was downloaded and uploaded, False otherwise.
        Failures are reported on stdout instead of being raised.
    """
    tmp_path = None
    try:
        # Keep the download in its own handler so that request errors raised
        # later, during the upload, are not mislabelled as download failures.
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"⚠️ [{idx}/{total}] Download failed: {e}")
            return False

        # delete=False: the upload call takes a file path, so the file has to
        # outlive this `with` block; it is removed in the `finally` below.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(response.content)

        # Upload from the (now closed) temporary file.
        project.upload(
            image_path=tmp_path,
            batch_name=BATCH_NAME,
            split="train",
            num_retry_uploads=3,
            tag_names=["hornet"],
            sequence_number=idx,
            sequence_size=total
        )

        print(f"✅ [{idx}/{total}] Uploaded: {url[:60]}...")
        return True

    except Exception as e:
        print(f"⚠️ [{idx}/{total}] Upload failed: {e}")
        return False

    finally:
        # Always remove the staged file, including when the upload raised;
        # previously a failed upload left the temp file behind.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not turn a
                # finished upload attempt into a crash.
                pass

### Step 5: Download and upload

In [48]:


# ==================================================
# MAIN
# ==================================================

def main(n=100, start_index=0, randomize=False):
    """Load the metadata, filter it, and upload the selected images in parallel.

    Args:
        n: Maximum number of images to upload. ``None`` uploads every
            available image URL.
        start_index: Number of license-filtered records to skip before
            collecting image URLs.
        randomize: When True, shuffle the collected URLs before taking the
            first ``n``.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    print("GBIF Hornet Downloader → Roboflow Uploader (parallel, no local save)")
    print("=" * 70)

    records = load_metadata(METADATA_FILE)
    if not records:
        return

    records = filter_by_license(records)
    project = init_roboflow()

    # Collect non-empty image URLs from the filtered records (one lookup each).
    urls = [url for url in map(get_image_url, records[start_index:]) if url]

    # Shuffle if requested
    if randomize:
        random.shuffle(urls)

    # Take only n images if n is set
    if n is not None:
        urls = urls[:n]

    # Derive the denominator from what will really be processed. Computing it
    # as min(n, len(records)) crashed for n=None and overstated the count when
    # start_index or missing URLs left fewer than n images.
    total = len(urls)

    uploaded = 0
    failed = 0

    # Control how many threads (5–10 is usually safe)
    MAX_WORKERS = 5

    print(f"Starting parallel upload of {len(urls)} images using {MAX_WORKERS} threads...\n")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {
            executor.submit(upload_image_from_url, project, url, idx, total): url
            for idx, url in enumerate(urls, start=1)
        }

        # Tally results as workers finish; completion order is not submission order.
        for future in as_completed(futures):
            try:
                if future.result():
                    uploaded += 1
                else:
                    failed += 1
            except Exception as e:
                print(f"⚠️ Unexpected error: {e}")
                failed += 1

    print("\n" + "=" * 70)
    print("✅ Upload complete!")
    print(f"Uploaded: {uploaded}")
    print(f"Failed: {failed}")
    print(f"Total processed: {uploaded + failed}")



### Step 6: Run the functions

In [50]:
# NOTE(review): image_count is not passed to main() below; the call uses its
# own literal n=200. Confirm which value is intended.
image_count = 100

# Skip the first 100 filtered records, shuffle the remaining URLs, upload up to 200.
main(n = 200, start_index = 100, randomize=True)

GBIF Hornet Downloader → Roboflow Uploader (parallel, no local save)
Loaded 167847 records from /content/drive/MyDrive/hornet_project/data/metadata/hornets_metadata.csv
Filtered to 76384 permissively licensed records

Connecting to Roboflow...
loading Roboflow workspace...
loading Roboflow project...
✅ Connected to: hornets-kfaxc/project-hornet-detection-bbpor
Starting parallel upload of 200 images using 5 threads...

✅ [4/200] Uploaded: https://observation.org/photos/56491045.jpg...
✅ [3/200] Uploaded: https://observation.org/photos/17101872.jpg...
✅ [2/200] Uploaded: https://inaturalist-open-data.s3.amazonaws.com/photos/247521...
✅ [1/200] Uploaded: https://inaturalist-open-data.s3.amazonaws.com/photos/326041...
✅ [5/200] Uploaded: https://inaturalist-open-data.s3.amazonaws.com/photos/474445...
✅ [7/200] Uploaded: https://inaturalist-open-data.s3.amazonaws.com/photos/113635...
✅ [6/200] Uploaded: https://inaturalist-open-data.s3.amazonaws.com/photos/554310...
✅ [10/200] Uploaded: htt