In [245]:
import geopandas as gpd 

In [246]:
tiles = gpd.read_file("../data/tiles.geojson")
trees_data = gpd.read_file("../data/trees_box.geojson")
trees_data


Unnamed: 0,osm_id,natural,species,species_mapped,lon,lat,geometry
0,5269771172,tree,Mangifera indica,Mango,-1.951858e+07,-2.402352e+06,"POLYGON ((-19518575.399 -2402355.495, -1951857..."
1,5269771173,tree,Mangifera indica,Mango,-1.951861e+07,-2.402405e+06,"POLYGON ((-19518607.203 -2402408.492, -1951860..."
2,5269771174,tree,Mangifera indica,Mango,-1.951861e+07,-2.402384e+06,"POLYGON ((-19518603.763 -2402387.303, -1951860..."
3,5269771175,tree,Mangifera indica,Mango,-1.951860e+07,-2.402469e+06,"POLYGON ((-19518601.08 -2402471.5, -19518601.0..."
4,5269771176,tree,Mangifera indica,Mango,-1.951860e+07,-2.402552e+06,"POLYGON ((-19518593.622 -2402554.803, -1951859..."
...,...,...,...,...,...,...,...
10626,5322704745,tree,Cocos nucifera,Coconut,-1.951967e+07,-2.404097e+06,"POLYGON ((-19519668.812 -2404099.661, -1951966..."
10627,5322704746,tree,Cocos nucifera,Coconut,-1.951964e+07,-2.404093e+06,"POLYGON ((-19519641.94 -2404095.64, -19519641...."
10628,5322704747,tree,Cocos nucifera,Coconut,-1.951963e+07,-2.404084e+06,"POLYGON ((-19519629.995 -2404086.977, -1951962..."
10629,5322704748,tree,Cocos nucifera,Coconut,-1.951964e+07,-2.404083e+06,"POLYGON ((-19519639.246 -2404086.226, -1951963..."


In [247]:
trees_data = trees_data[trees_data['species_mapped'] == 'Coconut'] # only keep coconut on the training

In [248]:
tiles.to_crs(epsg=4326, inplace=True)
trees_data.to_crs(epsg=4326, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [249]:
import re

def parse_tile_id(tile_id_str):
    match = re.match(r"Tile\(x=(\d+), y=(\d+), z=(\d+)\)", tile_id_str)
    if match:
        return match.groups()
    raise ValueError(f"Cannot parse tile ID: {tile_id_str}")

### converts it to geojson labels for each tiles

In [250]:
import os
from pathlib import Path

def split_geojson_by_tiles(trees_gdf, tiles_gdf, output_dir, prefix="OAM"):
    """Clip trees by tiles and save as individual GeoJSON files"""
    os.makedirs(output_dir, exist_ok=True)
    
    stats = {'processed': 0, 'skipped': 0, 'errors': 0, 'total_trees': 0}
    
    for idx, tile in tiles_gdf.iterrows():
        try:
            tile_geom = tile.geometry
            tile_id = tile['id']
            
            x, y, z = parse_tile_id(tile_id)
            tile_filename = f"{prefix}-{x}-{y}-{z}.geojson"
            
            intersecting_trees = trees_gdf[trees_gdf.intersects(tile_geom)].copy()
            
            if intersecting_trees.empty:
                stats['skipped'] += 1
                continue
            
            clipped_trees = gpd.clip(intersecting_trees, tile_geom)
            
            output_path = Path(output_dir) / tile_filename
            clipped_trees.to_file(output_path, driver="GeoJSON")
            
            stats['processed'] += 1
            stats['total_trees'] += len(clipped_trees)
            
        except Exception as e:
            print(f"Error processing tile {idx}: {e}")
            stats['errors'] += 1
            continue
    
    return stats

In [251]:
stats = split_geojson_by_tiles(trees_data, tiles, "../data/labels")

In [252]:
stats

{'processed': 448, 'skipped': 103, 'errors': 0, 'total_trees': 11726}

#### geojson Labels to YOLO Format

In [253]:
from glob import glob
import rasterio
import yaml

chips_dir = Path("../data/chips")
labels_dir = Path("../data/labels")
yolo_dir = Path("../data/yolo")
yolo_dir.mkdir(exist_ok=True)

yolo_labels_dir = Path(os.path.join(yolo_dir, "labels"))

yolo_labels_dir.mkdir(exist_ok=True)

label_files = sorted(labels_dir.glob("*.geojson"))


In [254]:
len(label_files)

458

In [255]:
classes = sorted(trees_data['species_mapped'].unique())
class_to_id = {cls: idx for idx, cls in enumerate(classes)}

print("Class mapping:")
for cls, idx in class_to_id.items():
    print(f"  {idx}: {cls}")

Class mapping:
  0: Coconut


In [256]:
def geojson_to_yolo(geojson_path, image_path, class_mapping):
    trees = gpd.read_file(geojson_path)
    
    with rasterio.open(image_path) as src:
        yolo_lines = []
        for _, tree in trees.iterrows():
            species = tree.get('species_mapped', 'Unknown')
            class_id = class_mapping.get(species, 0)
            
            minx, miny, maxx, maxy = tree.geometry.bounds
            
            top_py, top_px = src.index(minx, miny)
            bottom_py, bottom_px = src.index(maxx, maxy)
            
            center_x = (top_px + bottom_px) / 2 / src.width
            center_y = (top_py + bottom_py) / 2 / src.height
            width = abs(top_px - bottom_px) / src.width
            height = abs(top_py - bottom_py) / src.height
            
            yolo_lines.append(f"{class_id} {center_x:.6f} {center_y:.6f} {width:.6f} {height:.6f}")
    
    return yolo_lines

In [257]:
converted = 0
skipped = 0

for label_file in label_files:
    stem = label_file.stem
    image_file = chips_dir / f"{stem}.tif"
    
    if not image_file.exists():
        skipped += 1
        continue
    
    yolo_lines = geojson_to_yolo(label_file, image_file, class_to_id)
    
    yolo_file = yolo_labels_dir / f"{stem}.txt"
    with open(yolo_file, 'w') as f:
        f.write('\n'.join(yolo_lines))
    
    converted += 1

#### train val split 

In [258]:
import shutil
import pandas as pd
from PIL import Image

train_dir = yolo_dir / "train"
val_dir = yolo_dir / "val"
train_dir.mkdir(exist_ok=True)
val_dir.mkdir(exist_ok=True)

data = []
for label_file in labels_dir.glob("*.geojson"):
    trees = gpd.read_file(label_file)
    if not trees.empty:
        dominant_species = trees['species_mapped'].value_counts().idxmax()
        data.append({'file': label_file.stem, 'species': dominant_species})

df = pd.DataFrame(data)
train_df = df.groupby('species', group_keys=False).apply(lambda x: x.sample(frac=0.8, random_state=42))
val_df = df.drop(train_df.index)

for split_name, split_df, target_dir in [("train", train_df, train_dir), ("val", val_df, val_dir)]:
    for stem in split_df['file']:
        with rasterio.open(chips_dir / f"{stem}.tif") as src: Image.fromarray(src.read([1,2,3]).transpose(1,2,0)).save(target_dir / f"{stem}.png")
        shutil.copy(yolo_labels_dir / f"{stem}.txt", target_dir / f"{stem}.txt")

  train_df = df.groupby('species', group_keys=False).apply(lambda x: x.sample(frac=0.8, random_state=42))


In [259]:
print(f"Train: {len(train_df)} | Val: {len(val_df)}")

print(train_df['species'].value_counts().sort_index())

print(val_df['species'].value_counts().sort_index())

Train: 366 | Val: 92
species
Banana       2
Coconut    358
Mango        6
Name: count, dtype: int64
species
Coconut    90
Mango       2
Name: count, dtype: int64


In [260]:
data_config = {
    'path': str(yolo_dir.absolute()),
    'train': 'train',
    'val': 'val',
    'names': {idx: name for name, idx in class_to_id.items()}
}

config_file = yolo_dir / "config.yaml"
with open(config_file, 'w') as f:
    yaml.dump(data_config, f, sort_keys=False)
