# Preparing the data
This notebook shows how to tile up RGB and crown data ready for training.

## Mount drive to access data and install *detectree2* package.

In [1]:
from google.colab import drive
drive.mount('/content/drive')
!pip install git+https://github.com/PatBall1/detectree2.git@jb/july24

Mounted at /content/drive
Collecting git+https://github.com/PatBall1/detectree2.git@jb/july24
  Cloning https://github.com/PatBall1/detectree2.git (to revision jb/july24) to /tmp/pip-req-build-l90b45hk
  Running command git clone --filter=blob:none --quiet https://github.com/PatBall1/detectree2.git /tmp/pip-req-build-l90b45hk
  Running command git checkout -b jb/july24 --track origin/jb/july24
  Switched to a new branch 'jb/july24'
  Branch 'jb/july24' set up to track remote branch 'jb/july24' from 'origin'.
  Resolved https://github.com/PatBall1/detectree2.git to commit 065ad16a3d2473d25e7289ba84192ecb00d2d495
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting detectron2@ git+https://github.com/facebookresearch/detectron2.git (from detectree2==1.0.8)
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-install-ivilcyfl/detectron2_8d8709dbe4d74bc9b40219542ee644b8
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/

## Set parameters for tiling

In [2]:
from detectree2.preprocessing.tiling import tile_data, to_traintest_folders
import rasterio
import geopandas as gpd
import shutil
import time

# Tiling parameters (tune these per site / data set).
# NOTE(review): buffer and tile sizes are presumably in the raster's CRS units — confirm
# against the tile_data documentation.
tile_width, tile_height = 15, 15
buffer = 15
threshold = 0.7

# Suffix used to name the output tile directory: "<tile_width>_<buffer>_<threshold>".
appends = f"{tile_width}_{buffer}_{threshold}"

# dtype_bool requires True: BCI_2019, Paracou

## Tile up the data
Function to tile up the data into manageable training chunks. This function has some issues around the encoding of the input raster. ```dtype_bool``` should be switched if black tiles are being produced. A recommended threshold is ~0.5, but it depends on the volume of available data (with abundant, dense crown data, a stricter threshold may be preferable).

In [None]:
# Show the crown rows whose geometry is invalid.
# NOTE(review): `crowns` is first assigned in the tiling cell further down
# (gpd.read_file), and that cell drops invalid rows straight after loading. On a fresh
# kernel this cell therefore raises NameError, and once the tiling cell has run it
# selects nothing — it only shows rows when run against the unfiltered crowns.
crowns[~crowns.is_valid]

Unnamed: 0,fid_1,Site,PlotOrg,PlotNum,SubPlot,LocalID,CensusYear,CodeAlive,Family,Genus_Species,...,Lianas,StartDate,EndDate,GroundValid,Creator,Comments,BaseLayer,IDStatus,DBHest,geometry
219,7471,Paracou,CIRAD,5.0,3.0,398.0,2015.0,True,Clusiaceae,Symphonia_globulifera,...,False,,,True,Greg Vincent,398.0,Lidar2016,,,"MULTIPOLYGON (((286188.347 583007.643, 286189...."
4376,3281,Paracou,External,,,,,,,NA_NA,...,,,,,Manon,,,,,"MULTIPOLYGON (((286537.890 583781.031, 286536...."


In [3]:
### PARACOU MS 2023
# Tile the Paracou 2023 multispectral orthomosaic together with its crown polygons,
# then sort the resulting tiles into train folds.
site_path = "/content/drive/MyDrive/WORK/detectree2/data/Paracou"
img_path = site_path + "/ms/20230314_ORTHO_aligned_local.tif"
crown_path = site_path + "/crowns/240808_full_ms_2023.gpkg"
# The output directory name encodes the tiling parameters held in `appends`.
out_dir = site_path + "/tilesMS_" + appends + "/"

# Remove any existing tile directory so tiles from a previous run are not mixed in.
# ignore_errors=True: a missing directory is not an error.
shutil.rmtree(out_dir, ignore_errors=True)

# Only the CRS is needed from the raster here, so open it in a context manager and
# release the file handle before the long-running tiling step.
with rasterio.open(img_path) as data:
    raster_crs_wkt = data.crs.to_wkt()

# Read in the crowns, drop invalid geometries and reproject them to the raster CRS.
# The CRS is passed as WKT rather than `data.crs.data`: the latter is the legacy
# "+init=..." dict form, which appears to be what triggered the pyproj warning
# previously emitted by this cell.
crowns = gpd.read_file(crown_path)
crowns = crowns[crowns.is_valid]
crowns = crowns.to_crs(raster_crs_wkt)

# Time the tiling with a monotonic clock so the figure is unaffected by clock changes.
start_time = time.perf_counter()
tile_data(img_path, out_dir, buffer, tile_width, tile_height, crowns, threshold, mode="ms")
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

# Split the tiles into train/test folders inside out_dir (no test fraction, 5 folds).
to_traintest_folders(out_dir, out_dir, test_frac=0, folds=5)

  in_crs_string = _prepare_from_proj_string(in_crs_string)


Elapsed time: 233.48648953437805 seconds


## Send geojson to train/test folders
Send geojsons to the train folder (with folds for k-fold cross-validation) and the test folder. Training tiles will automatically be removed if there is any overlap with a test tile.

In [None]:
# Sort the tiles in out_dir into train/test folders, written back into out_dir.
# With test_frac=0.0 and folds=5, presumably no tiles are held out for testing and the
# training tiles are divided into five folds — confirm against the detectree2 docs.
# NOTE(review): the tiling cell above already ends with this same call (test_frac=0),
# so this cell repeats that step when the notebook is run top to bottom.
# to_traintest_folders is imported in the parameter cell near the top of the notebook.
to_traintest_folders(out_dir, out_dir, test_frac=0.0, folds=5)

## Visualise training data

This cell needs editing so that the dataset is registered properly; this has already been fixed in the training script.

In [None]:
import rasterio
from detectron2.utils.visualizer import Visualizer
from detectree2.models.train import combine_dicts
from detectron2.data import DatasetCatalog, MetadataCatalog
from PIL import Image
import numpy as np
import cv2
import matplotlib.pyplot as plt
from IPython.display import display

# Fold passed to combine_dicts as the validation fold, and the site to visualise.
val_fold = 1
name = "Paracou"

# Location of the training tiles produced by the tiling step for this site.
tiles = f"/tilesMS_{appends}/train"
train_location = f"/content/drive/MyDrive/WORK/detectree2/data/{name}{tiles}"

# Annotation dicts for the tiles, plus the metadata entry used when drawing them.
dataset_dicts = combine_dicts(train_location, val_fold)
trees_metadata = MetadataCatalog.get(f"{name}_train")

# Function to normalize and convert multi-band image to RGB if needed
def prepare_image_for_visualization(image):
    """Convert an image array into a 3-channel uint8 image for display.

    The first three bands are each min-max normalised to the 0-255 range
    independently and stacked along the last axis. Images with more than three
    bands use only the first three. Images with fewer than three bands (including
    a 2-D single-band array) are padded by repeating their last band.

    Args:
        image: Array in (height, width, bands) layout, or a 2-D (height, width)
            array for a single band.

    Returns:
        A (height, width, 3) array of dtype uint8.

    Raises:
        ValueError: If the image has no bands.
    """
    if image.ndim == 2:
        # Treat a plain 2-D array as a single-band image.
        image = image[:, :, np.newaxis]

    n_bands = image.shape[2]
    if n_bands == 0:
        raise ValueError("image has no bands to visualise")

    # Bands 0, 1, 2 when available; otherwise repeat the last available band.
    band_indices = [min(i, n_bands - 1) for i in range(3)]

    channels = [
        cv2.normalize(image[:, :, i], None, 0, 255, cv2.NORM_MINMAX)
        for i in band_indices
    ]
    return np.stack(channels, axis=-1).astype(np.uint8)

# Visualize each image in the dataset
# Brightness gain applied before drawing; set to 1 to show the normalised image as is.
display_gain = 10

for d in dataset_dicts:
    # Read every band and reorder from (bands, height, width) to (height, width, bands).
    with rasterio.open(d["file_name"]) as src:
        img = np.transpose(src.read(), (1, 2, 0))
    img = prepare_image_for_visualization(img)  # 3-channel uint8 in 0-255

    # Apply the gain in float and clip: multiplying the uint8 array directly wraps
    # around modulo 256 and scrambles the picture instead of brightening it.
    brightened = np.clip(
        img[:, :, ::-1].astype(np.float32) * display_gain, 0, 255
    ).astype(np.uint8)

    visualizer = Visualizer(brightened, metadata=trees_metadata, scale=0.5)
    out = visualizer.draw_dataset_dict(d)
    # Reverse the channel order again so the displayed image matches the band order read.
    image = out.get_image()[:, :, ::-1]
    display(Image.fromarray(image))