In [1]:
import torchvision
import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import plotly.express as px

In [2]:
# Root folder holding the three SelvaBox splits saved with `save_to_disk`.
# Change this single value if the data lives somewhere else.
SELVABOX_DIR = "../data/selvabox"

# Load the splits in train / test / validation order. keep_in_memory=False is
# passed through unchanged from the original calls.
train_data, test_data, val_data = (
    datasets.load_from_disk(f"{SELVABOX_DIR}/{split_name}/", keep_in_memory=False)
    for split_name in ("train", "test", "validation")
)

Loading dataset from disk:   0%|          | 0/34 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/24 [00:00<?, ?it/s]

In [3]:
# Display the training split summary (feature names and row count).
train_data

Dataset({
    features: ['height', 'width', 'fold', 'raster_name', 'location', 'image', 'tile_name', 'annotations', 'tile_metadata'],
    num_rows: 585
})

In [4]:
# Display the test split summary (feature names and row count).
test_data

Dataset({
    features: ['height', 'width', 'fold', 'raster_name', 'location', 'image', 'tile_name', 'annotations', 'tile_metadata'],
    num_rows: 1477
})

In [5]:
# Display the validation split summary (feature names and row count).
val_data

Dataset({
    features: ['height', 'width', 'fold', 'raster_name', 'location', 'image', 'tile_name', 'annotations', 'tile_metadata'],
    num_rows: 387
})

In [6]:
# Report the height and width stored on the first row of each split.
# Only row 0 is read, so this says nothing about the remaining rows.
for split_label, split in (
    ("Training", train_data),
    ("Testing", test_data),
    ("Validation", val_data),
):
    first_height = split["height"][0]
    first_width = split["width"][0]
    print(f"{split_label} data image dimensions: {first_height} x {first_width}")

Training data image dimensions: 3555 x 3555
Testing data image dimensions: 1777 x 1777
Validation data image dimensions: 1777 x 1777


In [7]:
# Number of rows in the training split.
len(train_data)

585

In [8]:
# Take the first training row as a dict for field-by-field inspection.
sample = train_data[0]

In [9]:
# List the field names available on a single row.
sample.keys()

dict_keys(['height', 'width', 'fold', 'raster_name', 'location', 'image', 'tile_name', 'annotations', 'tile_metadata'])

In [10]:
# Show the type and a short preview of every field on the sample row.
# The text is capped because the `annotations` value is a very long nested
# structure that would otherwise flood the cell output.
PREVIEW_CHARS = 200

for key, value in sample.items():
    preview = str(value)
    ellipsis = "..." if len(preview) > PREVIEW_CHARS else ""
    print(f"{key}: {type(value)} - {preview[:PREVIEW_CHARS]}{ellipsis}")

height: <class 'int'> - 3555
width: <class 'int'> - 3555
fold: <class 'str'> - train
raster_name: <class 'str'> - 20230525_tbslake_m3e_rgb
location: <class 'str'> - ecuador_tiputini
image: <class 'PIL.TiffImagePlugin.TiffImageFile'> - <PIL.TiffImagePlugin.TiffImageFile image mode=RGBA size=3555x3555 at 0x193CC87D1D0>
tile_name: <class 'str'> - 20230525_tbslake_m3e_rgb_tile_train_3555_gr0p045_0_1777.tif
annotations: <class 'dict'> - {'bbox': [[2930.151310094632, 2180.757164001465, 107.60720480792224, 113.03430035710335], [3432.975268027745, 2332.2232190966606, 112.52794121392071, 131.95229616761208], [3059.6135603953153, 1603.664231389761, 255.30170987639576, 325.27222537994385], [3493.773807953112, 2151.9102770090103, 61.22619204688817, 132.41821643710136], [2904.3439283324406, 1500.690564841032, 316.0675247833133, 296.3541029393673], [2816.0766307553276, 1902.6750538647175, 129.87280925828964, 145.46828186511993], [2735.8820028621703, 1615.7467215061188, 126.90226314309984, 133.183913

In [11]:
# List the field names inside the sample's `annotations` dict.
print(sample["annotations"].keys())

dict_keys(['bbox', 'segmentation', 'area', 'iscrowd', 'is_rle_format', 'category'])


In [12]:
# Show the type and a short preview of every annotation field of the sample.
# Each field can hold one entry per annotated object, so the text is capped
# to keep the cell output readable.
PREVIEW_CHARS = 200

for key, value in sample['annotations'].items():
    preview = str(value)
    ellipsis = "..." if len(preview) > PREVIEW_CHARS else ""
    print(f"{key}: {type(value)} - {preview[:PREVIEW_CHARS]}{ellipsis}")

bbox: <class 'list'> - [[2930.151310094632, 2180.757164001465, 107.60720480792224, 113.03430035710335], [3432.975268027745, 2332.2232190966606, 112.52794121392071, 131.95229616761208], [3059.6135603953153, 1603.664231389761, 255.30170987639576, 325.27222537994385], [3493.773807953112, 2151.9102770090103, 61.22619204688817, 132.41821643710136], [2904.3439283324406, 1500.690564841032, 316.0675247833133, 296.3541029393673], [2816.0766307553276, 1902.6750538647175, 129.87280925828964, 145.46828186511993], [2735.8820028621703, 1615.7467215061188, 126.90226314309984, 133.1839136481285], [2719.576011052355, 2784.149450659752, 242.16928306780756, 215.45600160956383], [2168.5565337734297, 1674.8417764008045, 104.14687456190586, 122.09993535280228], [1825.2468949714676, 2603.0990389585495, 131.11514009721577, 170.74338164925575], [3236.428411923349, 2632.7522292733192, 318.5715880766511, 498.5364902253441], [2111.465094540268, 2937.6422178149223, 285.2175970058888, 284.4903998076916], [1958.3941

In [13]:
# Per-tile metadata records (one dict per row) and a tile_name -> annotations map.
samples_list = []
samples_annotations = {}

# Scalar fields copied from each row into its metadata record.
METADATA_KEYS = ("height", "width", "fold", "raster_name", "location", "tile_name")

# This loop never reads the `image` field, so drop that column before
# iterating; otherwise every row also materializes its image for nothing.
# `remove_columns` returns a new dataset, so `train_data` itself is untouched.
train_rows = train_data.remove_columns("image")

# The loop variable is `row` (not `sample`) so that the `sample` inspected in
# the earlier cells is not silently replaced by the last training row.
for row in tqdm(train_rows):
    samples_list.append({key: row[key] for key in METADATA_KEYS})
    samples_annotations[row["tile_name"]] = row["annotations"]

  0%|          | 0/585 [00:00<?, ?it/s]

In [14]:
# One row per tile, columns taken from the keys of each metadata record.
metadata_df = pd.DataFrame(samples_list)
# One row per tile, indexed by tile_name; columns are the annotation fields.
annotations_df = pd.DataFrame.from_dict(samples_annotations, orient='index')

In [15]:
# Preview the first rows of the per-tile metadata table.
metadata_df.head()

Unnamed: 0,height,width,fold,raster_name,location,tile_name
0,3555,3555,train,20230525_tbslake_m3e_rgb,ecuador_tiputini,20230525_tbslake_m3e_rgb_tile_train_3555_gr0p0...
1,3555,3555,train,20230525_tbslake_m3e_rgb,ecuador_tiputini,20230525_tbslake_m3e_rgb_tile_train_3555_gr0p0...
2,3555,3555,train,20230525_tbslake_m3e_rgb,ecuador_tiputini,20230525_tbslake_m3e_rgb_tile_train_3555_gr0p0...
3,3555,3555,train,20230525_tbslake_m3e_rgb,ecuador_tiputini,20230525_tbslake_m3e_rgb_tile_train_3555_gr0p0...
4,3555,3555,train,20230525_tbslake_m3e_rgb,ecuador_tiputini,20230525_tbslake_m3e_rgb_tile_train_3555_gr0p0...


In [16]:
# Count how many tiles share each recorded height.
metadata_df['height'].value_counts()

height
3555    585
Name: count, dtype: int64

In [17]:
# Count how many tiles share each recorded width.
metadata_df['width'].value_counts()

width
3555    585
Name: count, dtype: int64

In [18]:
# Count tiles per `fold` value found in the training split.
metadata_df['fold'].value_counts()

fold
train    585
Name: count, dtype: int64

In [19]:
# Count tiles per source raster.
metadata_df['raster_name'].value_counts()

raster_name
20231018_terrafirme_m3e_rgb                 121
20240131_zf2campirana_m3m_rgb               117
20170810_transectotoni_mavicpro_rgb          88
20231018_pantano_m3e_rgb                     65
20231208_asforestsouth2_m3m_rgb              58
20231018_inundated_m3e_rgb                   38
20231207_asnortheast_amsunclouds_m3m_rgb     38
20231207_asnorthnorth_pmclouds_m3m_rgb       34
20230525_tbslake_m3e_rgb                     14
20230911_sanitower_mini2_rgb                 10
20240130_zf2transectew_m3m_rgb                2
Name: count, dtype: int64

In [20]:
# Count occurrences of each tile name. `samples_annotations` is keyed by
# tile_name, so any count above 1 would mean an annotation entry was overwritten.
metadata_df['tile_name'].value_counts()

tile_name
20230525_tbslake_m3e_rgb_tile_train_3555_gr0p045_0_1777.tif                       1
20240131_zf2campirana_m3m_rgb_tile_train_3555_gr0p045_1777_5331.tif               1
20240131_zf2campirana_m3m_rgb_tile_train_3555_gr0p045_15993_7108.tif              1
20240131_zf2campirana_m3m_rgb_tile_train_3555_gr0p045_15993_8885.tif              1
20240131_zf2campirana_m3m_rgb_tile_train_3555_gr0p045_17770_10662.tif             1
                                                                                 ..
20231018_pantano_m3e_rgb_tile_train_3555_gr0p045_8885_3554.tif                    1
20231018_pantano_m3e_rgb_tile_train_3555_gr0p045_8885_5331.tif                    1
20231018_pantano_m3e_rgb_tile_train_3555_gr0p045_8885_7108.tif                    1
20231018_pantano_m3e_rgb_tile_train_3555_gr0p045_8885_8885.tif                    1
20231207_asnortheast_amsunclouds_m3m_rgb_tile_train_3555_gr0p045_8885_7108.tif    1
Name: count, Length: 585, dtype: int64

In [21]:
# Count tiles per `location` value.
metadata_df['location'].value_counts()

location
ecuador_tiputini    336
panama_aguasalud    130
brazil_zf2          119
Name: count, dtype: int64

In [22]:
# Preview the first rows of the per-tile annotations table.
annotations_df.head()

Unnamed: 0,bbox,segmentation,area,iscrowd,is_rle_format,category
20230525_tbslake_m3e_rgb_tile_train_3555_gr0p045_0_1777.tif,"[[2930.151310094632, 2180.757164001465, 107.60...",,"[12157.803592088228, 14829.145932770049, 82987...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,"[tree, tree, tree, tree, tree, tree, tree, tre..."
20230525_tbslake_m3e_rgb_tile_train_3555_gr0p045_0_3554.tif,"[[2930.151310094632, 403.75716400146484, 107.6...",,"[12157.803592088228, 14829.145932770049, 63483...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,"[tree, tree, tree, tree, tree, tree, tree, tre..."
20230525_tbslake_m3e_rgb_tile_train_3555_gr0p045_0_5331.tif,"[[3018.6747251050547, 1840.1160660684109, 265....",,"[71907.09806858795, 100821.94394369527, 57799....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,"[tree, tree, tree, tree, tree, tree, tree, tre..."
20230525_tbslake_m3e_rgb_tile_train_3555_gr0p045_0_7108.tif,"[[3018.6747251050547, 63.11606606841087, 265.1...",,"[71907.09806858795, 38930.06399058635, 100990....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,"[tree, tree, tree, tree, tree, tree, tree, tre..."
20230525_tbslake_m3e_rgb_tile_train_3555_gr0p045_1777_0.tif,"[[1282.6135603953153, 3380.664231389761, 255.2...",,"[44471.417920004984, 235725.05219921813, 74762...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,"[tree, tree, tree, tree, tree, tree, tree, tre..."


In [23]:
# Flatten the per-tile "area" lists into one list covering every annotation.
trees_area = []

for tile_annotations in tqdm(samples_annotations.values(), total=len(samples_annotations)):
    trees_area += tile_annotations["area"]

  0%|          | 0/585 [00:00<?, ?it/s]

In [24]:
# Total number of annotation areas collected across all training tiles.
len(trees_area)

232071

In [25]:
# Convert the collected areas to an array. `asarray` returns the input as-is
# when it is already an ndarray, so re-running this cell does not copy it again.
trees_area = np.asarray(trees_area)

# Pass the values as `x` explicitly: this keeps the figure a plain single-series
# histogram (no generic "variable" legend) and lets `labels` rename the x axis
# and hover text.
fig = px.histogram(
    x=trees_area,
    nbins=100,
    title="Distribution of Tree Areas",
    labels={"x": "Area (in pixels)"},
)

# The y axis of a histogram is an aggregated count rather than a data column,
# so its title is set on the layout instead of through `labels`.
fig.update_layout(bargap=0.1, yaxis_title="Number of Trees")
fig.show()