In [None]:
# Copyright 2024 NASA
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Rice mapping in Bhutan with U-Net using high resolution satellite imagery

### This notebook shows an example of counting the sample size from the `tfrecords`

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/SERVIR/servir-aces/blob/main/notebooks/count_sample_size.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/SERVIR/servir-aces/blob/main/notebooks/count_sample_size.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>
<br>
<br>
<br>

This notebook is also available in this GitHub repo: https://github.com/SERVIR/servir-aces. Navigate to the `notebooks` folder.

## Setup environment

In [None]:
!pip install servir-aces

Collecting servir-aces
  Downloading servir_aces-0.0.6-py2.py3-none-any.whl (31 kB)
Collecting apache-beam>=2.38.0 (from servir-aces)
  Downloading apache_beam-2.55.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
Collecting python-dotenv>=1.0.0 (from servir-aces)
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Collecting crcmod<2.0,>=1.7 (from apache-beam>=2.38.0->servir-aces)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting orjson<4,>=3.9.7 (from apache-beam>=2.38.0->servir-aces)
  Downloading orjson-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.8/144.8 kB[0

In [None]:
!git clone https://github.com/SERVIR/servir-aces

Cloning into 'servir-aces'...
remote: Enumerating objects: 680, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 680 (delta 64), reused 62 (delta 49), pack-reused 573[K
Receiving objects: 100% (680/680), 898.59 KiB | 5.25 MiB/s, done.
Resolving deltas: 100% (384/384), done.


Now the repo is downloaded. We will create an environment file that points to our training data and lets us customize parameters for the model. To do this, we make a copy of the `.env.example` file provided.

Under the hood, all the configuration provided via the environment file is parsed into a config object and can be accessed programmatically.

Note that the current version does not expose all the model intricacies through the environment file, but future versions may include them depending on the need.

In [None]:
!cp servir-aces/.env.example servir-aces/config.env

Okay, now we have the `config.env` file, we will use this to provide our environments and parameters.

Note there are several parameters that can be changed. Let's start by changing the BASEDIR as below.

```
BASEDIR = "/content/"
```

We will download data for this chapter. We will use `datasets` dir to download the data. Let's go ahead and create that.

In [None]:
!mkdir /content/datasets

Let's go ahead and download the datasets for which we need to calculate the number of samples. They are hosted on Google Cloud Storage, and we will use `gsutil` to copy them into our workspace. Each dataset folder has `training`, `testing`, and `validation` subdirectories. Let's start by downloading these datasets into our workspace.

In [None]:
# Recursively copy everything under `gs://dl-book/chapter-1/` into
# `/content/datasets`. This fetches every dataset stored there, not only the
# U-Net one used in the rest of this notebook.
!gsutil -m cp -r gs://dl-book/chapter-1/* /content/datasets

Copying gs://dl-book/chapter-1/dnn_planet_wo_indices_w_s1_w_elevation/testing/testing.tfrecord.gz...
/ [0 files][    0.0 B/105.2 KiB]                                                Copying gs://dl-book/chapter-1/dnn_planet_wo_indices_w_s1_w_elevation/training/training.tfrecord.gz...
/ [0 files][    0.0 B/813.2 KiB]                                                Copying gs://dl-book/chapter-1/unet_256x256_planet_wo_indices_w_s1_w_elevation/testing/testing-00001-of-00038.tfrecord.gz...
/ [0 files][    0.0 B/ 90.9 MiB]                                                Copying gs://dl-book/chapter-1/dnn_planet_wo_indices_w_s1_w_elevation/validation/validation.tfrecord.gz...
/ [0 files][    0.0 B/ 91.1 MiB]                                                Copying gs://dl-book/chapter-1/unet_256x256_planet_wo_indices_w_s1_w_elevation/testing/testing-00000-of-00038.tfrecord.gz...
/ [0 files][    0.0 B/181.2 MiB]                                                Copying gs://dl-book/chapter-1/une

We will use the `unet_256x256_planet_wo_indices_w_s1_w_elevation` dataset inside the `datasets` folder for this exercise. Let's go ahead and change our DATADIR in the `config.env` file as below.

```
DATADIR = "datasets/unet_256x256_planet_wo_indices_w_s1_w_elevation"
```

These datasets have RGBN from Planetscope mosaic. Since we are trying to map the rice fields, we use growing season and pre-growing season information. Thus, we have 8 optical bands, namely `red_before`, `green_before`, `blue_before`, `nir_before`, `red_during`, `green_during`, `blue_during`, and `nir_during`. In addition, you can use the `USE_ELEVATION` and `USE_S1` config to include the topographic and radar information. These datasets have topographic and radar features, so we set these config values to True. Similarly, these datasets are tiled to 256x256 pixels, so let's also change that.

```
# For model training, USE_ELEVATION extends FEATURES with "elevation" & "slope"
# USE_S1 extends FEATURES with "vv_asc_before", "vh_asc_before", "vv_asc_during", "vh_asc_during",
# "vv_desc_before", "vh_desc_before", "vv_desc_during", "vh_desc_during"
# In case these are not useful and you have other bands in your training data, you can do set
# USE_ELEVATION and USE_S1 to False and update FEATURES to include needed bands
USE_ELEVATION = True
USE_S1 = True

PATCH_SHAPE = (256, 256)
```

Next, we need to calculate the size of the training, testing and validation datasets. In this case we already know the sizes beforehand (shown below), but let's use the functionality that `aces` provides to calculate them.

```
# Sizes of the training and evaluation datasets.
TRAIN_SIZE = 7700
TEST_SIZE = 1213
VAL_SIZE = 2404
```

In [None]:
from aces import Config
from aces import DataProcessor

In [None]:
# Location of the environment file that was copied from `.env.example`.
config_file = "/content/servir-aces/config.env"
# Build the config object from that file; its settings are then available as
# attributes (e.g. `config.TRAINING_DIR`).
config = Config(config_file)

BASEDIR: /content
DATADIR: /content/datasets/unet_256x256_planet_wo_indices_w_s1_w_elevation
using features: ['red_before', 'green_before', 'blue_before', 'nir_before', 'red_during', 'green_during', 'blue_during', 'nir_during', 'elevation', 'slope', 'vv_asc_before', 'vh_asc_before', 'vv_asc_during', 'vh_asc_during', 'vv_desc_before', 'vh_desc_before', 'vv_desc_during', 'vh_desc_during']
using labels: ['class']


Most of the config in the `config.env` is now available via the config instance. Let's check a few of them here.

In [None]:
# Display two of the settings exposed as attributes on the config object.
config.TRAINING_DIR, config.BATCH_SIZE

(PosixPath('/content/datasets/unet_256x256_planet_wo_indices_w_s1_w_elevation/training'),
 64)

In [None]:
%%time
additional_config = {
    "PRINT_DATASET": True
}
n_training_records, n_testing_records, n_validation_records = DataProcessor.calculate_n_samples(**{**config.__dict__, **additional_config})


Training
inputs: float32 (256, 256, 18)
tf.Tensor(
[[[0.04035    0.0461     0.0259     ... 0.377637   0.33858082 0.34278086]
  [0.03675    0.044525   0.02385    ... 0.39470226 0.34890598 0.36348763]
  [0.0288     0.039725   0.022375   ... 0.40031958 0.3710302  0.3598775 ]
  ...
  [0.029625   0.044825   0.02755    ... 0.39187974 0.3469983  0.35823953]
  [0.027825   0.0424     0.0271     ... 0.3977692  0.35484934 0.3561558 ]
  [0.029975   0.046125   0.0272     ... 0.3897629  0.34793046 0.3612652 ]]

 [[0.0328     0.0432     0.0236     ... 0.3565247  0.326676   0.3392466 ]
  [0.0273     0.039525   0.0219     ... 0.3696133  0.3394726  0.3389661 ]
  [0.0321     0.04045    0.02235    ... 0.38849342 0.3522428  0.34435618]
  ...
  [0.029825   0.0453     0.02725    ... 0.38274664 0.3548652  0.3568154 ]
  [0.0305     0.04465    0.0276     ... 0.3804215  0.3527327  0.35734817]
  [0.03075    0.047175   0.027475   ... 0.3873024  0.3411553  0.35897118]]

 [[0.029575   0.04125    0.02265    ... 0.356

Instructions for updating:
Use `tf.data.Dataset.ignore_errors` instead.


CPU times: user 7min 15s, sys: 22.7 s, total: 7min 38s
Wall time: 7min 57s


In [None]:
# Report the three counts, one per line.
print(
    f"no of training records: {n_training_records}",
    f"no of testing records: {n_testing_records}",
    f"no of validation records: {n_validation_records}",
    sep="\n",
)

no of training records: 7700
no of testing records: 1213
no of validation records: 2404
