In [None]:
# Copyright 2024 NASA
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Rice mapping in Bhutan with U-Net using high resolution satellite imagery

### This notebook shows an example of counting the sample size from the `tfrecords`

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/SERVIR/servir-aces/blob/main/notebooks/count_sample_size.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/SERVIR/servir-aces/blob/main/notebooks/count_sample_size.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>
<br>
<br>
<br>

This notebook is also available in this GitHub repo: https://github.com/SERVIR/servir-aces. Navigate to the `notebooks` folder.

## Setup environment

In [None]:
!pip install servir-aces

Collecting servir-aces
  Downloading servir_aces-0.0.9-py2.py3-none-any.whl (31 kB)
Collecting apache-beam>=2.38.0 (from servir-aces)
  Downloading apache_beam-2.55.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
Collecting python-dotenv>=1.0.0 (from servir-aces)
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Collecting crcmod<2.0,>=1.7 (from apache-beam>=2.38.0->servir-aces)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting orjson<4,>=3.9.7 (from apache-beam>=2.38.0->servir-aces)
  Downloading orjson-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (141 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.1/141.1 kB[0

In [None]:
!git clone https://github.com/SERVIR/servir-aces

Cloning into 'servir-aces'...
remote: Enumerating objects: 671, done.[K
remote: Counting objects: 100% (99/99), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 671 (delta 42), reused 69 (delta 38), pack-reused 572[K
Receiving objects: 100% (671/671), 2.33 MiB | 4.83 MiB/s, done.
Resolving deltas: 100% (419/419), done.


Now the repo is downloaded. We will create an environment file that points to our training data and customizes parameters for the model. To do this, we make a copy of the `.env.example` file provided.

Under the hood, all the configuration provided via the environment file is parsed into a config object and can be accessed programmatically.

Note that the current version does not expose all the model intricacies through the environment file, but future versions may include those depending on the need.

In [None]:
# Copy the example environment file to `config.env`, the working copy that the later cells edit and load.
!cp servir-aces/.env.example servir-aces/config.env

## Setup config file variables

Now that we have the `config.env` file, we will use it to provide our environment settings and parameters.

Note there are several parameters that can be changed. Let's start by changing the BASEDIR as below.

```
BASEDIR = "/content/"
```

We will download data for this chapter. We will use the `datasets` directory to store the downloaded data. Let's go ahead and create that.

In [None]:
!mkdir /content/datasets

Let's go ahead and download the datasets for which we need to calculate the number of samples. They can be found in Google Cloud Storage, and we will use `gsutil` to copy them into our workspace. Each dataset folder has `training`, `testing`, and `validation` subdirectories. Let's start by downloading these datasets into our workspace.

In [None]:
# Copy everything under gs://dl-book/chapter-1/ into /content/datasets
# (-m: parallel transfers, -r: recurse into sub-directories).
!gsutil -m cp -r gs://dl-book/chapter-1/* /content/datasets

Copying gs://dl-book/chapter-1/.DS_Store...
Copying gs://dl-book/chapter-1/dnn_planet_wo_indices/testing/testing.tfrecord.gz...
/ [0 files][    0.0 B/  6.0 KiB]                                                / [0 files][    0.0 B/ 58.1 KiB]                                                Copying gs://dl-book/chapter-1/dnn_planet_wo_indices/training/training.tfrecord.gz...
/ [0 files][    0.0 B/410.4 KiB]                                                Copying gs://dl-book/chapter-1/images/image_202100000.tfrecord.gz...
/ [0 files][    0.0 B/ 50.8 MiB]                                                Copying gs://dl-book/chapter-1/images/image_202100003.tfrecord.gz...
/ [0 files][    0.0 B/ 96.7 MiB]                                                Copying gs://dl-book/chapter-1/images/image_202100002.tfrecord.gz...
/ [0 files][    0.0 B/143.8 MiB]                                                Copying gs://dl-book/chapter-1/images/image_202100005.tfrecord.gz...
/ [0 files][    0.0 B/14

We will use the `unet_256x256_planet_wo_indices` dataset inside the `datasets` folder for this exercise. Let's go ahead and change our DATADIR in the `config.env` file as below.

```
DATADIR = "datasets/unet_256x256_planet_wo_indices"
```

These datasets have RGBN from the Planetscope mosaic. Since we are trying to map the rice fields, we use growing season and pre-growing season information. Thus, we have 8 optical bands, namely `red_before`, `green_before`, `blue_before`, `nir_before`, `red_during`, `green_during`, `blue_during`, and `nir_during`. In addition, you can use the `USE_ELEVATION` and `USE_S1` configs to include the topographic and radar information. Since we are currently not including these, we won't be setting these config values. Similarly, these datasets are tiled to 256x256 pixels, so let's also change that.

```
# For model training, USE_ELEVATION extends FEATURES with "elevation" & "slope"
# USE_S1 extends FEATURES with "vv_asc_before", "vh_asc_before", "vv_asc_during", "vh_asc_during",
# "vv_desc_before", "vh_desc_before", "vv_desc_during", "vh_desc_during"
# In case these are not useful and you have other bands in your training data, you can do set
# USE_ELEVATION and USE_S1 to False and update FEATURES to include needed bands
USE_ELEVATION = False
USE_S1 = False

PATCH_SHAPE = (256, 256)
```

Next, we need to calculate the size of the training, testing and validation datasets. We happen to know these sizes beforehand (shown below), but let's use the functionality that `aces` provides to calculate them.

```
# Sizes of the training and evaluation datasets.
TRAIN_SIZE = 8531
TEST_SIZE = 1222
VAL_SIZE = 2404
```

## Update the config file programmatically

Let's make a dictionary so we can change these config settings programmatically.

In [None]:
# Colab form fields (the `# @param` markers). Every value is kept as a string;
# these are the overrides that are collected into `config_settings` and written to config.env.
BASEDIR = "/content/" # @param {type:"string"}
DATADIR = "datasets/unet_256x256_planet_wo_indices" # @param {type:"string"}

USE_ELEVATION = "False" # @param {type:"string"}
USE_S1 = "False" # @param {type:"string"}
PATCH_SHAPE = "(256, 256)" # @param {type:"string"}

BATCH_SIZE = "32" # @param {type:"string"}


In [None]:
# Map each env-file key to the override value chosen above.
config_settings = dict(
    BASEDIR=BASEDIR,
    DATADIR=DATADIR,
    USE_ELEVATION=USE_ELEVATION,
    USE_S1=USE_S1,
    PATCH_SHAPE=PATCH_SHAPE,
    BATCH_SIZE=BATCH_SIZE,
)


In [None]:
import dotenv

config_file = "servir-aces/config.env"

# Write every override into the env file, one key at a time.
for config_key, config_value in config_settings.items():
    dotenv.set_key(
        dotenv_path=config_file,
        key_to_set=config_key,
        value_to_set=config_value,
    )


## Load config file variables

In [None]:
from aces import Config, DataProcessor

In [None]:
# Parse the edited env file into a Config object; the resolved BASEDIR/DATADIR and the
# feature/label lists are printed as the cell output.
# NOTE(review): `override=True` presumably lets values from the file replace variables already
# set in the environment — confirm against the aces.Config implementation.
config_file = "/content/servir-aces/config.env"
config = Config(config_file, override=True)

BASEDIR: /content
DATADIR: /content/datasets/unet_256x256_planet_wo_indices
using features: ['red_before', 'green_before', 'blue_before', 'nir_before', 'red_during', 'green_during', 'blue_during', 'nir_during']
using labels: ['class']


Most of the config in the `config.env` is now available via the config instance. Let's check a few of them here.

In [None]:
# Display a few parsed settings: the training directory, the batch size and the feature list.
config.TRAINING_DIR, config.BATCH_SIZE, config.FEATURES

(PosixPath('/content/datasets/unet_256x256_planet_wo_indices/training'),
 32,
 ['red_before',
  'green_before',
  'blue_before',
  'nir_before',
  'red_during',
  'green_during',
  'blue_during',
  'nir_during'])

## Calculate the number of records

Use the `calculate_n_samples` static function of the `DataProcessor` class to get the number of records for each split. You can provide additional parameters (`PRINT_DATASET`) as well.

In [None]:
%%time
additional_config = {
    "PRINT_DATASET": True
}
n_training_records, n_testing_records, n_validation_records = DataProcessor.calculate_n_samples(**{**config.__dict__, **additional_config})


Training
inputs: float32 (256, 256, 8)
tf.Tensor(
[[[0.0533   0.060125 0.033475 ... 0.054225 0.02415  0.337825]
  [0.05275  0.059825 0.033125 ... 0.05405  0.022475 0.341325]
  [0.06195  0.06465  0.0364   ... 0.05005  0.02265  0.251175]
  ...
  [0.02165  0.022425 0.0111   ... 0.024725 0.00965  0.144825]
  [0.022525 0.023275 0.011775 ... 0.0275   0.010575 0.156375]
  [0.0181   0.025125 0.0115   ... 0.0257   0.00965  0.145225]]

 [[0.04455  0.054675 0.032125 ... 0.04935  0.02175  0.300325]
  [0.0435   0.054075 0.0313   ... 0.04645  0.0205   0.292875]
  [0.048825 0.057775 0.0327   ... 0.04545  0.020725 0.268675]
  ...
  [0.0221   0.024925 0.0136   ... 0.025375 0.0117   0.12375 ]
  [0.018125 0.0269   0.012975 ... 0.02125  0.009275 0.118775]
  [0.020975 0.023875 0.012175 ... 0.023875 0.0129   0.123425]]

 [[0.0453   0.05215  0.032375 ... 0.0502   0.02265  0.276275]
  [0.045575 0.05125  0.032075 ... 0.0467   0.022625 0.267225]
  [0.052175 0.055175 0.032475 ... 0.045325 0.021275 0.277025]
  ..

Instructions for updating:
Use `tf.data.Dataset.ignore_errors` instead.


CPU times: user 3min 32s, sys: 10.4 s, total: 3min 43s
Wall time: 3min 33s


In [None]:
# Report the three split sizes, one per line.
print(
    f"no of training records: {n_training_records}",
    f"no of testing records: {n_testing_records}",
    f"no of validation records: {n_validation_records}",
    sep="\n",
)

no of training records: 8531
no of testing records: 1222
no of validation records: 2404
