# Dataset_v0

Building the first version of the dataset for deep learning.

- Satellite: 
 - NIR
 - R
 - G
 - B
 - NDVI
 - SWIR
 - W
 - THETA (target)
- Dataframe
 - inSitu obs (ground truth obs)

## 1 - Python packages

In [1]:
# MODULE                                             # DESCRIPTION
                                      # bidirectional UTM-WGS84 converter
import rasterio as rs                                # raster and geospatial data proc.
import matplotlib.pyplot as plt                      # create visualizations
import datetime                                      # datetime manipulation
import glob                                          # unix pathname expansion
import colorcet as cc                                # colormaps for datashader 
import xarray as xr                                  # efficient ND arrays manipulation
import rioxarray                                     # rasterio xarray extension
import pickle                                        # load/save pickle datasets
import os                                            # miscellaneous OS interfaces
from pathlib import Path                             #                                      #
import shutil
import re
import glob

## 2 - Satellite Data

### 2.1 - Copying raw files

In [9]:
# Root folder of the assembled dataset.
BASE_DIR = "./dataset_v0/"

# Source folders that later cells read from (BOA rasters and NDVI rasters).
BASE_DIR_BOA = "./sen2r/out/BOA2"
BASE_DIR_NDVI_old = "./sen2r/indices/NDVI"

# One destination sub-folder per layer of the dataset.
BASE_DIR_SCL = "./dataset_v0/SCL"
BASE_DIR_R = "./dataset_v0/R"
BASE_DIR_G = "./dataset_v0/G"
BASE_DIR_B = "./dataset_v0/B"
BASE_DIR_NIR = "./dataset_v0/NIR"
BASE_DIR_NDVI = "./dataset_v0/NDVI"
BASE_DIR_SWIR = "./dataset_v0/SWIR"
BASE_DIR_W = "./dataset_v0/W"
BASE_DIR_THETA = "./dataset_v0/THETA"

# Create every destination folder up front; folders that already exist are
# left untouched (exist_ok=True).
for _dataset_dir in (
    BASE_DIR,
    BASE_DIR_R,
    BASE_DIR_G,
    BASE_DIR_B,
    BASE_DIR_NIR,
    BASE_DIR_NDVI,
    BASE_DIR_SWIR,
    BASE_DIR_W,
    BASE_DIR_THETA,
    BASE_DIR_SCL,
):
    Path(_dataset_dir).mkdir(parents=True, exist_ok=True)

The W, theta and NDVI bands are already computed as independent files, so we only need to copy them to the new folders:

In [3]:
# Shell escape: recursively copy everything under ./W_maps into the dataset's W folder.
! cp -r ./W_maps/* ./dataset_v0/W

In [4]:
# Shell escape: recursively copy everything under ./theta_maps into the dataset's THETA folder.
! cp -r ./theta_maps/* ./dataset_v0/THETA/

In [5]:
# Shell escape: recursively copy everything under ./sen2r/out/SCL_res10 into the dataset's SCL folder.
! cp -r ./sen2r/out/SCL_res10/* ./dataset_v0/SCL/

In [6]:
# Every regular file in the W folder drives one iteration: its name (minus the
# trailing "W.tif") is the common prefix of the matching NDVI and BOA rasters.
w_files = [f for f in os.listdir(BASE_DIR_W) if os.path.isfile(os.path.join(BASE_DIR_W, f))]

# (BOA band index as passed to rasterio's read(), destination folder, file-name tag).
# NOTE(review): the band indices are kept exactly as they were; confirm them
# against the band order of the BOA product.
boa_bands = (
    (4, BASE_DIR_R, "R"),
    (3, BASE_DIR_G, "G"),
    (2, BASE_DIR_B, "B"),
    (8, BASE_DIR_NIR, "NIR"),
    (11, BASE_DIR_SWIR, "SWIR"),
)

for idx, file in enumerate(w_files):

    try:

        now = datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")
        print('[%s] : processing file (%s/%s) %s... \n' % (now, idx, len(w_files), file))

        # Drop the last five characters ("W.tif" for names such as
        # "S2A2A_20191211_041_Walnut-Gulch_W.tif") to get the shared prefix.
        basename_fp = file[:-5]

        # NDVI: already a single-band file, so it is copied unchanged.
        old_ndvi_fp = os.path.join(BASE_DIR_NDVI_old, basename_fp + "NDVI_10.tif")
        new_ndvi_fp = os.path.join(BASE_DIR_NDVI, basename_fp + "NDVI_10.tif")
        shutil.copyfile(old_ndvi_fp, new_ndvi_fp)

        # BOA - R,G,B,NIR,SWIR: split the multi-band raster into one file per band.
        boa_fp = os.path.join(BASE_DIR_BOA, basename_fp + "BOA_10.tif")

        # The context manager closes the BOA dataset even when a read or a
        # write below raises, so a failing file no longer leaks an open handle.
        with rs.open(boa_fp) as boa:

            # Read every band before writing anything, so a missing band
            # aborts the file without leaving a partial set of outputs.
            band_arrays = [(out_dir, tag, boa.read(band_idx))
                           for band_idx, out_dir, tag in boa_bands]

            # Output files reuse the BOA metadata but hold a single band.
            out_meta = boa.meta.copy()
            out_meta["count"] = 1

            for out_dir, tag, data in band_arrays:
                out_fp = os.path.join(out_dir, basename_fp + tag + "_10.tif")
                with rs.open(out_fp, mode='w', **out_meta) as dst:
                    dst.write(data, 1)

    except Exception as e:
        # Best effort: report the failing file and carry on with the next one.
        print("An error occurred while processing data for sensor %s " % file)
        print("Original message: %s " % e)

[2022/05/06 12:16:29] : processing file (0/71) S2A2A_20191211_041_Walnut-Gulch_W.tif... 

[2022/05/06 12:16:29] : processing file (1/71) S2A2A_20190614_041_Walnut-Gulch_W.tif... 

[2022/05/06 12:16:30] : processing file (2/71) S2B2A_20190510_041_Walnut-Gulch_W.tif... 

[2022/05/06 12:16:32] : processing file (3/71) S2A2A_20190823_041_Walnut-Gulch_W.tif... 

[2022/05/06 12:16:33] : processing file (4/71) S2B2A_20190818_041_Walnut-Gulch_W.tif... 

[2022/05/06 12:16:33] : processing file (5/71) S2A2A_20190704_041_Walnut-Gulch_W.tif... 

[2022/05/06 12:16:35] : processing file (6/71) S2A2A_20191201_041_Walnut-Gulch_W.tif... 

[2022/05/06 12:16:36] : processing file (7/71) S2B2A_20190907_041_Walnut-Gulch_W.tif... 

[2022/05/06 12:16:37] : processing file (8/71) S2B2A_20191216_041_Walnut-Gulch_W.tif... 

[2022/05/06 12:16:39] : processing file (9/71) S2B2A_20190110_041_Walnut-Gulch_W.tif... 

[2022/05/06 12:16:40] : processing file (10/71) S2B2A_20190311_041_Walnut-Gulch_W.tif... 

[2022/05/

### 2.2 - Masking all bands based on SCL

Not all bands need to be masked: W and theta are already masked.

In [7]:
# Band folders whose rasters are iterated by the SCL-masking loop.
band_dirs = [
    BASE_DIR_B,
    BASE_DIR_G,
    BASE_DIR_R,
    BASE_DIR_NIR,
    BASE_DIR_NDVI,
    BASE_DIR_SWIR,
]

def get_date_str_S2_file(filename):
    """Return the date digits embedded in a Sentinel-2 style file name.

    The name must start with ``S2A2A_`` or ``S2B2A_`` followed by a run of
    digits and an underscore, e.g. ``S2A2A_20191211_041_Walnut-Gulch_W.tif``
    yields ``"20191211"``.

    Args:
        filename: bare file name (no directory part) to parse.

    Returns:
        The digit run as a string.

    Raises:
        ValueError: if ``filename`` does not start with the expected prefix.
    """
    m = re.match(r'S2(A|B)2A_([0-9]+)_', filename)
    if m is None:
        # Fail with a message naming the offending file instead of an
        # AttributeError on None.
        raise ValueError("cannot extract a date from file name %r" % (filename,))
    return m.group(2)

In [8]:
# For every band folder, write an SCL-masked copy of each raster into a
# "masked" sub-folder, keeping only pixels whose SCL value is 4, 5 or 6.
for band_dir in band_dirs:
    print('processing band dir %s ... \n' % (band_dir))
    band_dir_masked = os.path.join(band_dir, "masked")
    Path(band_dir_masked).mkdir(parents=True, exist_ok=True)
    print(band_dir)
    # Regular files only, so the "masked" sub-folder itself is never picked up.
    band_files = [f for f in os.listdir(band_dir) if os.path.isfile(os.path.join(band_dir, f))]

    for band_file in band_files:
        try:
            print('--- processing file %s ... \n' % (band_file))
            band_file_date = get_date_str_S2_file(band_file)

            # Look up the SCL raster by date. Sorting makes the choice
            # deterministic when several files match, and an explicit error
            # replaces the opaque "list index out of range" when none does.
            scl_matches = sorted(glob.glob(os.path.join(BASE_DIR_SCL, "*" + band_file_date + "*")))
            if not scl_matches:
                raise FileNotFoundError(
                    "no SCL file found in %s for date %s" % (BASE_DIR_SCL, band_file_date)
                )
            scl_file = scl_matches[0]
            print(scl_file)

            # Keep only the first band of the SCL raster.
            scl = rioxarray.open_rasterio(scl_file, masked=True)
            scl = scl[0]
            scl = scl.rename({'band':'scl'})

            # Keep only the first band of the raster being masked.
            band = rioxarray.open_rasterio(os.path.join(band_dir, band_file), masked=True)
            band = band[0]
            # NOTE(review): renaming 'band' to itself looks like a no-op — confirm and drop.
            band = band.rename({'band':'band'})

            band_masked = band.where(
                                (scl == 4) | # vegetation
                                (scl == 5) | # not_vegetated
                                (scl == 6)   # water
                              )

            band_masked.rio.to_raster(os.path.join(band_dir_masked, band_file))

        except Exception as e:
            # Best effort: report the failing file and carry on with the next one.
            print("An error occurred while processing file %s " % band_file)
            print("Original message: %s " % e)

processing band dir ./dataset_v0/B ... 

./dataset_v0/B
--- processing file S2B2A_20190430_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL/20190430_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190510_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL/20190510_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190204_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL/20190204_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190724_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL/20190724_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20191012_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL/20191012_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20191017_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL/20191017_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190410_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL/20190410_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190219_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL

--- processing file S2B2A_20190530_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL/20190530_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190917_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL/20190917_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20191022_041_Walnut-Gulch_B_10.tif ... 

./dataset_v0/SCL/20191022_SCL_10m_resampled_by_gdal.tif
processing band dir ./dataset_v0/G ... 

./dataset_v0/G
--- processing file S2B2A_20190510_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL/20190510_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190828_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL/20190828_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190415_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL/20190415_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20191231_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL/20191231_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190410_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL

--- processing file S2B2A_20190729_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL/20190729_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190604_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL/20190604_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20191101_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL/20191101_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20191027_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL/20191027_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190425_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL/20190425_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20191002_041_Walnut-Gulch_G_10.tif ... 

./dataset_v0/SCL/20191002_SCL_10m_resampled_by_gdal.tif
processing band dir ./dataset_v0/R ... 

./dataset_v0/R
--- processing file S2A2A_20190614_041_Walnut-Gulch_R_10.tif ... 

./dataset_v0/SCL/20190614_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190724_041_Walnut-Gulch_R_10.tif ... 

./dataset_v0/SCL

--- processing file S2A2A_20190803_041_Walnut-Gulch_R_10.tif ... 

./dataset_v0/SCL/20190803_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190405_041_Walnut-Gulch_R_10.tif ... 

./dataset_v0/SCL/20190405_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190902_041_Walnut-Gulch_R_10.tif ... 

./dataset_v0/SCL/20190902_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20191216_041_Walnut-Gulch_R_10.tif ... 

./dataset_v0/SCL/20191216_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190818_041_Walnut-Gulch_R_10.tif ... 

./dataset_v0/SCL/20190818_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20191126_041_Walnut-Gulch_R_10.tif ... 

./dataset_v0/SCL/20191126_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190130_041_Walnut-Gulch_R_10.tif ... 

./dataset_v0/SCL/20190130_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190105_041_Walnut-Gulch_R_10.tif ... 

./dataset_v0/SCL/20190105_SCL_10m_resampled_by_gdal.tif
processing band 

--- processing file S2B2A_20190420_041_Walnut-Gulch_NIR_10.tif ... 

./dataset_v0/SCL/20190420_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190214_041_Walnut-Gulch_NIR_10.tif ... 

./dataset_v0/SCL/20190214_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190410_041_Walnut-Gulch_NIR_10.tif ... 

./dataset_v0/SCL/20190410_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190331_041_Walnut-Gulch_NIR_10.tif ... 

./dataset_v0/SCL/20190331_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190209_041_Walnut-Gulch_NIR_10.tif ... 

./dataset_v0/SCL/20190209_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190917_041_Walnut-Gulch_NIR_10.tif ... 

./dataset_v0/SCL/20190917_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190907_041_Walnut-Gulch_NIR_10.tif ... 

./dataset_v0/SCL/20190907_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20191116_041_Walnut-Gulch_NIR_10.tif ... 

./dataset_v0/SCL/20191116_SCL_10m_resampled_by_gdal.tif


--- processing file S2B2A_20190629_041_Walnut-Gulch_NDVI_10.tif ... 

./dataset_v0/SCL/20190629_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190803_041_Walnut-Gulch_NDVI_10.tif ... 

./dataset_v0/SCL/20190803_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190604_041_Walnut-Gulch_NDVI_10.tif ... 

./dataset_v0/SCL/20190604_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190808_041_Walnut-Gulch_NDVI_10.tif ... 

./dataset_v0/SCL/20190808_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190525_041_Walnut-Gulch_NDVI_10.tif ... 

./dataset_v0/SCL/20190525_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190316_041_Walnut-Gulch_NDVI_10.tif ... 

./dataset_v0/SCL/20190316_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20191206_041_Walnut-Gulch_NDVI_10.tif ... 

./dataset_v0/SCL/20191206_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20191126_041_Walnut-Gulch_NDVI_10.tif ... 

./dataset_v0/SCL/20191126_SCL_10m_resampled_by_g

--- processing file S2B2A_20190828_041_Walnut-Gulch_SWIR_10.tif ... 

./dataset_v0/SCL/20190828_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190917_041_Walnut-Gulch_SWIR_10.tif ... 

./dataset_v0/SCL/20190917_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190614_041_Walnut-Gulch_SWIR_10.tif ... 

./dataset_v0/SCL/20190614_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190306_041_Walnut-Gulch_SWIR_10.tif ... 

./dataset_v0/SCL/20190306_SCL_10m_resampled_by_gdal.tif
--- processing file S2B2A_20190110_041_Walnut-Gulch_SWIR_10.tif ... 

./dataset_v0/SCL/20190110_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20191101_041_Walnut-Gulch_SWIR_10.tif ... 

./dataset_v0/SCL/20191101_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20191221_041_Walnut-Gulch_SWIR_10.tif ... 

./dataset_v0/SCL/20191221_SCL_10m_resampled_by_gdal.tif
--- processing file S2A2A_20190105_041_Walnut-Gulch_SWIR_10.tif ... 

./dataset_v0/SCL/20190105_SCL_10m_resampled_by_g

In [None]:
# NOTE(review): this cell has no execution count (In [None]) and reads like an
# unfinished draft of an SCL-masking loop. It cannot run as written: `dir` is
# not assigned in the visible notebook (so it resolves to the Python builtin),
# and get_date_file, getSCL_for_date, scl_fp and w are not defined in the
# visible notebook either.
for B, G, R, NIR, NDVI, SWIR in dir: 
    
    for file in dir : 
        date_file = get_date_file(file)
        # NOTE(review): binds the name getSCL_for_date without calling it; the
        # value is overwritten two lines below.
        scl = getSCL_for_date
        
        # Open the SCL raster, keep its first band and rename its coordinates.
        scl = rioxarray.open_rasterio(scl_fp, masked=True)
        scl = scl[0]
        scl = scl.rename({'band':'scl',
                           'x':'lat',
                           'y':'long'})
        
        # Open the band raster and apply the same renaming.
        # NOTE(review): 'band' is renamed to 'scl' here as well — presumably a
        # copy/paste slip; confirm.
        band = rioxarray.open_rasterio(file, masked=True)
        band = band[0]
        band = band.rename({'band':'scl',
                           'x':'lat',
                           'y':'long'})  
        
        # NOTE(review): the mask is applied to `w`, not to `band`.
        band_masked = w.where(
                            (scl == 4) | # vegetation
                            (scl == 5) | # not_vegetated
                            (scl == 6)   # water
                          )
        
        # NOTE(review): writes `band` (unmasked) rather than `band_masked`.
        band.rio.to_raster(os.path.join(dir, file))

## 3 - Dataframe data

In [10]:
# Shell escape: copy the inSitu_dfs_global.pkl pickle into the dataset folder.
! cp ./inSitu_dfs_global.pkl ./dataset_v0/

In [12]:
# Shell escape: copy the inSitu_dfs_global_masked.pkl pickle into the dataset folder.
! cp ./inSitu_dfs_global_masked.pkl ./dataset_v0/

## 4 - Quality checks

In [16]:
# All layer folders of the dataset, checked below.
band_dirs = [
    BASE_DIR_SCL,
    BASE_DIR_R,
    BASE_DIR_G,
    BASE_DIR_B,
    BASE_DIR_NIR,
    BASE_DIR_NDVI,
    BASE_DIR_SWIR,
    BASE_DIR_W,
    BASE_DIR_THETA,
]

# The original cell ended in a bare `map()` call, which raises TypeError
# because map() needs a function and at least one iterable.
# NOTE(review): the intended check is not recoverable from the notebook; as a
# minimal stand-in, report how many regular files sit directly in each folder
# so that mismatched counts between layers are easy to spot.
for band_dir in band_dirs:
    n_files = sum(
        1 for f in os.listdir(band_dir) if os.path.isfile(os.path.join(band_dir, f))
    )
    print('%s : %s files' % (band_dir, n_files))

