In [1]:
# imports
import os
import numpy as np
import tensorflow as tf
from google.cloud import storage
import tempfile
# import cv2
# from drought_detection.params import BUCKET_NAME, BUCKET_TRAIN_DATA_PATH


## AVAILABLE BANDWIDTHS

In [2]:
# features = {
#   'B1': tf.FixedLenFeature([], tf.string),    # 0.43 - 0.45 μm Coastal aerosol
#   'B2': tf.FixedLenFeature([], tf.string),    # Blue
#   'B3': tf.FixedLenFeature([], tf.string),    # Green
#   'B4': tf.FixedLenFeature([], tf.string),    # Red
#   'B5': tf.FixedLenFeature([], tf.string),    # Near infrared
#   'B6': tf.FixedLenFeature([], tf.string),    # Shortwave infrared 1
#   'B7': tf.FixedLenFeature([], tf.string),    # Shortwave infrared 2
#   'B8': tf.FixedLenFeature([], tf.string),
#   'B9': tf.FixedLenFeature([], tf.string),
#   'B10': tf.FixedLenFeature([], tf.string),
#   'B11': tf.FixedLenFeature([], tf.string),
#   'label': tf.FixedLenFeature([], tf.int64),
# }

# Get and filter raw satellite data for specific spectral bands

In [4]:

def parse_visual(data):
    '''
    This function filters satellite image data by specific spectral bands (RGB in this case).
    The function loads a batch of satellite images from a list of files
    and parses the satellite image data files for some specific features,
    e.g. spectral bands (B2, B3, B4, see official documentation)

    Input(s): - list of satellite image files (including path, e.g '/data/train/part-r-00000')
    Outputs:  - list of dictionaries of raw satellite data (filtered by spectral band)
    '''
    dataset = tf.data.TFRecordDataset(data)

    iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)

    features = {
        'B2': tf.compat.v1.FixedLenFeature([], tf.string), # blue spectral band
        'B3': tf.compat.v1.FixedLenFeature([], tf.string), # green spectral band
        'B4': tf.compat.v1.FixedLenFeature([], tf.string), # red spectral band
        'label': tf.compat.v1.FixedLenFeature([], tf.int64), # image label (0/1/2/3)
    }

    parsed_sat_imgs = [tf.compat.v1.parse_single_example(data, features) for data in iterator]
    return parsed_sat_imgs


# Transform each raw satellite image into an RGB array with label

In [5]:

def get_img_from_example(parsed_sat_img, intensify=True):
    '''
    This function creates an RGB 3D array in shape 65x65x3 (65x65 pixels). 
    Then it loads, for some bands / features (B4, B3, B2) a batch of satellite
    images, reshapes it from the Tensorflow specific data format into a 2D array of dimension 65x65 pixels.
    Next, it does some preprocessing, e.g. divides each pixel by the maximum value of the pixels, and
    multiplies every pixel by 255. Only if  intensity=True (please double check).
    Otherwise,, if intensify=False, it just multiplies each value in the matrix by 255.
    last, it adds a label (please double check), and returns the image 2D array, as well as the label.

    Input: - a parsed satellite image: Specific Tensorflow format (as dictionary)
    Output(s) - satellite image & its label:
                - rgbArray: tuple of 3D numpy array (shape 65x65x3)
                - label: int32
    '''
    rgbArray = np.zeros((65,65,3), 'uint8')
    for i, band in enumerate(['B4', 'B3', 'B2']):
        band_data = np.frombuffer(parsed_sat_img[band].numpy(), dtype=np.uint8)
        band_data = band_data.reshape(65, 65)
        if intensify:
            band_data = band_data/np.max(band_data)*255 # are we transforming the image from bytes to digital numbers by multiplying by 255?
        else:
            band_data = band_data*255
        rgbArray[..., i] = band_data

    label = tf.cast(parsed_sat_img['label'], tf.int32).numpy()

    return rgbArray, label


# Load images from GCP and process them

In [8]:
# # set bucket parameters
# BUCKET_NAME = 'wagon-data-batch913-drought_detection'
# BUCKET_TRAIN_DATA_PATH = 'data/train/'

## try to load blobs without for loop (simple way)

In [62]:
# simple way

# GCP bucket parameters
project_name='drought-detection'
bucket_name = 'wagon-data-batch913-drought_detection'

# open storage client
storage_client = storage.Client(project=project_name)

# create blobs from the files starting with "part"
blobs = storage_client.list_blobs(bucket_name, prefix='data/train/part', delimiter='/')

In [63]:
# print blob file names
for blob in blobs:
    print(blob.name)

data/train/part-r-00000
data/train/part-r-00001
data/train/part-r-00002
data/train/part-r-00003
data/train/part-r-00004
data/train/part-r-00005
data/train/part-r-00006
data/train/part-r-00007
data/train/part-r-00008
data/train/part-r-00009
data/train/part-r-00010
data/train/part-r-00011
data/train/part-r-00012
data/train/part-r-00013
data/train/part-r-00014
data/train/part-r-00015
data/train/part-r-00016
data/train/part-r-00017
data/train/part-r-00018
data/train/part-r-00019
data/train/part-r-00020
data/train/part-r-00021
data/train/part-r-00022
data/train/part-r-00023
data/train/part-r-00024
data/train/part-r-00025
data/train/part-r-00026
data/train/part-r-00027
data/train/part-r-00028
data/train/part-r-00029
data/train/part-r-00030
data/train/part-r-00031
data/train/part-r-00032
data/train/part-r-00033
data/train/part-r-00034
data/train/part-r-00035
data/train/part-r-00036
data/train/part-r-00037
data/train/part-r-00038
data/train/part-r-00039
data/train/part-r-00040
data/train/part-

In [61]:
images = []

for blob in blobs:
    # transform data format
    parsed_img = parse_visual(blob) # parse satellite image data
    img_sat = get_img_from_example(parsed_img[0]) # convert data to rgbArray with label
    
    # append image to list
    images.append(img_sat)

images

ValueError: ('Iterator has already started', <google.api_core.page_iterator.HTTPIterator object at 0x18f693e80>)

## load blobs in for loop

In [58]:

def get_images_gcp(n=10):
    '''
    This function gets images from the cloud in the correct format.
    The function downloads images into temporary files, does a transformation, and then deletes the temporary file.
    '''
    
    # GCP bucket parameters
    project_name = 'drought-detection'
    bucket_name = 'wagon-data-batch913-drought_detection'

    # open client and get blobs
    storage_client = storage.Client(project=project_name)
    blobs = storage_client.list_blobs(bucket_name, prefix='data/train/part', delimiter='/')
    
    images = []

    for blob in blobs:
        # # create temporary file
        _, temp_local_filename = tempfile.mkstemp()

        # Download file from bucket
        blob.download_to_filename(temp_local_filename)
        # temp_local_filename = blob.download_as_bytes() # tried downloading TFRecords in different format
        
        # Do stuff to file (transform data format)
        parsed_img = parse_visual(temp_local_filename) # parse satellite image data
        img_sat = get_img_from_example(parsed_img[0]) # convert data to rgbArray with label

        # append image to list
        images.append(img_sat)
        
        # # remove temporary file
        os.remove(temp_local_filename)
        
    return np.array(images)


In [59]:
# load images
images = get_images_gcp()

Forbidden: 403 GET https://storage.googleapis.com/download/storage/v1/b/wagon-data-batch913-drought_detection/o/data%2Ftrain%2Fpart-r-00000?generation=1657545621344585&alt=media: helyne@disco-genius-351114.iam.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object.: ('Request failed with status code', 403, 'Expected one of', <HTTPStatus.OK: 200>, <HTTPStatus.PARTIAL_CONTENT: 206>)

In [None]:
# check shape
images.shape