# Local / Sagemaker Imports

In [1]:
!pip install rasterio geopandas shapely tensorflow-addons[tensorflow]

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [1]:
import json
import os
from glob import glob

import boto3
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import rasterio as rio
from rasterio.windows import Window
from rasterio.windows import get_data_window
from shapely.geometry import Polygon
from shapely.geometry import box

from inference_predict import *
# import gdal

# Windowing

Define the windowing code used to run inference on 1 km x 1 km areas (or whatever chip size the model was trained on) without having to physically chip the rasters into separate files.

In [2]:
def get_windows(img_dim, patch_size=(240, 240), stride=(240, 240)):
    """Build the grid of rasterio windows used to tile a raster.

    Arguments:
    img_dim: (rows, cols) of the raster, e.g. ``src.shape``.
    patch_size: (height, width) of every window, in pixels.
    stride: (row step, column step) between consecutive window origins.

    Returns:
    A list of ``Window`` objects ordered row by row. Window origins are kept
    up to the last patch-aligned position inside the raster padded by one
    stride, so windows on the bottom/right edge can overhang the raster.
    """
    patch_size = np.array(patch_size)
    stride = np.array(stride)
    img_dim = np.array(img_dim)

    # To take the edges into account, pad each axis by its OWN stride
    # (the column axis previously reused the row stride, which is wrong
    # whenever the stride is not square).
    padded_dim = img_dim + stride

    # Largest origin (per axis) that is still allowed for a window
    max_dim = (padded_dim // patch_size) * patch_size - patch_size

    ys = np.arange(0, img_dim[0], stride[0])
    xs = np.arange(0, img_dim[1], stride[1])

    # All (y, x) origin pairs, y varying slowest
    tlc = np.array(np.meshgrid(ys, xs)).T.reshape(-1, 2)
    tlc = tlc[tlc[:, 0] <= max_dim[0]]
    tlc = tlc[tlc[:, 1] <= max_dim[1]]

    # Window takes (col_off, row_off, width, height)
    return [Window(x, y, patch_size[1], patch_size[0]) for y, x in tlc.astype(int)]

In [3]:
def add_ndvi(data, dtype_1=rio.float32):
    """Append a rescaled NDVI band to a band-first raster array.

    Arguments:
    data: array indexed as (band, row, col). Index 3 is used as the
        near-infrared band and index 2 as the red band.
    dtype_1: floating point dtype used for the NDVI computation.

    Returns:
    A uint16 array with one more band than ``data``; the extra (last) band
    is NDVI shifted from [-1, 1] to [0, 2] and multiplied by ``2**15 - 1``.
    """
    nir = data[3].astype(dtype_1)
    red = data[2].astype(dtype_1)

    # Allow division by zero, but only for this computation: np.errstate
    # restores the previous settings on exit, whereas np.seterr would silence
    # divide/invalid warnings for the rest of the session.
    with np.errstate(divide='ignore', invalid='ignore'):
        ndvi = ((nir - red) / (nir + red)).astype(dtype_1)

    # Rescaling for use in 16bit output
    ndvi = (ndvi + 1) * (2**15 - 1)

    # Add NDVI band to end of array
    rast = np.concatenate((data, [ndvi]), axis=0)

    # NOTE: pixels where nir + red == 0 are NaN at this point; the result of
    # casting NaN to an unsigned integer is platform dependent.
    return rast.astype(rio.uint16)


# Download Model Files

In [5]:
# change URLs to download different models
# NOTE(review): download_model is not defined in this notebook; presumably it comes
# from `from inference_predict import *` and stores the two files locally under the
# names the next cell loads ("model.h5" / "model_weights.h5") — confirm.
model_url = "s3://canopy-production-ml/inference/model_files/model-best.h5"
weights_url = "s3://canopy-production-ml/inference/model_files/model_weights_best.h5"

download_model(model_url,weights_url)

In [4]:
# Build the model object used for inference from the two local .h5 files.
# NOTE(review): load_model is not defined in this notebook; presumably it is provided
# by inference_predict and takes (model file, weights file) — confirm.
model = load_model("model.h5","model_weights.h5") 

2022-09-03 03:16:28.681102: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-09-03 03:16:28.681181: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-172-16-23-61.ec2.internal): /proc/driver/nvidia/version does not exist
2022-09-03 03:16:28.681974: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Set parameters

In [5]:
# change label_list if you have different labels
# The order matters: output_windows looks labels up by the column index of the
# model prediction (label_list[i]), so entry i must be the class of model output i.
label_list = ["Industrial_agriculture","ISL","Mining","Roads","Shifting_cultivation"]

In [6]:
# create list of rasters to run inference on

s3 = boto3.resource('s3')

bucket_name = 'canopy-production-ml'

pc_bucket = s3.Bucket(bucket_name)

prefix = 'full_drc/2019/' # change to whichever directory your rasters are in

# Let S3 do the filtering: only keys that start with `prefix` are listed, instead of
# paging through every object in the bucket and substring-matching each key locally.
rasters_list_2019 = [obj.key for obj in pc_bucket.objects.filter(Prefix=prefix)]

len(rasters_list_2019)

3033

# Run inference

In [7]:
def output_windows(granule_dir,patch_size=100,
                   stride=100,SAVE=False,
                   bands=[1, 2, 3, 4, 5, 6],
                   model=model,
                   predict_thresh=.5,
                   label_list=label_list,
                   current_output=None):

    """
    Runs the model on the rasters in granule_dir and outputs dictionary containing predictions
    Arguments:
    granule_dir: Directory containing granules you want to run inference on
    patch_size: Size (in pixels) of the square window you want to run the model on
    stride: After running inference on one window, how far to move before defining the next window.
    Should be same as "patch_size" unless you have a very good reason for it not to be.
    SAVE: Currently unused; kept so existing calls keep working.
    bands: Which bands to use for the inference (passed to src.read; never modified here)
    model: tensorflow model to use
    predict_thresh: Probability threshold to assign a label
    label_list: List of labels, indexed by the column index of the model prediction
    current_output: if this function is being run concurrently, put your most recent
    output_dict into this argument. It is not modified; defaults to an empty result.

    Returns:
    Dictionary mapping each granule id (file name up to the first ".") to a list with one
    entry per window: {"window_id": 1-based index, "polygon_coords": corner coordinates of
    the window bounds, "labels": names of the labels above predict_thresh}.
    """

    # None instead of a shared mutable {} default; shallow copy so the caller's dict is untouched
    output_dict = {} if current_output is None else current_output.copy()

    # Granules whose window list has already been detached from current_output, so that
    # appending below never mutates a list still owned by the caller
    detached_granules = set()

    # sorted() only makes the processing order deterministic
    granule_list = sorted(glob(f'{granule_dir}/*.tif'))

    for granule_path in granule_list:

        granule_id = granule_path.split("/")[-1].split(".")[0]

        with rio.open(granule_path) as src:

            windows = get_windows(src.shape, (patch_size, patch_size), (stride, stride))

            for window_id, window in enumerate(windows, start=1):

                data = src.read(bands, window=window, masked=True)
                data = add_ndvi(data)

                shape = data.shape
                new_shape = (shape[0], patch_size, patch_size)

                if shape != new_shape:
                    # Window came back smaller than a full patch: zero-pad it to the patch
                    # size, keeping the dtype of the real data so padded and unpadded
                    # windows reach the model pre-processing in the same format.
                    filled_array = np.full(new_shape, 0, dtype=data.dtype)
                    filled_array[:shape[0], :shape[1], :shape[2]] = data
                    data = filled_array
                    # Shrink the window to the pixels actually read so the reported
                    # polygon matches the real data footprint
                    window = Window(window.col_off, window.row_off, shape[2], shape[1])

                #image pre-processing / inference
                prediction = model.predict(read_image_tf_out(data))
                # column indices of the outputs above the threshold
                predicted_indices = np.where(prediction > predict_thresh)[1]
                label_name_list = [label_list[label_idx] for label_idx in predicted_indices]

                #vectorizing raster bounds for visualization
                window_bounds = rio.windows.bounds(window, src.transform, height=patch_size, width=patch_size)
                geom_coords = list(box(*window_bounds).exterior.coords)

                #create or append to dict....
                if granule_id not in detached_granules:
                    output_dict[granule_id] = list(output_dict.get(granule_id, []))
                    detached_granules.add(granule_id)

                output_dict[granule_id].append({"window_id":window_id,"polygon_coords":geom_coords,"labels":label_name_list})

    return output_dict

In [8]:
# Low-level S3 client; the inference loop further down calls s3.download_file on it.
# Note: this rebinds `s3`, which the raster-listing cell assigned to a boto3 *resource*.
s3 = boto3.client('s3')

import os

Below, I download the rasters from S3 one at a time: for each raster I run inference on it, save the results in a json file, and then delete the raster. This ensures that only one raster file is in storage at any given time.

Note the distinction between "old_json_name" and "new_json_name." Each time I run inference on a new raster, I save the results in a *new* json file, and only after that's done do I delete the old json file. This is to ensure that most data is not lost if the notebook is interrupted while writing the new json file.

In [9]:
granule_dir = './rasters_isl_2019/' # directory to store downloaded rasters in
json_filename_base = "ISL_2019_results_fixed" # name for the json file used to log results
new_json_name = json_filename_base + '_2019_549.json' # most recent json filename

# make sure the download target directory exists before the first download
os.makedirs(granule_dir, exist_ok=True)

# resume from the most recent checkpoint if there is one, otherwise start from scratch
if os.path.exists(new_json_name):
    with open(new_json_name, 'r') as openfile:
        od = json.load(openfile)
else:
    print(f'Checkpoint {new_json_name} not found; starting with empty results')
    od = {}

length = len(rasters_list_2019) - 1

# NOTE(review): [1:] skips the first listed key — presumably the "directory" placeholder
# object of the prefix rather than a raster; confirm.
for i, s3_path in enumerate(rasters_list_2019[1:]):

    filename = s3_path.split('/')[-1]
    name = filename.split('.')[0]

    if name in od: # only run inference if the raster isn't in the json file
        continue

    print(f'Raster {i+1} of {length}', end='\r', flush=True)

    filepath = os.path.join(granule_dir, filename)

    try:
        s3.download_file(bucket_name, s3_path, filepath) # download raster from S3

        # NOTE(review): output_windows_2 is not defined in this notebook (only
        # output_windows is); presumably it comes from inference_predict or a deleted
        # cell — confirm, or switch to output_windows.
        output_dict = output_windows_2(granule_dir, current_output=od) # run inference
    finally:
        # delete the raster even if inference failed, so a stale file is never
        # picked up again together with the next raster
        if os.path.exists(filepath):
            os.remove(filepath)

    old_json_name = new_json_name

    new_json_name = json_filename_base + '_' + name + '.json' # create new json filename

    # write to a temporary file first and rename it into place, so the checkpoint on
    # disk is always either the complete previous version or the complete new one
    tmp_json_name = new_json_name + '.tmp'
    with open(tmp_json_name, "w") as outfile:
        json.dump(output_dict, outfile) # dump inference results into new json file
    os.replace(tmp_json_name, new_json_name)

    with open(new_json_name, 'r') as openfile:
        od = json.load(openfile) # load those results into "od" variable

    # delete old json file — but never the file that was just written
    if old_json_name != new_json_name and os.path.exists(old_json_name):
        os.remove(old_json_name)

Raster 3032 of 3032

In [166]:
# Tally, over every window of every granule, how often each label was predicted.
# Windows without any label are counted under "null_chips".
data = output_dict

label_match_results = []
granule_list = data.keys()
granule_count = len(granule_list)

# "granule_count" goes in first; label keys are added in the order they are first seen
count = {"granule_count": granule_count}

for granule_windows in data.values():
    for window_record in granule_windows:
        window_labels = window_record['labels']
        if len(window_labels) == 0:
            count["null_chips"] = count.get("null_chips", 0) + 1
        for label in window_labels:
            count[label] = count.get(label, 0) + 1
        

In [167]:
# Display the tally built in the previous cell (granule count, empty windows, per-label counts)
count

{'granule_count': 1,
 'null_chips': 1512,
 'Shifting_cultivation': 336,
 'ISL': 82,
 'Roads': 5,
 'Industrial_agriculture': 1}