# Real-time analysis

In [None]:
import numpy as np
import xarray as xr
from datetime import datetime
from xhistogram.xarray import histogram
from rasterio import features
import rioxarray
import matplotlib.pyplot as plt
from affine import Affine
from ast import literal_eval
from shapely.geometry import Polygon
import _pickle as pickle

## Utils
**plot_hist**

In [None]:
def plot_hist(x_min, count):
    """Draw a bar histogram expressed as a percentage of the total count.

    Parameters
    ----------
    x_min : numpy.ndarray
        Lower edges of the histogram bins. The bar width is derived from the
        spacing between the first two entries, so the edges are assumed to be
        evenly spaced and to contain at least two values.
    count : numpy.ndarray
        Number of samples falling in each bin; must have the same length as
        ``x_min``.

    Notes
    -----
    The input array is left untouched: bar positions are computed on a new
    array instead of being shifted in place.
    """
    # Bars use 4/5 of the bin spacing so that neighbouring bars do not touch.
    width = x_min[1]-x_min[0]
    width -= width/5.
    # Build a new array for the bar positions rather than using `+=`, which
    # would silently modify the caller's array (and any array it is a view of).
    bar_positions = x_min + width/(5.*2)
    per = count/count.sum()*100

    plt.figure(figsize=(10,5))

    plt.bar(bar_positions, per, width=width)

    # Dashed vertical reference line at x = 0.
    plt.plot([0,0], [0,per.max()], color = 'k', linestyle = '--')

    plt.title('Soil Organic Carbon Stock')
    plt.xlabel('SOC stock (t C/ha)')
    plt.ylabel('(%) of total area')

**transform_from_latlon**

In [None]:
def transform_from_latlon(lat, lon):
    """Build an affine transform from 1-D latitude and longitude arrays.

    The transform is a translation to the first longitude/latitude value
    composed with a scaling by the spacing between the first two values of
    each axis. Both arrays therefore need at least two entries.
    NOTE(review): this presumably expects evenly spaced coordinates — confirm.
    """
    lat_values, lon_values = np.asarray(lat), np.asarray(lon)
    x_origin, y_origin = lon_values[0], lat_values[0]
    x_step = lon_values[1] - lon_values[0]
    y_step = lat_values[1] - lat_values[0]
    return Affine.translation(x_origin, y_origin) * Affine.scale(x_step, y_step)

**rasterize**

In [None]:
def rasterize(shapes, coords, latitude='latitude', longitude='longitude',
              fill=np.nan, **kwargs):
    """Burn (geometry, value) pairs onto the grid described by `coords`.

    `coords[latitude]` and `coords[longitude]` must be 1-D; they define the
    output shape and the affine transform handed to `features.rasterize`.
    Cells that are not burned keep `fill`. Extra keyword arguments are
    forwarded to `features.rasterize`. The result is an `xr.DataArray` with
    dimensions `(latitude, longitude)` carrying the same coordinates.
    """
    lat_coord = coords[latitude]
    lon_coord = coords[longitude]
    burned = features.rasterize(
        shapes,
        out_shape=(len(lat_coord), len(lon_coord)),
        fill=fill,
        transform=transform_from_latlon(lat_coord, lon_coord),
        dtype=float,
        **kwargs,
    )
    return xr.DataArray(
        burned,
        coords={latitude: lat_coord, longitude: lon_coord},
        dims=(latitude, longitude),
    )

## Read `xarray.Dataset` from a local pickle file

In [None]:
dataset_type = 'experimental-dataset'
group = 'stocks'

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files that come from a trusted source.
# The handle is named `fh` (not `input`) so the builtin input() is not
# shadowed for the rest of the notebook session.
with open(f'../data/{dataset_type}_{group}.pkl', 'rb') as fh:
    ds = pickle.load(fh)

ds

## Zonal statistics

**Polygon**

In [None]:
# GeoJSON FeatureCollection holding the single polygon used for the zonal statistics.
# Each vertex is an [x, y] pair; the ring is closed (first vertex == last vertex).
polygon = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {},
            "geometry": {
                "type": "Polygon",
                "coordinates": [
                    [
                        [-63.34716796874999, -34.234512362369856],
                        [-64.22607421875, -35.17380831799957],
                        [-63.896484375, -35.78217070326606],
                        [-63.34716796874999, -35.88905007936092],
                        [-62.86376953124999, -35.46066995149529],
                        [-62.51220703125, -35.08395557927643],
                        [-62.49023437499999, -34.57895241036947],
                        [-63.34716796874999, -34.234512362369856],
                    ]
                ],
            },
        }
    ],
}

# Build the shapely geometry from the first ring of the first feature.
geometry = Polygon(polygon["features"][0]["geometry"]["coordinates"][0])
geometry

**Create the data mask by rasterizing the vector data**

In [None]:
%%time
# Pair the polygon with burn value 0; cells that are not burned keep
# rasterize's default NaN fill.
shapes = [(geometry, 0)]
da_mask = rasterize(shapes, ds.coords, latitude='lat', longitude='lon')
da_mask = da_mask.rename('mask')
# Store the mask next to the data so it can be used for filtering later on.
ds['mask'] = da_mask

## Change

**Input variables**

In [None]:
# First and last year (as strings) between which the change is computed.
years = ['1982', '2017']
# Label selected along the `depth` coordinate of the dataset.
depth = '0-30'
# Histogram settings: number of bins and the [lower, upper] value range.
nBinds = 80
bindsRange = [-50, 50]

**Computation**

In [None]:
%%time
# Both dates are taken at 31 December of the requested years.
start_date = np.datetime64(datetime.strptime(f'{years[0]}-12-31', "%Y-%m-%d"))
end_date = np.datetime64(datetime.strptime(f'{years[1]}-12-31', "%Y-%m-%d"))

# shapely returns bounds in the order (minx, miny, maxx, maxy).
lon_min, lat_min, lon_max, lat_max = geometry.bounds

# Crop to the polygon's bounding box first, then mask, so that the mask is
# only evaluated on the small window instead of on the whole dataset.
# The latitude slice runs from max to min, exactly as before.
# NOTE(review): this assumes the `lat` coordinate is stored in descending order — confirm.
ds_bbox = ds.sel(lon=slice(lon_min, lon_max), lat=slice(lat_max, lat_min))
# Keep only the cells burned with value 0, i.e. the cells inside the polygon.
xds_index = ds_bbox.where(ds_bbox['mask'].isin(0.0))

# Get difference between two dates
diff = xds_index.loc[dict(time=end_date, depth=depth)] - xds_index.loc[dict(time=start_date, depth=depth)]

# Get counts and bins of the histogram
bins = np.linspace(bindsRange[0], bindsRange[1], nBinds+1)
h = histogram(diff.stocks, bins=[bins], dim=['lat', 'lon'])

count = h.values
mean_diff = diff['stocks'].mean(skipna=True).values

**Output values**

In [None]:
# NOTE(review): `years` is reassigned in the "Time series" section below, so
# re-running this cell after that section uses a different `years` list.
print(f'Soil Organic Carbon Stock Change: {mean_diff/(int(years[1])-int(years[0]))} t C/ha year')
# Pass a copy: bins[:-1] is a view of `bins`, so any in-place change made to
# it by the callee would also alter `bins` and accumulate across re-runs.
x_min = bins[:-1].copy()
plot_hist(x_min, count)

## Time series

**Computation**

In [None]:
%%time
# Year of every time step, parsed from the leading part of its string form.
years = [int(str(x).split('-')[0]) for x in ds.coords.get('time').values]

# shapely returns bounds in the order (minx, miny, maxx, maxy).
lon_min, lat_min, lon_max, lat_max = geometry.bounds

# Crop to the polygon's bounding box first, then mask, so that the mask is
# only evaluated on the small window instead of on the whole dataset.
# NOTE(review): the max -> min latitude slice assumes `lat` is stored in descending order — confirm.
ds_bbox = ds.sel(depth='0-30', lon=slice(lon_min, lon_max), lat=slice(lat_max, lat_min))
xds_index = ds_bbox.where(ds_bbox['mask'].isin(0.0))

# Mean over the masked area for every time step.
values = xds_index['stocks'].mean(['lon', 'lat']).values

**Output values**

In [None]:
# Label the figure so it can be read on its own; the trailing `;` hides the
# text repr of the last call in the cell output.
fig, ax = plt.subplots()
ax.plot(years, values)
ax.set_title('Soil Organic Carbon Stock')
ax.set_xlabel('Year')
ax.set_ylabel('SOC stock (t C/ha)');

## Cloud function
**`main.py` file**

In [10]:
import numpy as np
import xarray as xr
from xhistogram.xarray import histogram
from datetime import datetime
from affine import Affine
from rasterio import features
from shapely.geometry import Polygon
import _pickle as pickle
import json

class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native Python types."""

    def default(self, obj):
        """Return a JSON-serialisable equivalent of `obj`.

        NumPy booleans, integers and floats become `bool`, `int` and `float`;
        arrays become (nested) lists. Anything else is delegated to the base
        class, which raises `TypeError`.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
        
def transform_from_latlon(lat, lon):
    """Build an affine transform from 1-D latitude and longitude arrays.

    The transform is a translation to the first longitude/latitude value
    composed with a scaling by the spacing between the first two values of
    each axis. Both arrays therefore need at least two entries.
    NOTE(review): this presumably expects evenly spaced coordinates — confirm.
    """
    lat_values, lon_values = np.asarray(lat), np.asarray(lon)
    x_origin, y_origin = lon_values[0], lat_values[0]
    x_step = lon_values[1] - lon_values[0]
    y_step = lat_values[1] - lat_values[0]
    return Affine.translation(x_origin, y_origin) * Affine.scale(x_step, y_step)

def rasterize(shapes, coords, latitude='latitude', longitude='longitude',
              fill=np.nan, **kwargs):
    """Burn (geometry, value) pairs onto the grid described by `coords`.

    `coords[latitude]` and `coords[longitude]` must be 1-D; they define the
    output shape and the affine transform handed to `features.rasterize`.
    Cells that are not burned keep `fill`. Extra keyword arguments are
    forwarded to `features.rasterize`. The result is an `xr.DataArray` with
    dimensions `(latitude, longitude)` carrying the same coordinates.
    """
    lat_coord = coords[latitude]
    lon_coord = coords[longitude]
    burned = features.rasterize(
        shapes,
        out_shape=(len(lat_coord), len(lon_coord)),
        fill=fill,
        transform=transform_from_latlon(lat_coord, lon_coord),
        dtype=float,
        **kwargs,
    )
    return xr.DataArray(
        burned,
        coords={latitude: lat_coord, longitude: lon_coord},
        dims=(latitude, longitude),
    )

def compute_values(ds, geometry, years, depth, variable, dataset_type, group, nBinds, bindsRange):
    """Compute the change histogram and the time series inside a polygon.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with `time`, `depth`, `lat` and `lon` coordinates and a `mask`
        variable equal to 0 inside the area of interest.
    geometry : shapely geometry
        Geometry whose bounding box is used to crop the dataset.
    years : sequence
        Start and end of the period (first two entries are used). For the
        historic group of the global dataset they are used as `time` labels
        directly; otherwise they are turned into 31 December dates.
    depth : str
        Label selected along the `depth` coordinate.
    variable : str
        Name of the data variable to analyse.
    dataset_type, group : str
        Identify the dataset; they drive the date handling and unit scaling.
    nBinds : int
        Number of histogram bins.
    bindsRange : sequence
        Lower and upper limit of the histogram.

    Returns
    -------
    tuple
        `(counts, bins, mean_diff, mean_years, mean_values)`: histogram counts,
        bin edges (`nBinds + 1` values), mean of the difference between the
        two dates, the time axis, and the area mean of `variable` per time step.
    """
    time_values = ds.coords.get('time').values
    if dataset_type == 'global-dataset' and group == 'historic':
        # Time labels are used as given, without conversion to dates.
        start_date = years[0]
        end_date = years[1]
        mean_years = time_values
    else:
        start_date = np.datetime64(datetime.strptime(f'{years[0]}-12-31', "%Y-%m-%d"))
        end_date = np.datetime64(datetime.strptime(f'{years[1]}-12-31', "%Y-%m-%d"))
        mean_years = [int(str(x).split('-')[0]) for x in time_values]

    # shapely returns bounds in the order (minx, miny, maxx, maxy).
    lon_min, lat_min, lon_max, lat_max = geometry.bounds

    # Use the requested depth (it used to be hard-coded to '0-30'), crop to the
    # bounding box first and only then mask, so the mask is evaluated on the
    # small window instead of on the whole dataset.
    # NOTE(review): the max -> min latitude slice assumes `lat` is stored in descending order — confirm.
    ds_bbox = ds.sel(depth=depth, lon=slice(lon_min, lon_max), lat=slice(lat_max, lat_min))
    ds_index = ds_bbox.where(ds_bbox['mask'].isin(0.0))

    # Get difference between two dates
    diff = ds_index.loc[dict(time=end_date)] - ds_index.loc[dict(time=start_date)]

    # Experimental concentration values are divided by 10 before binning.
    # NOTE(review): `mean_values` below is not rescaled the same way — confirm this is intended.
    if dataset_type == 'experimental-dataset' and variable == 'concentration':
        diff = diff[variable]/10.
    else:
        diff = diff[variable]

    # Get counts and bins of the histogram
    bins = np.linspace(bindsRange[0], bindsRange[1], nBinds+1)
    h = histogram(diff, bins=[bins], dim=['lat', 'lon'])

    counts = h.values
    mean_diff = diff.mean(skipna=True).values
    mean_values = ds_index[variable].mean(['lon', 'lat']).values

    return counts, bins, mean_diff, mean_years, mean_values

def serializer(counts, bins, mean_diff, mean_years, mean_values):
    """Bundle the analysis results into a dict keyed by the argument names."""
    return dict(
        counts=counts,
        bins=bins,
        mean_diff=mean_diff,
        mean_years=mean_years,
        mean_values=mean_values,
    )

def analysis(request):
    """Entry point: run the zonal analysis described by `request`.

    `request` is either an already-parsed dict or an object exposing
    `get_json()` that returns such a dict. The keys read are `dataset_type`,
    `group`, `geometry` (a GeoJSON FeatureCollection; the first ring of the
    first feature is used), `years`, `depth`, `variable`, `nBinds` and
    `bindsRange`.

    Returns the results as a JSON string.
    """
    # Accept both a plain dict and a request object that provides get_json().
    if hasattr(request, 'get_json'):
        request = request.get_json()

    # Read xarray.Dataset from pkl
    dataset_type = request['dataset_type']
    group = request['group']

    # NOTE(review): the path is built from request values and pickle.load can
    # execute arbitrary code; validate `dataset_type` / `group` against the
    # known datasets before exposing this to untrusted callers.
    # The handle is named `fh` so the builtin input() is not shadowed.
    with open(f'../data/{dataset_type}_{group}.pkl', 'rb') as fh:
        ds = pickle.load(fh)

    # Create the data mask by rasterizing the vector data
    geometry = Polygon(request['geometry'].get('features')[0].get('geometry').get('coordinates')[0])

    # Burn value 0 for the polygon; cells that are not burned keep the NaN fill.
    shapes = [(geometry, 0)]
    da_mask = rasterize(shapes, ds.coords, longitude='lon', latitude='lat').rename('mask')
    ds['mask'] = da_mask

    # Compute output values
    counts, bins, mean_diff, mean_years, mean_values = compute_values(
        ds, geometry, request['years'], request['depth'],
        request['variable'], dataset_type, group,
        request['nBinds'], request['bindsRange'])

    return json.dumps(serializer(counts, bins, mean_diff, mean_years, mean_values), cls=NpEncoder)

In [11]:
# Sample request body used to exercise `analysis` locally.
payload = {
    "dataset_type": 'experimental-dataset',
    "group": 'stocks',
    "years": ['1982', '2017'],
    "depth": '0-30',
    "variable": 'stocks',
    "nBinds": 80,
    "bindsRange": [-50, 50],
    "geometry": {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "properties": {},
                "geometry": {
                    "type": "Polygon",
                    "coordinates": [
                        [
                            [-63.34716796874999, -34.234512362369856],
                            [-64.22607421875, -35.17380831799957],
                            [-63.896484375, -35.78217070326606],
                            [-63.34716796874999, -35.88905007936092],
                            [-62.86376953124999, -35.46066995149529],
                            [-62.51220703125, -35.08395557927643],
                            [-62.49023437499999, -34.57895241036947],
                            [-63.34716796874999, -34.234512362369856],
                        ]
                    ],
                },
            }
        ],
    },
}

In [12]:
# Run the entry point locally with the sample payload; the JSON string it
# returns is displayed as the cell output.
analysis(payload)

'{"counts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 40, 159, 349, 533, 454, 718, 1308, 1990, 1773, 3093, 4032, 5546, 5396, 7383, 10383, 10389, 14917, 16806, 19075, 22001, 22555, 22207, 20030, 19504, 19400, 15006, 13093, 10084, 8035, 8436, 5813, 4374, 1730, 2262, 727, 1212, 566, 845, 670, 562, 142, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "bins": [-50.0, -48.75, -47.5, -46.25, -45.0, -43.75, -42.5, -41.25, -40.0, -38.75, -37.5, -36.25, -35.0, -33.75, -32.5, -31.25, -30.0, -28.75, -27.5, -26.25, -25.0, -23.75, -22.5, -21.25, -20.0, -18.75, -17.5, -16.25, -15.0, -13.75, -12.5, -11.25, -10.0, -8.75, -7.5, -6.25, -5.0, -3.75, -2.5, -1.25, 0.0, 1.25, 2.5, 3.75, 5.0, 6.25, 7.5, 8.75, 10.0, 11.25, 12.5, 13.75, 15.0, 16.25, 17.5, 18.75, 20.0, 21.25, 22.5, 23.75, 25.0, 26.25, 27.5, 28.75, 30.0, 31.25, 32.5, 33.75, 35.0, 36.25, 37.5, 38.75, 40.0, 41.25, 42.5, 43.75, 45.0, 46.25, 47.5, 48.75, 50.0], "mean_diff": -8.592864990234375, "mean_years": [1982, 1983, 1