In [1]:
import datacube
import numpy as np
from datacube.storage import netcdf_writer
from datacube.model import Variable, CRS
import os
import re
import xarray as xr

# Parámetros


In [2]:
execID="PCA"
algorithm = "PCA"
version= 1
min_long = -74
min_lat = 0
time_ranges = [("2014-01-01", "2014-12-31"), ("2015-01-01", "2015-12-31")]
product = 'LS8_OLI_LEDAPS'
bands = ["blue", "green", "red", "nir", "swir1", "swir2"]
normalized=True
minValid=1

# Consulta

In [3]:
dc = datacube.Datacube(app="PCA")
xarr={}
i=0
for tr in time_ranges:
    xarr[i] = dc.load(product=product, longitude=(min_long, min_long+1.0), latitude=(min_lat, min_lat+1), time=tr)
    i+=1

In [4]:
xarr

{0: <xarray.Dataset>
 Dimensions:          (latitude: 3686, longitude: 3705, time: 22)
 Coordinates:
   * time             (time) datetime64[ns] 2014-01-10T15:08:36 ...
   * latitude         (latitude) float64 0.9999 0.9996 0.9993 0.9991 0.9988 ...
   * longitude        (longitude) float64 -74.0 -74.0 -74.0 -74.0 -74.0 -74.0 ...
 Data variables:
     coastal_aerosol  (time, latitude, longitude) int16 100 101 100 101 100 ...
     blue             (time, latitude, longitude) int16 136 134 137 131 136 ...
     green            (time, latitude, longitude) int16 301 320 301 300 304 ...
     red              (time, latitude, longitude) int16 177 184 176 179 179 ...
     nir              (time, latitude, longitude) int16 3066 3235 3068 2989 ...
     swir1            (time, latitude, longitude) int16 1503 1451 1415 1365 ...
     swir2            (time, latitude, longitude) int16 552 539 529 507 521 ...
     cf_mask          (time, latitude, longitude) int16 0 0 0 0 0 0 0 0 0 0 0 ...
     cf_ma

In [5]:
#Sólo para mantener los nombres que estarán en el algoritmo:
xarr0=xarr[0]
xarr1=xarr[1]
del xarr

# Algoritmo

In [6]:
#Calcular el compuesto de medianas para cada uno de las entradas
nbar = xarr0
nodata=-9999
medians1={}
cloud_mask=np.where(np.logical_and(nbar["cf_mask"].values!=2, nbar["cf_mask"].values<4), True, False)
for band in bands:
    datos=np.where(np.logical_and(nbar.data_vars[band]!=nodata,cloud_mask),nbar.data_vars[band], np.nan)
    allNan=~np.isnan(datos)
    if normalized:
        m=np.nanmean(datos.reshape((datos.shape[0],-1)), axis=1)
        st=np.nanstd(datos.reshape((datos.shape[0],-1)), axis=1)
        datos=np.true_divide((datos-m[:,np.newaxis,np.newaxis]), st[:,np.newaxis,np.newaxis])*np.nanmean(st)+np.nanmean(m)
    medians1[band]=np.nanmedian(datos,0)
    medians1[band][np.sum(allNan,0)<minValid]=np.nan
del datos
nbar = xarr1
nodata=-9999
medians2={}
cloud_mask=np.where(np.logical_and(nbar["cf_mask"].values!=2, nbar["cf_mask"].values<4), True, False)
for band in bands:
    datos=np.where(np.logical_and(nbar.data_vars[band]!=nodata,cloud_mask),nbar.data_vars[band], np.nan)
    allNan=~np.isnan(datos)
    if normalized:
        m=np.nanmean(datos.reshape((datos.shape[0],-1)), axis=1)
        st=np.nanstd(datos.reshape((datos.shape[0],-1)), axis=1)
        datos=np.true_divide((datos-m[:,np.newaxis,np.newaxis]), st[:,np.newaxis,np.newaxis])*np.nanmean(st)+np.nanmean(m)
    medians2[band]=np.nanmedian(datos,0)
    medians2[band][np.sum(allNan,0)<minValid]=np.nan
del datos



In [7]:
from matplotlib.mlab import PCA
from sklearn.preprocessing import normalize
from scipy.cluster.vq import kmeans2,vq

In [8]:
#Preprocesar: 
nmed=None
nan_mask=None
for band in medians1:
    b=medians1[band].ravel()
    if nan_mask is None: 
        nan_mask=np.isnan(b)
    else:
        nan_mask=np.logical_or(nan_mask, np.isnan(medians1[band].ravel()))
    b[np.isnan(b)]=np.nanmedian(b)
    if nmed is None:
        sp=medians1[band].shape
        nmed=b
    else:
        nmed=np.vstack((nmed,b))
    c=medians2[band].ravel()
    nan_mask=np.logical_or(nan_mask, np.isnan(c))
    c[np.isnan(c)]=np.nanmedian(c)
    nmed=np.vstack((nmed,c))
del medians1
del medians2

In [9]:
r_PCA=PCA(nmed.T)

In [10]:
salida= r_PCA.Y.T.reshape((r_PCA.Y.T.shape[0],)+sp)
km_centroids, kmvalues=kmeans2(r_PCA.Y,4)
salida[:,nan_mask.reshape(sp)]=np.nan


In [12]:
#kmeans
kmv= kmvalues.T.reshape(sp)
kmv[nan_mask.reshape(sp)]=nodata

In [19]:
kmv

array([[    2,     2,     2, ...,     2,     2,     2],
       [    2,     2,     2, ...,     1,     2,     2],
       [    2,     2,     2, ...,     1,     1,     2],
       ..., 
       [-9999, -9999, -9999, ..., -9999, -9999, -9999],
       [-9999, -9999, -9999, ..., -9999, -9999, -9999],
       [-9999, -9999, -9999, ..., -9999, -9999, -9999]], dtype=int32)

In [13]:
coordenadas = []
dimensiones =[]
xcords = {}
for coordenada in xarr0.coords:
    if(coordenada != 'time'):
        coordenadas.append( ( coordenada, xarr0.coords[coordenada]) )
        dimensiones.append(coordenada)
        xcords[coordenada] = xarr0.coords[coordenada]
valores = {"kmeans": xr.DataArray(kmv, dims=dimensiones, coords=coordenadas)}
i=1
for x in salida:
    valores["pc"+str(i)]=xr.DataArray(x, dims=dimensiones, coords=coordenadas)
    i+=1
output = xr.Dataset(valores, attrs={'crs': xarr0.crs})
for coordenada in output.coords:
    output.coords[coordenada].attrs["units"] = xarr0.coords[coordenada].units

In [14]:
output

<xarray.Dataset>
Dimensions:    (latitude: 3686, longitude: 3705)
Coordinates:
  * latitude   (latitude) float64 0.9999 0.9996 0.9993 0.9991 0.9988 0.9985 ...
  * longitude  (longitude) float64 -74.0 -74.0 -74.0 -74.0 -74.0 -74.0 -74.0 ...
Data variables:
    pc8        (latitude, longitude) float64 0.6087 0.765 0.3897 -0.03688 ...
    pc9        (latitude, longitude) float64 -0.1597 -0.1363 -0.0779 0.01788 ...
    pc2        (latitude, longitude) float64 2.613 2.392 1.604 0.486 1.063 ...
    pc3        (latitude, longitude) float64 1.289 1.612 0.5752 1.385 2.455 ...
    pc1        (latitude, longitude) float64 1.831 1.574 -0.01468 -0.5085 ...
    pc6        (latitude, longitude) float64 -0.05934 0.3248 -0.1954 ...
    kmeans     (latitude, longitude) int32 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ...
    pc4        (latitude, longitude) float64 -0.3019 0.0927 0.3629 0.8255 ...
    pc5        (latitude, longitude) float64 -0.8287 -0.2217 -1.074 -1.292 ...
    pc11       (latitude, longitude) 

# Guardar

In [18]:
folder="./"
filename=folder+"PCA_{}_{}_{}.nc".format(min_lat,min_long,re.sub('[^\w_.)(-]', '', str(time_ranges)))
nco=netcdf_writer.create_netcdf(filename)
coords=output.coords
cnames=()
for x in coords:
    netcdf_writer.create_coordinate(nco, x, coords[x].values, coords[x].units)
    cnames=cnames+(x,)
netcdf_writer.create_grid_mapping_variable(nco, output.crs)
for band in output.data_vars:
    output.data_vars[band].values[np.isnan(output.data_vars[band].values)]=nodata
    var= netcdf_writer.create_variable(nco, band, Variable(output.data_vars[band].dtype, nodata, cnames, None) ,set_crs=True)
    var[:] = netcdf_writer.netcdfy_data(output.data_vars[band].values)
nco.close()