In [1]:
import pw
import json
import glob
import ccd
import hashlib
import os
import numpy as np
import xarray as xr
import pandas as pd
from datetime import datetime

from pyspark import SparkConf, SparkContext

In [3]:
# Both interpreter settings need to be set (to python3) and must agree.
# A bare equality check also evaluates True when neither variable is set
# (None == None), so additionally require that a value is present.
pyspark_driver_python = os.getenv('PYSPARK_DRIVER_PYTHON')
pyspark_python_ok = bool(
    pyspark_driver_python is not None
    and pyspark_driver_python == os.getenv('PYSPARK_PYTHON')
)
pyspark_python_ok

True

In [4]:
# Build the Spark configuration from the settings exposed by the pw module.
# The application name carries a launch timestamp. %H (24-hour clock) is used
# rather than %I (12-hour clock): without an AM/PM marker, %I gives e.g. a
# 03:15 and a 15:15 launch on the same day the same name.
conf = (
    SparkConf()
    .setAppName("lcmap-gen-{}".format(datetime.now().strftime('%Y-%m-%d-%H:%M')))
    .setMaster(pw.LPW_MESOS_MASTER)
    .set("spark.mesos.executor.docker.image", pw.LPW_EXECUTOR_IMAGE)
    .set("spark.executor.cores", pw.LPW_EXECUTOR_CORES)
    .set("spark.mesos.executor.docker.forcePullImage", pw.LPW_EXECUTOR_FORCE_PULL_IMAGE)
)

In [5]:
# Reuse the running SparkContext if this cell is executed again in the same
# kernel; constructing a second SparkContext directly raises an error.
# Note: when a context already exists, getOrCreate returns it as-is and the
# settings in `conf` are not re-applied.
sc = SparkContext.getOrCreate(conf)

In [6]:
# Sample job request. 'inputs_url' is the tile query URL; replace the
# '< !!  you define !! >' placeholder with your own host before using it.
# The query asks for the same eight bands from each of three sensors, so the
# repeated '&ubid=<sensor>/<band>' parameters are generated rather than
# spelled out one by one.
# NOTE(review): tile_x/tile_y (-1821585, 2891595) differ from the x/y in the
# URL query (-1851585, 2870805) -- confirm this is intended.
good_input_data = {
    'algorithm': 'lcmap-pyccd:1.0.3.b1',
    'tile_update_requested': '2017-02-28T04:35:46.023Z',
    'tile_x': -1821585,
    'tile_y': 2891595,
    'inputs_url': (
        'http://< !!  you define !! >/landsat/tiles'
        '?x=-1851585&y=2870805'
        '&acquired=1980-01-01/2015-12-31'
        + ''.join(
            '&ubid={}/{}'.format(sensor, band)
            for sensor in ('LANDSAT_4/TM', 'LANDSAT_5/TM', 'LANDSAT_7/ETM')
            for band in ('cfmask', 'sr_band1', 'sr_band2', 'sr_band3',
                         'sr_band4', 'sr_band5', 'sr_band7', 'toa_band6')
        )
    ),
}

In [11]:
# At this point, you could take 'good_input_data' and call
# pw.worker.spark_job(good_input_data). That would assemble the data, format
# it for pyccd, and create a spark job to generate results. It would then try
# to save them off to the cassandra instance you have configured in the
# environment settings referenced by the pw module when it's loaded.
# Here only the assembly step is run, so the result can be inspected.
data = pw.worker.assemble_data(good_input_data)

In [12]:
# Sanity check: we should have data for 10k pixels, i.e. data['data'] should
# hold 10000 entries.
len(data['data']) == 10000

True

In [13]:
# We don't need to operate on all 10k pixels: keep only the first 10 entries
# and distribute them as an RDD, with the partition count taken from
# pw.LPW_SPARK_PARALLELIZATION.
short_data = data['data'][:10]
ccd_rdd = sc.parallelize(short_data, pw.LPW_SPARK_PARALLELIZATION)

In [14]:
# Map and collect pyccd results for our subset of data.
# The first item of each RDD element is a tuple of the x and y coordinates for
# that pixel stack; both are passed to detect alongside the element itself.
# NOTE(review): the bare name `detect` was never imported or defined in any
# visible cell, so this raised NameError on a fresh kernel. It is assumed to
# be pw.worker.detect (the module that provides assemble_data) -- confirm.
# The lambda parameter is named `pixel` so it no longer reuses the name `x`
# that the collected results are bound to.
x = ccd_rdd.map(lambda pixel: pw.worker.detect(pixel, pixel[0][0], pixel[0][1])).collect()

In [15]:
# Inspect the first collected result.
x[0]

{'algorithm': 'lcmap-pyccd:1.3.0',
 'inputs_md5': 'not implemented',
 'result': '{"processing_mask": [false, false, false, false, false, true, false, false, false, false, true, true, true, true, true, true, false, false, false, false, false, false, true, true, false, false, false, true, false, true, true, true, true, true, true, true, true, true, true, false, false, true, false, false, true, false, false, false, false, false, true, true, false, true, true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, false, false, false, true, true, true, false, false, false, true, false, false, false, false, true, false, false, false, false, true, true, true, false, false, true, true, true, true, true, true, true, false, false, true, true, true, true, true, true, false, false, true, true, false, true, false, true, true, false, true, true, true, true, true, true, true, false, true, true, false, true, true, false, true, false, false, false, true, false, false, fal

In [7]:
# Super simple RDD and Spark job test; helps ensure our Spark jobs can get placed.
#import random

#def inside(p):
#    x, y = random.random(), random.random()
#    return x*x + y*y < 1

In [8]:
#count = sc.parallelize(range(0, 10)).filter(inside).count()