In [None]:
# Paths of the image files for which a histogram needs to be calculated.
# Each entry must be a single-band file in a format supported by GDAL.
# NOTE(review): the path is absolute; presumably it has to be readable from
# wherever the histogram is computed -- confirm against the cluster setup.

files = [
    "/data/MTDA/TIFFDERIVED/PROBAV_L3_S1_TOC_333M/20160101/PROBAV_S1_TOC_20160101_333M_V001/PROBAV_S1_TOC_X18Y02_20160101_333M_V001_NDVI.tif"
]


In [None]:
# Calculates the histogram for a given (single band) image file.
def histogram(image_file):
    """Calculate the 256-bin histogram of the first band of an image file.

    Parameters
    ----------
    image_file : str
        Path of a raster file that GDAL can open. Only band 1 is read.

    Returns
    -------
    numpy.ndarray
        The pixel count per bin (256 entries). The bin edges returned by
        ``np.histogram`` are discarded.

    Raises
    ------
    IOError
        If GDAL cannot open the file, or band 1 cannot be fetched or read.

    Notes
    -----
    No ``range`` is passed to ``np.histogram``, so the 256 bins span the
    minimum and maximum value of this particular image.
    """
    # Imports are kept local to the function, presumably so that they are
    # resolved wherever the function ends up being executed.
    import numpy as np
    # 'osgeo' is the supported import path for the GDAL bindings; the bare
    # top-level 'gdal' module is deprecated.
    from osgeo import gdal

    # Open image file. Without gdal.UseExceptions(), a failure is reported by
    # returning None, so fail loudly here instead of crashing further down.
    img = gdal.Open(image_file)
    if img is None:
        raise IOError('-ERROR- Unable to open image file "%s"' % image_file)

    # Open raster band (first band)
    raster = img.GetRasterBand(1)
    if raster is None:
        raise IOError('-ERROR- Unable to read band 1 of image file "%s"' % image_file)

    # Read raster data. Called without a window, ReadAsArray returns the whole
    # band, so the band dimensions do not have to be looked up separately.
    data = raster.ReadAsArray()
    if data is None:
        raise IOError('-ERROR- Unable to read band 1 of image file "%s"' % image_file)

    # Calculate histogram
    hist, _ = np.histogram(data, bins=256)
    return hist


In [None]:
# ================================================================
# === Calculate the histogram for a given number of files. The ===
# === processing is performed by spreading them over a cluster ===
# === of Spark nodes.                                          ===
# ================================================================

import pyspark
from pyspark.conf import SparkConf
from datetime import datetime
from operator import add

# Setup the Spark cluster. All values are given as strings, which is how
# Spark stores configuration entries.
conf = SparkConf()
conf.set('spark.yarn.executor.memoryOverhead', '1024')
conf.set('spark.executor.memory', '8g')
conf.set('spark.executor.cores', '2')
conf.set('spark.executor.instances', '10')

# getOrCreate reuses a context that is already running in this kernel, so the
# cell can be re-run; constructing SparkContext a second time raises an error.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Distribute the local file list over the cluster.
filesRDD = sc.parallelize(files)

# Apply the 'histogram' function to each filename using 'map', keep the result in memory using 'cache'
# so that the two actions below (count and reduce) do not compute every histogram twice.
hists = filesRDD.map(histogram).cache()

count = hists.count()

# Combine distributed histograms into a single result by adding them bin by bin.
# list() makes the reducer return a concrete sequence on Python 3 as well,
# where map() alone would hand a lazy iterator to the next reduction step.
total = hists.reduce(lambda h, i: list(map(add, h, i)))

print("Sum of %i histograms: %s" % (count, total))