In [57]:
from dask_gateway import Gateway
from dask_gateway import GatewayCluster
from dask.distributed import Client

import numpy

import os
import sys

import boto3

In [59]:
def get_s3_keys(bucket, s3_client, prefix = ''):
    """
    Generate the keys in an S3 bucket.

    Calls ``s3_client.list_objects_v2`` repeatedly, following
    ``NextContinuationToken`` until a response carries none, and yields
    each object's ``Key`` lazily.

    :param bucket: Name of the S3 bucket.
    :param s3_client: Object exposing ``list_objects_v2(**kwargs)`` and
        returning dict responses (e.g. ``boto3.client('s3')``).
    :param prefix: Only fetch keys that start with this prefix (optional).
        A non-string value such as ``None`` disables prefix filtering.
    :return: Generator of key strings.
    """

    kwargs = {'Bucket': bucket}

    # Only a string prefix can be sent to S3 and used with str.startswith;
    # anything else (e.g. None) means "no filtering" rather than a TypeError.
    filter_by_prefix = isinstance(prefix, str)
    if filter_by_prefix:
        kwargs['Prefix'] = prefix

    while True:
        resp = s3_client.list_objects_v2(**kwargs)

        contents = resp.get('Contents')
        if contents is None:
            # A response with no 'Contents' entry is treated as "nothing to list".
            print('Empty response from s3 for bucket %s with prefix %s'%(bucket,prefix))
            return

        for obj in contents:
            key = obj['Key']
            if not filter_by_prefix or key.startswith(prefix):
                yield key

        # No continuation token means this was the last page.
        token = resp.get('NextContinuationToken')
        if token is None:
            return
        kwargs['ContinuationToken'] = token
            


In [2]:
# Connect to the Dask Gateway server. No address or auth is passed, so
# dask-gateway's configured defaults are used.
gateway = Gateway()

In [3]:
# Display the gateway to confirm which address it is pointed at.
gateway

Gateway<http://10.100.222.95:8000/services/dask-gateway>

In [4]:
# Fetch the options this gateway exposes for new clusters and display them
# (the recorded output is a widget).
options = gateway.cluster_options(); options

VBox(children=(HTML(value='<h2>Cluster Options</h2>'), GridBox(children=(HTML(value="<p style='font-weight: bo…

In [5]:
# Start a new cluster and display it. new_cluster() is called without
# arguments, so the `options` object fetched earlier is not applied here.
cluster = gateway.new_cluster(); cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [6]:
# Snapshot of the clusters the gateway currently reports. A later cleanup
# cell reuses `clusters[0].name` to reconnect by name.
clusters = gateway.list_clusters(); clusters

[ClusterReport<name=daskhub.2ac95993865040b1a152cbe59787f92f, status=RUNNING>]

In [7]:
# Build the scheduler URI from the cluster created above. Using `cluster.name`
# (e.g. 'daskhub.<id>') avoids a second list_clusters() round trip and avoids
# picking an unrelated cluster when more than one is running.
gateway_uri = "gateway://traefik-dhub-dask-gateway.daskhub:80/" + cluster.name; gateway_uri

'gateway://traefik-dhub-dask-gateway.daskhub:80/daskhub.2ac95993865040b1a152cbe59787f92f'

In [8]:
# Connect a distributed Client to the new cluster. The variant below, which
# passes the hand-built URI, is left disabled.
# client = cluster.get_client(gateway_uri);
# NOTE(review): the recorded output warns that the client has numpy 1.19.2
# while the scheduler and workers have 1.19.1.
client = cluster.get_client();


+---------+--------+-----------+---------+
| Package | client | scheduler | workers |
+---------+--------+-----------+---------+
| numpy   | 1.19.2 | 1.19.1    | 1.19.1  |
+---------+--------+-----------+---------+


In [9]:
# Display the client summary (scheduler address, dashboard path, and the
# worker / core / memory totals).
client

0,1
Client  Scheduler: gateway://traefik-dhub-dask-gateway.daskhub:80/daskhub.2ac95993865040b1a152cbe59787f92f  Dashboard: /services/dask-gateway/clusters/daskhub.2ac95993865040b1a152cbe59787f92f/status,Cluster  Workers: 2  Cores: 4  Memory: 8.59 GB


In [10]:
# Smoke test: build a chunked random array and compute its mean.
# NOTE(review): presumably this runs on the gateway cluster because a
# distributed Client is active — confirm on the dashboard.
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
import dask.array as da
# 1000x1000 normally distributed samples in 50x50 chunks. No seed is set, so
# the printed mean differs from run to run.
a = da.random.normal(size=(1000,1000),chunks=(50,50))
a.mean().compute()

-0.0004546136669163475

In [11]:
# Map each worker address to its thread count. slam() sums these values to
# decide how many shards to create.
client.nthreads()

{'tls://192.168.18.45:41611': 2, 'tls://192.168.29.137:33389': 2}

In [12]:
# Dask helpers
def slam(client,action,data,partition_factor=1.5,dbg=0):
    """
    Split ``data`` into contiguous shards, scatter them and map ``action`` over them.

    The number of shards is ``int(partition_factor * total_threads)``, with a
    minimum of one, where ``total_threads`` is the sum of
    ``client.nthreads().values()``.

    :param client: Object exposing ``nthreads()``, ``scatter(list)`` and
        ``map(func, iterable)`` (e.g. a ``dask.distributed.Client``).
    :param action: Callable applied to each shard.
    :param data: Sliceable sequence supporting ``len()`` (list, numpy array, ...).
    :param partition_factor: Shards per worker thread; must be positive.
    :param dbg: When greater than 0, print sharding diagnostics.
    :return: Whatever ``client.map`` returns for the scattered shards.
    :raises ValueError: If no worker threads are reported or
        ``partition_factor`` is not positive.
    """
    n_threads = sum(client.nthreads().values())
    if dbg>0: # a
        print('slam: np = %i'%n_threads)

    # Fail early with a clear message rather than an IndexError on an empty
    # bounds list further down.
    if n_threads <= 0:
        raise ValueError('slam: the cluster reports no worker threads; scale it up before submitting work')
    if partition_factor <= 0:
        raise ValueError('slam: partition_factor must be positive, got %r'%(partition_factor,))

    # Always create at least one shard, even when partition_factor*n_threads < 1.
    n_shards = max(1, int(partition_factor*n_threads))
    shard_bounds = [int(i*len(data)/(1.0*partition_factor*n_threads)) for i in range(n_shards)]

    # The computed bounds are shard start offsets; close the last shard at len(data).
    if shard_bounds[-1] != len(data):
        if dbg>0: # b
            print('a sb[-1]: ',shard_bounds[-1],len(data))
        shard_bounds = shard_bounds + [len(data)]
        if dbg>0: # c
            print('sb: ',shard_bounds)

    data_shards = [data[shard_bounds[i]:shard_bounds[i+1]] for i in range(len(shard_bounds)-1)]

    if dbg>0: # d
        print('ds len:        ',len(data_shards))
        if data_shards:
            first_shard = data_shards[0]
            print('ds item len:   ',len(first_shard))
            print('ds type:       ',type(first_shard))
            # Only array-like shards carry a dtype; plain lists are skipped.
            if hasattr(first_shard, 'dtype'):
                print('ds dtype:      ',first_shard.dtype)

    big_future = client.scatter(data_shards)
    results    = client.map(action,big_future)
    return results


In [64]:
def action(data_shard):
    """
    Worker-side probe: pair every element of a shard with the first S3 keys visible to the worker.

    The ``if False:`` branches are disabled diagnostics (environment, working
    directory, ``.bashrc`` contents, gateway environment variables) kept so
    they can be switched on by hand.

    :param data_shard: Iterable of items belonging to one shard.
    :return: List of ``(item, keys)`` tuples, where ``keys`` holds up to the
        first 10 keys of the ``daskhub-data`` bucket.
    """
    from itertools import islice

    # Listed once per shard, on first use: the result does not depend on the
    # element, and an empty shard should not touch S3 at all.
    first_keys = None

    ret = []
    for d in data_shard:
        if False:
            ret.append((d,'touched',os.getcwd(),os.environ))
        if False:
            ret.append((d,'touched',os.getcwd(),os.listdir()))
        if False:
            filename='.bashrc'
            with open(filename, 'r') as file:
                data = file.read()
            ret.append((d,'read .bashrc',data))
        if False:
            ret.append((d,os.environ['DASK_PARENT'],os.environ['DASK_GATEWAY_WORKER_NAME']))
        if True:
            if first_keys is None:
                s3_client = boto3.client('s3')
                # islice stops after 10 keys instead of listing the whole bucket.
                first_keys = list(islice(get_s3_keys('daskhub-data',s3_client,prefix=''), 10))
            # list.append takes exactly one argument, so the pair must be a tuple.
            ret.append((d,list(first_keys)))

    return ret

In [67]:
# Local S3 access check. The handle is named `s3_client` so it does not
# overwrite `client`, the Dask client that slam() and client.gather() need.
s3_client = boto3.client('s3')
keys = get_s3_keys('daskhub-data',s3_client,prefix='')
list(keys)[0:10]

['GESDISC/MERRA2/2019/MERRA2_400.tavg1_2d_slv_Nx.20191203.nc4',
 'GESDISC/MERRA2/2019/MERRA2_400.tavg1_2d_slv_Nx.20191222.nc4',
 'GESDISC/MERRA2/2019/MERRA2_400.tavg1_2d_slv_Nx.20191226.nc4',
 'MODAPS/CATALOGS/MOD05_L2/2019/356/catalogue.sqlite',
 'MODAPS/MOD05/MOD05_L2.A2019336.2315.061.2019337071952.hdf',
 'MODAPS/MOD05/MOD05_L2.A2019336.2315.061.2019337071952_stare.nc',
 'MODAPS/MOD05/MOD05_L2.A2019336.2320.061.2019337072008.hdf',
 'MODAPS/MOD05/MOD05_L2.A2019336.2320.061.2019337072008_stare.nc',
 'MODAPS/MOD05/MOD05_L2.A2019336.2325.061.2019337072403.hdf',
 'MODAPS/MOD05/MOD05_L2.A2019336.2325.061.2019337072403_stare.nc']

In [16]:
# Toy payload: the integers 0..99, passed to slam() below.
data = numpy.arange(100)

In [65]:
# Shard `data`, scatter the shards and map `action` over them. `client` must
# be the Dask client returned by cluster.get_client().
results = slam(client,action,data)

In [19]:
# Display the futures returned by slam() together with their status.
results

[<Future: finished, type: builtins.list, key: action-d13456636380c5d7ba81d76d14f39c37>,
 <Future: finished, type: builtins.list, key: action-71398c698a1ad727d747b45a59265fd1>,
 <Future: finished, type: builtins.list, key: action-d4c21e1420827d18150b623dfe4ea004>,
 <Future: finished, type: builtins.list, key: action-a8b264b2bd70f3700d6df5cb54c8b6dd>,
 <Future: finished, type: builtins.list, key: action-88bd98a497933c627955d0ed91b51252>,
 <Future: finished, type: builtins.list, key: action-fba7c6da2cb328a41e19e37bd44743e4>]

In [66]:
# Collect the result of every future back into the notebook.
# In the recorded run this raised ClientError (AccessDenied on ListObjectsV2).
# NOTE(review): presumably raised inside action() on a worker, i.e. the
# workers lack the S3 permissions the notebook kernel has — confirm.
client.gather(results)
# print(results[0].result()[0][2])

ClientError: An error occurred (AccessDenied) when calling the ListObjectsV2 operation: Access Denied

In [12]:
# Shut the cluster down once the experiment is finished.
cluster.shutdown()

In [14]:
# Close the local cluster object after the shutdown above.
cluster.close()

In [15]:
# Check what the gateway still reports; the recorded output is an empty list.
gateway.list_clusters()

[]

In [None]:
# Re-attach to a cluster by name (never executed in the recorded run).
# NOTE(review): `clusters` is the snapshot taken right after cluster creation
# and may be stale or empty by now — consider refreshing it with
# gateway.list_clusters() first.
cluster = gateway.connect(clusters[0].name)

In [None]:
# Shut down and close whichever cluster `cluster` currently refers to.
cluster.shutdown(); cluster.close()

In [14]:
# Close the gateway object now that the clusters have been dealt with.
gateway.close()

In [16]:
# Final check that no clusters remain; the recorded output is an empty list.
# NOTE(review): in top-to-bottom order this runs after gateway.close() —
# confirm the gateway is still usable here, or move this check above the close.
gateway.list_clusters()

[]

In [None]:
# Display the gateway object (never executed in the recorded run).
gateway