In [1]:
%pylab inline
import boto3
import io
import time
import struct

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Module-level S3 client, used for the upload and by get_row_slow.
client = boto3.client('s3')

In [5]:
# Build a 4096 x 4096 standard-normal test matrix and serialize it with np.save
# into an in-memory buffer so the exact .npy bytes can be uploaded.
# NOTE(review): no random seed is set, so the matrix differs on every run.
x = np.random.randn(4096,4096)
bio = io.BytesIO()
np.save(bio, x)

In [6]:
# Upload the serialized array; the row readers fetch it back from this bucket/key.
client.put_object(Bucket="pictureweb", Key="s3_range_test/test", Body=bio.getvalue())

{'ETag': '"191f93c9c0c3da637b5415d489f7d00b"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Thu, 02 Aug 2018 14:35:33 GMT',
   'etag': '"191f93c9c0c3da637b5415d489f7d00b"',
   'server': 'AmazonS3',
   'x-amz-id-2': 'Iu4c2GY6lS08HRfk1ckzeeoNknPyCexNgKjxE4QVPREcMRGMWvkZaS1mj0RGayaS5gJGHitGX3U=',
   'x-amz-request-id': '73F1C621419D5C29'},
  'HTTPStatusCode': 200,
  'HostId': 'Iu4c2GY6lS08HRfk1ckzeeoNknPyCexNgKjxE4QVPREcMRGMWvkZaS1mj0RGayaS5gJGHitGX3U=',
  'RequestId': '73F1C621419D5C29',
  'RetryAttempts': 0}}

In [7]:
def get_row_slow(row_idx):
    """Return row ``row_idx`` by downloading and decoding the entire object.

    Baseline implementation: the full stored array is fetched with the
    module-level ``client``, loaded with ``np.load``, and then indexed.

    Args:
        row_idx: Index of the row to return.

    Returns:
        The selected row of the decoded array.
    """
    response = client.get_object(Bucket="pictureweb", Key="s3_range_test/test")
    payload = response["Body"].read()
    full_matrix = np.load(io.BytesIO(payload))
    return full_matrix[row_idx, :]

In [8]:
# Row index passed to get_row_slow in the timing cell below.
i = 0

In [9]:
# Wall-clock time (seconds) for one get_row_slow call: full download plus row slice.
t = time.time()
row = get_row_slow(i)
e = time.time()
runtime_slow = e - t

In [10]:
# Size in bytes of one element of x (taken from the array's dtype).
item_size = x.dtype.itemsize

In [11]:
# Byte-layout constants for the header of the uploaded .npy object.
# HEADER_LEN_START..HEADER_LEN_END is the inclusive byte range requested to read
# the header-length field; HEADER_START is the fixed offset added in front of the
# header length when locating the array data (see get_row_fast).
# MAGIC_LEN, VERSION and HEADER_LEN_SIZE are not referenced by the visible cells.
# NOTE(review): the values look like NPY format version 1.0 (6-byte magic,
# 2-byte version, 2-byte header length) -- confirm against the NumPy format spec.
MAGIC_LEN = 6
VERSION = 1
HEADER_LEN_SIZE = 2
HEADER_LEN_START = 8
HEADER_LEN_END = 9
HEADER_START = 10


In [12]:
def get_item_fast(row_idx, col_idx, num_columns, dtype):
    """Return a single element of the stored matrix via a ranged row read.

    Fetches row ``row_idx`` with ``get_row_fast`` and indexes it at ``col_idx``.

    Args:
        row_idx: Row index of the element.
        col_idx: Column index of the element within the row.
        num_columns: Number of columns in the stored matrix.
        dtype: Element dtype of the stored matrix.

    Returns:
        The element at ``(row_idx, col_idx)``.
    """
    # The original computed this value but never returned it, so every call
    # yielded None.
    return get_row_fast(row_idx, num_columns, dtype)[col_idx]
   



In [88]:
def get_row_fast(row_idx, num_columns, dtype):
    """Fetch one row of the stored .npy matrix using S3 byte-range requests.

    Two ranged GETs are issued: the first reads the header-length field of the
    .npy file, the second reads exactly the bytes of the requested row.

    Args:
        row_idx: Index of the row to fetch.
        num_columns: Number of columns in the stored matrix.
        dtype: Element dtype of the stored matrix; determines both the row's
            byte width and how the returned bytes are decoded.

    Returns:
        A 1-D array decoded from the row's bytes with ``np.frombuffer``.

    Raises:
        struct.error: If the header-length request does not return 2 bytes.
    """
    # A fresh client is created on every call (the module-level one is not used).
    client = boto3.client('s3')

    # Read the header-length field and decode it instead of assuming a fixed
    # header size; "<H" is a little-endian unsigned 16-bit integer.
    # NOTE(review): assumes an NPY version 1.x file (2-byte header length) and
    # row-major (C-order) data -- confirm for arrays saved differently.
    range_query = 'bytes={0}-{1}'.format(HEADER_LEN_START, HEADER_LEN_END)
    header_len_bytes = client.get_object(Bucket="pictureweb", Key="s3_range_test/test", Range=range_query)["Body"].read()
    header_bytes = struct.unpack("<H", header_len_bytes)[0]

    # Derive the row width from the dtype argument rather than a notebook
    # global, so the offsets stay correct for any element type.
    row_nbytes = np.dtype(dtype).itemsize * num_columns
    row_start = row_idx * row_nbytes
    row_end = row_start + row_nbytes - 1  # HTTP Range end offsets are inclusive

    data_start = HEADER_START + header_bytes
    query_start = data_start + row_start
    query_end = data_start + row_end
    range_query = 'bytes={0}-{1}'.format(query_start, query_end)
    row_bytes = client.get_object(Bucket="pictureweb", Key="s3_range_test/test", Range=range_query)["Body"].read()
    return np.frombuffer(row_bytes, dtype=dtype)
    






In [14]:
%timeit a = get_row_fast(8, x.shape[1], x.dtype)

30.5 ms ± 4.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
%timeit b = get_row_slow(8)

1.44 s ± 381 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%time a = get_item_fast(8, 8, x.shape[1], x.dtype)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 32.3 ms


In [18]:
import concurrent.futures as fs


In [100]:
def get_rows(rows):
    """Fetch the given rows concurrently and report how long it took.

    Each row is fetched with ``get_row_fast`` on a pool of up to 128 threads,
    using the column count and dtype of the notebook-level array ``x``. The
    fetched rows are stacked so that assembly cost is part of the measurement,
    but only the elapsed time is returned.

    Args:
        rows: Iterable of row indices to fetch.

    Returns:
        Elapsed wall-clock time in seconds (float).
    """
    t = time.time()
    # The context manager waits for all submitted work and then releases the
    # worker threads; the original never shut the executor down.
    with fs.ThreadPoolExecutor(128) as executor:
        # Pass the row index as an argument. A bare ``lambda: ...(i, ...)``
        # looks ``i`` up when the worker runs, by which time the loop may have
        # advanced, so workers could fetch the wrong (duplicate) rows.
        futures = [
            executor.submit(get_row_fast, i, x.shape[1], x.dtype)
            for i in rows
        ]
    if futures:
        # result() re-raises any worker exception; vstack rejects an empty list,
        # hence the guard for empty input.
        np.vstack([f.result() for f in futures])
    e = time.time()
    return e - t
    

In [101]:
%time get_rows(range(4096))

CPU times: user 1min 30s, sys: 3.53 s, total: 1min 34s
Wall time: 1min


60.422529220581055

In [65]:
import pywren

In [66]:
pwex  = pywren.default_executor()

In [70]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of length ``n``.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)


In [94]:
# Split the 4096 row indices into consecutive chunks of 16 (256 chunks in total).
lst = list(chunks(range(4096), 16))

In [95]:
# Map get_rows over the row-index chunks (one remote call per chunk). The
# original passed the builtin ``range`` type here instead of the chunk list.
futures = pwex.map(get_rows, lst)

['/home/ubuntu/anaconda3/lib/python3.6/site-packages/colorama', '/home/ubuntu/anaconda3/lib/python3.6/site-packages/tornado', '/home/ubuntu/pywren/pywren', '/home/ubuntu/anaconda3/lib/python3.6/site-packages/watchtower']


In [96]:
pywren.wait(futures)

([<pywren.future.ResponseFuture at 0x7f71645bb048>,
  <pywren.future.ResponseFuture at 0x7f71645d9780>,
  <pywren.future.ResponseFuture at 0x7f71444f5128>,
  <pywren.future.ResponseFuture at 0x7f71443a0ef0>,
  <pywren.future.ResponseFuture at 0x7f70a46a7ef0>,
  <pywren.future.ResponseFuture at 0x7f71645d1a20>,
  <pywren.future.ResponseFuture at 0x7f71645eb828>,
  <pywren.future.ResponseFuture at 0x7f716441b0b8>,
  <pywren.future.ResponseFuture at 0x7f71644a6470>,
  <pywren.future.ResponseFuture at 0x7f71c99c3ba8>,
  <pywren.future.ResponseFuture at 0x7f716442cba8>,
  <pywren.future.ResponseFuture at 0x7f71444f5780>,
  <pywren.future.ResponseFuture at 0x7f71645601d0>,
  <pywren.future.ResponseFuture at 0x7f70a46a7470>,
  <pywren.future.ResponseFuture at 0x7f716451ff60>,
  <pywren.future.ResponseFuture at 0x7f71444f54e0>,
  <pywren.future.ResponseFuture at 0x7f71244104a8>,
  <pywren.future.ResponseFuture at 0x7f716441ddd8>,
  <pywren.future.ResponseFuture at 0x7f716441bc18>,
  <pywren.fu

In [98]:
[f.result() for f in futures]

[0.32804298400878906,
 0.3072659969329834,
 0.277820348739624,
 0.28248047828674316,
 0.44243383407592773,
 0.44734978675842285,
 0.3330690860748291,
 0.306105375289917,
 0.34697723388671875,
 0.33138489723205566,
 0.5544228553771973,
 0.34020113945007324,
 0.3995974063873291,
 0.31685447692871094,
 0.46636366844177246,
 0.30583930015563965,
 0.2819051742553711,
 0.32329750061035156,
 0.36934471130371094,
 0.3593568801879883,
 0.32886791229248047,
 0.48513078689575195,
 0.3755826950073242,
 0.30182456970214844,
 0.5377445220947266,
 0.3144850730895996,
 0.37277865409851074,
 0.3159608840942383,
 0.35961389541625977,
 0.30127549171447754,
 0.3126528263092041,
 0.29050421714782715,
 0.32579588890075684,
 0.39255666732788086,
 0.7942557334899902,
 0.3411600589752197,
 0.28508687019348145,
 1.642347812652588,
 0.2624175548553467,
 0.38858890533447266,
 0.3413522243499756,
 0.5189940929412842,
 0.26237940788269043,
 0.6211884021759033,
 0.31998300552368164,
 0.6526455879211426,
 0.304221868

In [83]:
%time get_rows(lst[0])

CPU times: user 92 ms, sys: 4 ms, total: 96 ms
Wall time: 97.5 ms


(array([[ 0.11357731,  0.21427875, -1.38108335, ...,  1.47540535,
         -1.39980345, -0.65518442],
        [ 0.15538261, -1.27996024, -1.04730028, ...,  0.63813038,
         -1.85526051,  0.2404578 ],
        [-1.40400004,  2.00847743, -0.04091358, ...,  0.66809395,
          0.32882512,  1.25866877],
        ...,
        [-0.94636264,  0.55736169, -0.44246253, ..., -1.04592457,
         -2.04484424,  0.16158721],
        [-0.09611194, -1.04524684,  0.76906967, ..., -0.28674769,
          1.02876332,  0.90276076],
        [-1.67893734,  0.44290122,  0.53214168, ...,  0.19703435,
          0.92653744,  0.4308833 ]]), 0.09736990928649902)

In [84]:
len(lst)

256