## Using Boto to connect with S3

This notebook demonstrates how to use Boto3, the AWS SDK for Python, to interact with Amazon S3.

Here are pointers to the boto documentation:
* [API Document](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html)
* [Examples](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-examples.html)

In [11]:
# List the local ~/VaultDSC291/ directory, which holds the creds.yaml file read in the next cell.
!ls ~/VaultDSC291/
# `time` is used by later cells to timestamp S3 reads, downloads and uploads.
from time import time
# Display the notebook's current working directory.
%pwd

credentials.sh creds.yaml~    newCreds.txt
creds.yaml     neighbors.txt  newCreds.txt~


'/Users/yoavfreund/academic.papers/Courses/BigDataAnalytics/DSC291_2020/BigData-Spark-private/notebooks/Section1-Basics/0.MemoryLatency'

In [12]:
# Read AWS credentials from a YAML file that is kept outside the repository.
import os
import yaml

# Resolve the vault relative to the current user's home directory so this cell
# runs unchanged on every machine (e.g. /Users/<name>/... on macOS or
# /home/ubuntu/... on EC2) instead of hand-editing an absolute path.
yaml_file = os.path.expanduser('~/VaultDSC291/creds.yaml')
with open(yaml_file, 'rb') as yml:
    # safe_load only constructs plain YAML types (mappings, lists, scalars),
    # which is all a credentials file needs; it never instantiates Python
    # objects from tags the way the more permissive loaders can.
    all_creds = yaml.safe_load(yml)

# Keep only the credential set stored under this profile name.
creds = all_creds['Yoav-DSC291']

# Display the key names only -- never echo the secret values into the output.
creds.keys()

dict_keys(['access_key_id', 'secret_access_key', 'email_address', 'key_name', 'ssh_key'])

In [13]:
import boto3

# Collect the authentication arguments once, taken from the `creds` mapping
# loaded from the YAML file, then hand them to boto3 as keyword arguments.
aws_auth = {
    'aws_access_key_id': creds['access_key_id'],
    'aws_secret_access_key': creds['secret_access_key'],
}

# `s3` is the high-level S3 resource used by the cells below
# (s3.buckets, s3.Bucket, s3.Object).
s3 = boto3.resource('s3', **aws_auth)

## List all buckets

In [14]:
# Fetch the collection of buckets visible to these credentials and print one
# line per bucket.
all_buckets = s3.buckets.all()
for bucket in all_buckets:
    print(bucket)

s3.Bucket(name='aws-logs-891063097137-us-west-2')
s3.Bucket(name='dsc291-blocks')
s3.Bucket(name='yoavfreundtest')


## List files in a bucket (first 20 objects)

In [15]:
bucket_name = "yoavfreundtest"
bucket = s3.Bucket(bucket_name)

# Materialise at most 20 object summaries once, then print each of them.
first_objects = list(bucket.objects.limit(20))
for summary in first_objects:
    print(summary)

# Keep only the keys that contain 'scratch'; the bare name on the last line
# displays the resulting list as the cell output.
keys = [summary.key for summary in first_objects if 'scratch' in summary.key]
keys

s3.ObjectSummary(bucket_name='yoavfreundtest', key='measurement_logs/BlockData10')
s3.ObjectSummary(bucket_name='yoavfreundtest', key='measurement_logs/BlockData1000')
s3.ObjectSummary(bucket_name='yoavfreundtest', key='measurement_logs/BlockData1000000')
s3.ObjectSummary(bucket_name='yoavfreundtest', key='measurement_logs/stats.pkl')
s3.ObjectSummary(bucket_name='yoavfreundtest', key='scratch/measurement_logs/BlockData1000')
s3.ObjectSummary(bucket_name='yoavfreundtest', key='scratch/measurement_logs/BlockData1000000')
s3.ObjectSummary(bucket_name='yoavfreundtest', key='scratch/measurement_logs/BlockData1000000000')
s3.ObjectSummary(bucket_name='yoavfreundtest', key='test_block')


['scratch/measurement_logs/BlockData1000',
 'scratch/measurement_logs/BlockData1000000',
 'scratch/measurement_logs/BlockData1000000000']

In [16]:
# (object key, length) pairs used by the timing cells below; the download cell
# divides elapsed time by this length to report time per byte.
# NOTE(review): the lengths are presumably the object sizes in bytes, entered
# by hand -- confirm against the bucket.
listed_lengths = {
    'scratch/measurement_logs/BlockData1000': 2000,
    'scratch/measurement_logs/BlockData1000000': 1000006,
    'scratch/measurement_logs/BlockData1000000000': 1000000000,
}
# dicts preserve insertion order, so this yields the tuples in the order above.
pairs = list(listed_lengths.items())
pairs

[('scratch/measurement_logs/BlockData1000', 2000),
 ('scratch/measurement_logs/BlockData1000000', 1000006),
 ('scratch/measurement_logs/BlockData1000000000', 1000000000)]

## Read an object to memory

In [23]:
#%%timeit
# Time how long it takes to pull each of the first two objects into memory.
# The length listed in `pairs` is deliberately ignored here: the size is
# measured from the bytes actually received, and the loop variable is no
# longer overwritten inside the loop body.
for key, _listed_length in pairs[:2]:
    t0 = time()
    obj = s3.Object(bucket_name, key)
    raw_body = obj.get()['Body'].read()      # the object's content as bytes
    obj_in_mem = raw_body.decode('utf-8')    # decoding stays inside the timed span
    t1 = time()

    # Count bytes, not decoded characters: the two differ as soon as the
    # object contains any non-ASCII text, and the report says "per byte".
    n_bytes = len(raw_body)
    elapsed = t1 - t0
    # An empty object has no meaningful per-byte rate; report nan instead of
    # raising ZeroDivisionError.
    per_byte = elapsed / n_bytes if n_bytes else float('nan')
    print('%s, len=%d,time=%.3g, time per byte=%.3g' % (key, n_bytes, elapsed, per_byte))

scratch/measurement_logs/BlockData1000, len=2000,time=1.14, time per byte=0.000569
scratch/measurement_logs/BlockData1000000, len=1000006,time=1.5, time per byte=1.5e-06


[('scratch/measurement_logs/BlockData1000', 2000),
 ('scratch/measurement_logs/BlockData1000000', 1000006),
 ('scratch/measurement_logs/BlockData1000000000', 1000000000)]

4000014

## Download an object to a local file on disk

In [18]:
import os

In [24]:
#%%timeit
# Download each of the first two objects to the same local file (every
# download overwrites the previous one) and report the elapsed time, using the
# length listed in `pairs` for the per-byte figure.
local_target = os.path.expanduser("~/block")
for key, length in pairs[:2]:
    started = time()
    obj = s3.Object(bucket_name, key)
    obj.download_file(local_target)
    elapsed = time() - started
    print('%s, len=%d,time=%.3g, time per byte=%.3g' % (key, length, elapsed, elapsed / length))

scratch/measurement_logs/BlockData1000, len=2000,time=0.324, time per byte=0.000162
scratch/measurement_logs/BlockData1000000, len=1000006,time=0.441, time per byte=4.41e-07


In [25]:
# Show the size and timestamp of ~/block, the local file written by the download cell.
!ls -lh ~/block

-rw-r--r--  1 yoavfreund  staff   977K Apr  7 12:19 /Users/yoavfreund/block


## Upload file from local disk to bucket

In [31]:
!mkdir /tmp/measurement_logs/
!ln -s /tmp scratch

mkdir: /tmp/measurement_logs/: File exists


In [32]:
# Capture the shell listing of scratch/measurement_logs/ as a list of output lines.
filenames=!ls -l scratch/measurement_logs/
# Display the captured lines as the cell output.
filenames

[]

In [31]:
#%%timeit
from glob import glob
from time import time

# Use a dedicated name for the low-level client. Rebinding `s3` here would
# replace the resource object that the earlier cells rely on (s3.buckets,
# s3.Bucket, s3.Object), so re-running any of them after this cell would fail.
s3_client = boto3.client('s3',
    aws_access_key_id=creds['access_key_id'],
    aws_secret_access_key=creds['secret_access_key'])

# Upload every file under scratch/measurement_logs/ and time each transfer.
for local_path in glob('scratch/measurement_logs/*'):
    print(local_path)
    t0=time()
    with open(local_path, "rb") as f:
        # The relative local path doubles as the object key, so the upload
        # lands under the 'scratch/measurement_logs/' prefix in the bucket.
        s3_client.upload_fileobj(f, bucket_name, local_path)
    t1=time()
    print('%s total time=%f'%(local_path,t1-t0))

scratch/measurement_logs/BlockData1000000000
scratch/measurement_logs/BlockData1000000000 total time=10.217659
scratch/measurement_logs/BlockData1000000
scratch/measurement_logs/BlockData1000000 total time=0.093103
scratch/measurement_logs/BlockData1000
scratch/measurement_logs/BlockData1000 total time=0.028431
scratch/measurement_logs/BlockData1000000000
scratch/measurement_logs/BlockData1000000000 total time=14.292300
scratch/measurement_logs/BlockData1000000
scratch/measurement_logs/BlockData1000000 total time=0.098198
scratch/measurement_logs/BlockData1000
scratch/measurement_logs/BlockData1000 total time=0.087480
scratch/measurement_logs/BlockData1000000000
scratch/measurement_logs/BlockData1000000000 total time=6.865798
scratch/measurement_logs/BlockData1000000
scratch/measurement_logs/BlockData1000000 total time=0.043784
scratch/measurement_logs/BlockData1000
scratch/measurement_logs/BlockData1000 total time=0.029240
scratch/measurement_logs/BlockData1000000000
scratch/measureme

In [18]:
# NOTE(review): ad-hoc ratio; what 3.8 and 9.42 measure is not recorded in this notebook -- confirm or remove.
3.8 / 9.42

FileNotFoundError: [Errno 2] No such file or directory: '/Users/yoavfreund/block'