In [4]:
import boto3
import botocore
import pandas as pd
from IPython.display import display, Markdown

In [5]:
# Low-level S3 client; the functions in this notebook use it to create buckets,
# list buckets and generate presigned URLs.
s3 = boto3.client('s3')
# Resource-style S3 handle; used in this notebook to iterate over a bucket's objects.
s3_resource = boto3.resource('s3')

In [6]:
def create_bucket(bucket):
    """Try to create the S3 bucket named ``bucket`` and report the outcome.

    Uses the module-level ``s3`` client. A ``botocore`` ``ClientError`` is
    logged at ERROR level and turned into a failure message instead of being
    propagated; any other exception is left to bubble up.

    Args:
        bucket: Name of the bucket to create.

    Returns:
        A status sentence: either that the bucket was created / already
        exists, or that it could not be created.
    """
    import logging

    # NOTE(review): no CreateBucketConfiguration is passed, so this presumably
    # relies on the client's default region accepting that - confirm.
    try:
        s3.create_bucket(Bucket=bucket)
    except botocore.exceptions.ClientError as err:
        logging.error(err)
        status = ''.join(['Bucket ', bucket, ' could not be created.'])
    else:
        status = ''.join(['Created or already exists ', bucket, ' bucket.'])
    return status

In [7]:
# Create the bucket for this notebook; the returned status string is shown as the cell output.
create_bucket('open-data-analytics-noaa-global')

'Created or already exists open-data-analytics-noaa-global bucket.'

In [8]:
def list_buckets(match=''):
    """Print the names of the S3 buckets visible to the module-level ``s3`` client.

    Args:
        match: Optional substring filter. When non-empty, only buckets whose
            name contains ``match`` are printed. When empty (the default),
            every bucket is printed.

    Returns:
        None. The header line and the bucket names are written to stdout.
    """
    response = s3.list_buckets()
    if match:
        print(f'Existing buckets containing "{match}" string:')
    else:
        print('All existing buckets:')
    for bucket in response['Buckets']:
        name = bucket['Name']
        # An empty ``match`` means "no filter"; previously nothing was listed
        # in that case even though the "All existing buckets:" header printed.
        if not match or match in name:
            print(f'  {name}')

In [9]:
# Show only the buckets whose name contains "open".
list_buckets(match='open')

Existing buckets containing "open" string:
  open-data-analytics-beat-aml
  open-data-analytics-nasa-nex
  open-data-analytics-new-afsis
  open-data-analytics-noaa-global
  open-data-analytics-nyc-tlc
  open-data-analytics-open-aq
  open-data-analytics-open-aq-fetches


In [10]:
def list_bucket_contents(bucket, match='', size_mb=0):
    """Print the objects of an S3 bucket, optionally filtered, plus size totals.

    Every object in the bucket is visited through the module-level
    ``s3_resource``; filtering happens client-side, so the whole bucket is
    always enumerated.

    Args:
        bucket: Name of the bucket to inspect.
        match: Optional substring filter applied to the full object key.
            Empty (default) disables the name filter.
        size_mb: Optional upper bound on object size in megabytes (1 MB =
            1024 * 1024 bytes). ``0`` (default) disables the size filter.

    Returns:
        None. One line per listed object is written to stdout, followed by a
        summary of the filtered subset (when any filter is active) and a
        summary of the whole bucket.
    """
    bucket_resource = s3_resource.Bucket(bucket)
    # Sizes are accumulated in MB and only converted to GB when printed.
    total_size_mb = 0
    total_files = 0
    match_size_mb = 0
    match_files = 0
    for obj in bucket_resource.objects.all():
        obj_size_mb = obj.size/1024/1024
        total_size_mb += obj_size_mb
        total_files += 1

        name_ok = not match or match in obj.key
        size_ok = not size_mb or obj_size_mb <= size_mb
        if name_ok and size_ok:
            match_files += 1
            match_size_mb += obj_size_mb
            print(f'{obj.key} ({obj_size_mb:3.0f}MB)')

    # Report the filtered subset whenever a filter narrowed the listing; a
    # size-only filter previously produced no subset summary at all.
    if match or size_mb:
        print(f'Matched file size is {match_size_mb/1024:3.1f}GB with {match_files} files')

    print(f'Bucket {bucket} total size is {total_size_mb/1024:3.1f}GB with {total_files} files')

In [11]:
# List objects of at most 250 MB whose key contains "2019". The match is a plain
# substring test on the whole key, so file names containing "2019" under other
# year prefixes are listed too.
list_bucket_contents(bucket='noaa-global-hourly-pds', match='2019', size_mb=250)

1932/22019099999.csv (  0MB)
1933/22019099999.csv (  0MB)
1934/22019099999.csv (  0MB)
1935/22019099999.csv (  0MB)
1936/22019099999.csv (  0MB)
1941/78520199999.csv (  1MB)
1942/78520199999.csv (  2MB)
1943/78520199999.csv (  2MB)
1944/78520199999.csv (  2MB)
1945/78520199999.csv (  2MB)
1949/62019099999.csv (  0MB)
1950/62019099999.csv (  0MB)
1951/62019099999.csv (  0MB)
1952/62019099999.csv (  0MB)
1953/62019099999.csv (  0MB)
1954/62019099999.csv (  0MB)
1955/20199099999.csv (  0MB)
1955/62019099999.csv (  0MB)
1955/71820199999.csv (  0MB)
1955/71920199999.csv (  0MB)
1956/20199099999.csv (  0MB)
1956/62019099999.csv (  1MB)
1956/71820199999.csv (  0MB)
1956/71920199999.csv (  0MB)
1957/20199099999.csv (  0MB)
1957/62019099999.csv (  1MB)
1957/71820199999.csv (  0MB)
1957/71920199999.csv (  0MB)
1958/20199099999.csv (  0MB)
1958/62019099999.csv (  1MB)
1958/71820199999.csv (  0MB)
1958/71920199999.csv (  0MB)
1959/20199099999.csv (  0MB)
1959/22019099999.csv (  0MB)
1959/620190999

In [12]:
def preview_csv_dataset(bucket, key, rows=10):
    """Load the first ``rows`` rows of a CSV object stored in S3.

    A presigned ``get_object`` URL is generated with the module-level ``s3``
    client and passed to ``pandas.read_csv``.

    Args:
        bucket: Name of the bucket holding the object.
        key: Key of the CSV object inside the bucket.
        rows: Maximum number of data rows to read (default 10).

    Returns:
        A ``pandas.DataFrame`` with at most ``rows`` rows.
    """
    # Presign a GET for the object so pandas can read it straight from the URL.
    presigned_url = s3.generate_presigned_url(
        ClientMethod='get_object',
        Params={'Bucket': bucket, 'Key': key},
    )
    return pd.read_csv(presigned_url, nrows=rows)

In [13]:
# Read up to 100 rows of a single CSV object under the 2019/ prefix.
df = preview_csv_dataset(bucket='noaa-global-hourly-pds', key='2019/99999996409.csv', rows=100)

In [14]:
# Display the first rows of the preview as the cell output.
df.head()

Unnamed: 0,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,...,KB2,KB3,KC1,KC2,KD1,KD2,KE1,KF1,OB1,EQD
0,99999996409,2019-01-01T00:00:00,I,68.6483,-149.3986,750.1,"TOOLIK LAKE 5 ENE, AK US",CRN05,99999,V020,...,,,,,,,,-541.0,0600105109999000103109999990,
1,99999996409,2019-01-01T00:05:00,I,68.6483,-149.3986,750.1,"TOOLIK LAKE 5 ENE, AK US",CRN05,99999,V020,...,,,,,,,,,,
2,99999996409,2019-01-01T00:10:00,I,68.6483,-149.3986,750.1,"TOOLIK LAKE 5 ENE, AK US",CRN05,99999,V020,...,,,,,,,,,,
3,99999996409,2019-01-01T00:15:00,I,68.6483,-149.3986,750.1,"TOOLIK LAKE 5 ENE, AK US",CRN05,99999,V020,...,,,,,,,,,,
4,99999996409,2019-01-01T00:20:00,I,68.6483,-149.3986,750.1,"TOOLIK LAKE 5 ENE, AK US",CRN05,99999,V020,...,,,,,,,,,,
