### QTM 350 Project
### Looking at the Global NOAA dataset

##### by Karoline Lin, Rachel Shanahan, John Cox, Erin O'Reilly, Hayley Roesler
##### on April 23, 2020

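##### For our data science project, we decided to look at the NOAA ISD (Integrated Surface Database) dataset found on AWS's open data registry. This dataset looks at hourly weather observations, such as temperature and wind, recorded at stations around the world.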

In [2]:
import boto3
import botocore
import pandas as pd
from IPython.display import display, Markdown

In [3]:
# Client used by the helpers below to create/list buckets and to build presigned URLs.
s3 = boto3.client('s3')
# Resource handle used by the helpers below to iterate over the objects in a bucket.
s3_resource = boto3.resource('s3')

In [4]:
def create_bucket(bucket):
    """Create the S3 bucket ``bucket`` and describe the outcome as a string.

    The request is sent through the module-level ``s3`` client.

    Args:
        bucket: Name of the bucket to create.

    Returns:
        ``'Created or already exists <bucket> bucket.'`` when the request
        succeeds, or when S3 answers that this account already owns a bucket
        with that name. ``'Bucket <bucket> could not be created.'`` for any
        other ``ClientError``, which is also logged at ERROR level.
    """
    import logging

    try:
        s3.create_bucket(Bucket=bucket)
    except botocore.exceptions.ClientError as e:
        # S3 can report a bucket that this account already owns as an error
        # (BucketAlreadyOwnedByYou). That is the "already exists" case the
        # success message promises, so it must not be reported as a failure.
        error_code = e.response.get('Error', {}).get('Code')
        if error_code == 'BucketAlreadyOwnedByYou':
            return 'Created or already exists ' + bucket + ' bucket.'
        # Anything else (e.g. BucketAlreadyExists: the name is taken by
        # another account) is a genuine failure.
        logging.error(e)
        return 'Bucket ' + bucket + ' could not be created.'
    return 'Created or already exists ' + bucket + ' bucket.'

In [5]:
# Try to create the project bucket; the helper returns a status message string.
create_bucket('open-data-analytics-noaa-global')

ERROR:root:An error occurred (BucketAlreadyExists) when calling the CreateBucket operation: The requested bucket name is not available. The bucket namespace is shared by all users of the system. Please select a different name and try again.


'Bucket open-data-analytics-noaa-global could not be created.'

In [6]:
def list_buckets(match=''):
    """Print the names of the S3 buckets returned by ``s3.list_buckets()``.

    Args:
        match: Substring a bucket name must contain to be printed. When empty
            (the default) no filter is applied and every bucket is printed.

    Returns:
        None. The header line and the bucket names are written to stdout.
    """
    response = s3.list_buckets()
    if match:
        print(f'Existing buckets containing "{match}" string:')
    else:
        print('All existing buckets:')
    for bucket in response['Buckets']:
        # Without a filter every bucket is listed; previously the
        # "All existing buckets:" header was followed by nothing at all.
        if not match or match in bucket["Name"]:
            print(f'  {bucket["Name"]}')

In [7]:
# Show only the buckets whose name contains "open".
list_buckets(match='open')

Existing buckets containing "open" string:


In [8]:
def list_bucket_contents(bucket, match='', size_mb=0):
    """Print the objects in ``bucket`` that pass the filters, plus totals.

    Every object in the bucket is visited through the module-level
    ``s3_resource``. Sizes are the object's ``size`` divided by 1024 twice and
    are labelled MB; the summary lines divide by 1024 once more and are
    labelled GB.

    Args:
        bucket: Name of the bucket to walk.
        match: Substring an object key must contain to be listed. Empty means
            every key qualifies.
        size_mb: Largest size (in the MB figure above) an object may have to
            be listed. A falsy value disables the size filter.

    Returns:
        None. One line per listed object is printed, then a "Matched" summary
        (only when ``match`` is given), then the bucket-wide total.
    """
    listed_files = 0
    listed_mb = 0
    all_files = 0
    all_mb = 0

    for obj in s3_resource.Bucket(bucket).objects.all():
        obj_mb = obj.size / 1024 / 1024
        all_mb += obj_mb
        all_files += 1

        # Key filter first; the size comparison is only reached for keys that pass it.
        key_ok = (not match) or (match in obj.key)
        if key_ok and (not size_mb or obj_mb <= size_mb):
            listed_files += 1
            listed_mb += obj_mb
            print(f'{obj.key} ({obj_mb:3.0f}MB)')

    if match:
        print(f'Matched file size is {listed_mb/1024:3.1f}GB with {listed_files} files')

    print(f'Bucket {bucket} total size is {all_mb/1024:3.1f}GB with {all_files} files')

In [64]:
#str(list_bucket_contents(bucket='noaa-global-hourly-pds', match='2019/22019099999', size_mb=250)).split()
#The command above lists the matching files in the bucket along with each file's size; we have left it commented out for now because it prints a huge number of CSV files, from which we chose two.

In [10]:
def preview_csv_dataset(bucket, key, rows=10):
    """Read the first ``rows`` rows of a CSV object in S3 into a DataFrame.

    Args:
        bucket: Name of the bucket holding the CSV.
        key: Object key of the CSV inside ``bucket``.
        rows: Number of rows to read (passed to ``pd.read_csv`` as ``nrows``).

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    # Ask the module-level s3 client for a presigned get_object URL, then let
    # pandas read directly from that URL.
    presigned_url = s3.generate_presigned_url(
        ClientMethod='get_object',
        Params={'Bucket': bucket, 'Key': key},
    )
    return pd.read_csv(presigned_url, nrows=rows)

In [65]:
#preview_csv_dataset(bucket='noaa-global-hourly-pds', key='1930/99999996409.csv', rows=100)

In [97]:
# Starting data frame: the first 100 rows of station 62019099999 for 1950.
# Each row is tagged with its year (as a string) so that rows from different
# years can be told apart once more years are added.
df = preview_csv_dataset(
    bucket='noaa-global-hourly-pds',
    key='1950/62019099999.csv',
    rows=100,
).assign(YEAR='1950')

In [98]:
# Looking at station 62019099999
# Add the first 100 rows of roughly one year per decade to the starting frame.
# NOTE(review): 1973 is used where 1970 would be expected -- presumably there is
# no 1970 file for this station; confirm.
year_list = [1960, 1973, 1980, 1990, 2000, 2010, 2020]

# Start from the rows whose year is NOT about to be loaded, so that re-running
# this cell does not append the same years a second time. On a first run this
# is simply every row already in df.
frames = [df[~df['YEAR'].isin(list(map(str, year_list)))]]
for year in year_list:
    df_temp = preview_csv_dataset(bucket='noaa-global-hourly-pds', key=f'{year}/62019099999.csv', rows=100)
    df_temp['YEAR'] = str(year)
    frames.append(df_temp)

# Concatenate once instead of once per loop iteration. sort=False keeps the
# columns in order of appearance and avoids the pandas FutureWarning raised
# when the yearly files do not share exactly the same columns.
df = pd.concat(frames, sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [100]:
# Narrow the combined frame to the fields used to compare this station's
# readings from year to year.
columns = [
    'STATION', 'DATE', 'YEAR', 'SOURCE', 'LATITUDE',
    'LONGITUDE', 'TMP', 'ELEVATION', 'WND', 'NAME',
]
df = df.reindex(columns=columns)
df

Unnamed: 0,STATION,DATE,YEAR,SOURCE,LATITUDE,LONGITUDE,TMP,ELEVATION,WND,NAME
0,62019099999,1950-01-03T12:00:00,1950,4,31.2,16.583333,+01781,14.0,"270,1,N,0026,1","SIRTE, LY"
1,62019099999,1950-01-04T12:00:00,1950,4,31.2,16.583333,+01671,14.0,"290,1,N,0067,1","SIRTE, LY"
2,62019099999,1950-01-06T12:00:00,1950,4,31.2,16.583333,+01441,14.0,"110,1,N,0046,1","SIRTE, LY"
3,62019099999,1950-01-08T12:00:00,1950,4,31.2,16.583333,+01891,14.0,"340,1,N,0046,1","SIRTE, LY"
4,62019099999,1950-01-09T12:00:00,1950,4,31.2,16.583333,+01611,14.0,"230,1,N,0010,1","SIRTE, LY"
5,62019099999,1950-01-11T12:00:00,1950,4,31.2,16.583333,+02671,14.0,"080,1,N,0026,1","SIRTE, LY"
6,62019099999,1950-01-12T12:00:00,1950,4,31.2,16.583333,+01721,14.0,"310,1,N,0010,1","SIRTE, LY"
7,62019099999,1950-01-13T12:00:00,1950,4,31.2,16.583333,+01721,14.0,"050,1,N,0031,1","SIRTE, LY"
8,62019099999,1950-01-17T12:00:00,1950,4,31.2,16.583333,+01611,14.0,"200,1,N,0046,1","SIRTE, LY"
9,62019099999,1950-01-18T12:00:00,1950,4,31.2,16.583333,+99999,14.0,"160,1,N,0026,1","SIRTE, LY"


In [101]:
# Summary statistics. Only STATION, SOURCE, LATITUDE, LONGITUDE and ELEVATION
# appear in the result; TMP and WND are not summarised here (in the preview
# above they hold values such as +01781 and "270,1,N,0026,1").
df.describe()

Unnamed: 0,STATION,SOURCE,LATITUDE,LONGITUDE,ELEVATION
count,800.0,800.0,800.0,800.0,800.0
mean,62019100000.0,4.0,31.2,16.58333,14.0
std,0.0,0.0,4.47922e-13,3.021696e-13,0.0
min,62019100000.0,4.0,31.2,16.58333,14.0
25%,62019100000.0,4.0,31.2,16.58333,14.0
50%,62019100000.0,4.0,31.2,16.58333,14.0
75%,62019100000.0,4.0,31.2,16.58333,14.0
max,62019100000.0,4.0,31.2,16.58333,14.0
