In [1]:
import glob
import io
import json
import os
import pdb

import boto3
import pandas as pd
# NOTE: importing json_normalize from pandas.io.json is deprecated since
# pandas 1.0; pandas.json_normalize is the documented replacement.
from pandas.io.json import json_normalize

In [2]:
# from https://alexwlchan.net/2019/07/listing-s3-keys/
def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    Generate objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional).  May be a single string or a tuple/list
        of prefixes, which are listed one after another.
    :param suffix: Only fetch objects whose keys end with
        this suffix (optional).  It is passed to ``str.endswith``, so a
        tuple of suffixes is accepted as well.
    :return: Generator yielding the object entries (dicts containing at
        least a ``"Key"`` item) found in the ``list_objects_v2`` pages.
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    # We can pass the prefix directly to the S3 API.  If the user has passed
    # a tuple or list of prefixes, we go through them one by one.
    if isinstance(prefix, str):
        prefixes = (prefix, )
    else:
        prefixes = prefix

    for key_prefix in prefixes:
        for page in paginator.paginate(Bucket=bucket, Prefix=key_prefix):
            # A page without a "Contents" entry simply has nothing to offer.
            # Treat it as empty and carry on: returning here would end the
            # generator and silently skip every prefix still to be listed.
            for obj in page.get("Contents", ()):
                if obj["Key"].endswith(suffix):
                    yield obj


def get_matching_s3_keys(bucket, prefix="", suffix=""):
    """
    Generate the keys of the matching objects in an S3 bucket.

    Thin wrapper around ``get_matching_s3_objects`` that yields only the
    ``"Key"`` value of every object it produces.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only yield keys that start with this prefix (optional).
    :param suffix: Only yield keys that end with this suffix (optional).
    """
    matching_objects = get_matching_s3_objects(bucket, prefix, suffix)
    yield from (entry["Key"] for entry in matching_objects)

In [3]:
# Name of the S3 bucket the lifts.json files are read from.
BUCKET_NAME = 'snowbot-pv'

# Open a default boto3 session and take a resource-level handle on the bucket.
session = boto3.Session()
s3 = session.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)

In [20]:
DATA_DIR = "./data/"
MERGED_FILENAME = "merged_file.json"
merged_file = DATA_DIR + MERGED_FILENAME

# Make sure the output directory exists before anything is written into it.
os.makedirs(DATA_DIR, exist_ok=True)

result = []

for key in get_matching_s3_keys(BUCKET_NAME, suffix="lifts.json"):
    # Download the object into an in-memory buffer instead of a scratch file
    # in the working directory: nothing is left behind if a download or parse
    # fails, and there is no cleanup step that breaks when no key matched.
    buffer = io.BytesIO()
    bucket.download_fileobj(key, buffer)

    # Rewind so json.load reads the payload from its first byte.
    buffer.seek(0)
    result.append(json.load(buffer))

# Fill the output file with the merged content
with open(merged_file, "w") as outfile:
    json.dump(result, outfile)

In [21]:
# Load the merged JSON back from disk.
with open(merged_file, "r") as f:
    d = json.load(f)

# Flatten the records into a DataFrame: every entry of a record's 'lifts'
# list becomes one row, tagged with that record's 'timestamp'.
# json_normalize already returns a DataFrame, so no extra wrapping is needed,
# and the file only has to stay open while it is being parsed.
df = json_normalize(d, record_path='lifts', meta='timestamp')

In [22]:
# Set datatypes: the id/name/status columns become categoricals, the ride
# time an integer, and the timestamp column is parsed into datetimes.
df = (
    df
    .astype(dict(
        liftID='category',
        resortID='category',
        liftName='category',
        status='category',
        timeToRide="int",
    ))
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
)

In [7]:
def get_status_changes(df):
    '''Return only the rows at which a lift switches to a new status.

    The rows are handled per ``liftName``; within each lift a row is kept
    when its ``status`` differs from the status of the row before it, which
    means the first row of every lift is always kept.  The kept rows come
    back grouped by lift with a fresh 0..n-1 index.
    '''
    def _keep_transitions(lift_rows):
        # Compare each status with the one in the preceding row of this lift.
        previous_status = lift_rows.status.shift()
        return lift_rows[lift_rows.status.ne(previous_status)]

    transitions = (
        df.groupby('liftName', group_keys=False)
          .apply(_keep_transitions)
    )
    return transitions.reset_index(drop=True)

In [8]:
# Display the status transitions of every lift in the loaded data.
get_status_changes(df)

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,3,13,7th Heaven Express,O,6,2020-01-04 14:19:09.425451-08:00
1,3,13,7th Heaven Express,X,6,2020-01-04 14:49:08.722630-08:00
2,3,13,7th Heaven Express,O,6,2020-01-05 12:49:10.222813-08:00
3,3,13,7th Heaven Express,X,6,2020-01-05 14:49:08.549014-08:00
4,3,13,7th Heaven Express,O,6,2020-01-06 09:45:27.018996-08:00
...,...,...,...,...,...,...
230,72,13,Whistler Village Gondola Upper,O,11,2020-01-05 08:19:08.762177-08:00
231,72,13,Whistler Village Gondola Upper,X,11,2020-01-05 15:19:08.683322-08:00
232,72,13,Whistler Village Gondola Upper,H,11,2020-01-06 08:00:28.936358-08:00
233,72,13,Whistler Village Gondola Upper,O,11,2020-01-06 08:30:29.146383-08:00


In [9]:
# TODO - features to add:
#
# - Daily: for each chair, calculate the most-open status of the day (O > H > X).
# - Days since each chair was last seen open, with the timestamp of the most recent open time.
# - Snowfall since last open.
# - Save data for other mountains.

In [10]:
# Display the full typed frame of lift readings (pandas truncates the middle rows).
df

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,69,13,Blackcomb Gondola Lower,X,7,2020-01-03 00:19:09.631011-08:00
1,70,13,Blackcomb Gondola Upper,X,7,2020-01-03 00:19:09.631011-08:00
2,5,13,Excalibur Gondola Lower,X,3,2020-01-03 00:19:09.631011-08:00
3,71,13,Excalibur Gondola Upper,X,5,2020-01-03 00:19:09.631011-08:00
4,8,13,Excelerator Express,X,6,2020-01-03 00:19:09.631011-08:00
...,...,...,...,...,...,...
7100,44,13,Franz's Chair,X,8,2020-01-06 20:30:26.468003-08:00
7101,43,13,Peak Express,X,3,2020-01-06 20:30:26.468003-08:00
7102,37,13,Harmony 6 Express,X,6,2020-01-06 20:30:26.468003-08:00
7103,42,13,Symphony Express,X,7,2020-01-06 20:30:26.468003-08:00


In [11]:
# Persist the typed frame as a pickle file inside DATA_DIR.
df.to_pickle(DATA_DIR + "df_test.pkl")

In [32]:
import pytz
from fastparquet import write

# The parquet engine doesn't handle shifted timezones, so the timestamps are
# stored as UTC; TZ keeps the local zone for converting back after a reload.
TZ = pytz.timezone('America/Vancouver')
df['timestamp'] = df['timestamp'].dt.tz_convert(pytz.utc)

df.to_parquet(DATA_DIR + "df_test.parquet", engine='fastparquet')

In [35]:
# Read the parquet file back and convert the timestamps back to the correct
# (local) timezone.
load_df = pd.read_parquet(DATA_DIR + "df_test.parquet").assign(
    timestamp=lambda frame: frame["timestamp"].dt.tz_convert(TZ)
)


In [37]:
# TODO: convert back to the correct datatypes -- the dtypes shown below list
# the reloaded id/name/status columns as int64/object rather than category.
load_df.dtypes

liftID                                    int64
resortID                                  int64
liftName                                 object
status                                   object
timeToRide                                int64
timestamp     datetime64[ns, America/Vancouver]
dtype: object

In [16]:
# Also export the frame as CSV inside DATA_DIR (by default pandas writes the
# index as an unnamed first column).
df.to_csv(DATA_DIR + "df_test.csv")