In [1]:
import glob
import io
import json
import os
import pdb

import boto3
import pandas as pd
from pandas.io.json import json_normalize

In [2]:
# from https://alexwlchan.net/2019/07/listing-s3-keys/
def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    Generate objects in an S3 bucket.

    The bucket is listed with the paginated ``list_objects_v2`` call, once
    per prefix, and every object record whose key ends with ``suffix`` is
    yielded lazily.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional).  Either a single string or a tuple/list of
        prefixes, which are listed one after another.
    :param suffix: Only fetch objects whose keys end with
        this suffix (optional).
    :return: Generator over the object dicts found under ``Contents`` in
        the listing pages.
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    # We can pass the prefix directly to the S3 API.  If the user has passed
    # a tuple or list of prefixes, we go through them one by one.
    if isinstance(prefix, str):
        prefixes = (prefix, )
    else:
        prefixes = prefix

    for key_prefix in prefixes:
        for page in paginator.paginate(Bucket=bucket, Prefix=key_prefix):
            # A page without "Contents" means nothing matched this prefix.
            # Treat it as empty and keep going: returning here would
            # silently drop every prefix that comes after it.
            for obj in page.get("Contents", ()):
                if obj["Key"].endswith(suffix):
                    yield obj


def get_matching_s3_keys(bucket, prefix="", suffix=""):
    """
    Generate the keys in an S3 bucket.

    Thin wrapper around ``get_matching_s3_objects`` that yields only the
    ``"Key"`` entry of each matching object.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    matching_objects = get_matching_s3_objects(bucket, prefix, suffix)
    yield from (s3_object["Key"] for s3_object in matching_objects)

In [3]:
# Bucket that the helper functions in this notebook read from.
BUCKET_NAME = 'snowbot-pv'

# Connect to S3 through a default boto3 session and bind the bucket resource.
session = boto3.Session()
s3 = session.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)

In [4]:
# Local output directory and the file names of the two merged JSON files.
DATA_DIR = "./data/"
MERGED_FILENAME = "merged_file.json"
TEST_FILENAME = "test_file.json"

# Full paths: DATA_DIR already ends with a slash, so the parts are simply
# concatenated.
merged_file = f"{DATA_DIR}{MERGED_FILENAME}"
merged_test_file = f"{DATA_DIR}{TEST_FILENAME}"


In [5]:
def merge_json(save_file, suffix=""):
    """
    Merge every matching JSON object of the S3 bucket into one local file.

    Each key produced by ``get_matching_s3_keys(BUCKET_NAME, suffix=suffix)``
    is downloaded from the module-level ``bucket``, parsed as JSON and
    collected in a list.  The list is then written to ``save_file`` as a
    single JSON array (an empty array when no key matches).

    :param save_file: Path of the local file that receives the merged JSON.
    :param suffix: Only merge objects whose keys end with this suffix
        (optional).
    """
    result = []

    for key in get_matching_s3_keys(BUCKET_NAME, suffix=suffix):
        # Download into memory rather than a fixed "temp" file in the
        # working directory: nothing is left to clean up, and an empty
        # listing no longer fails when removing a file that never existed.
        buffer = io.BytesIO()
        bucket.download_fileobj(key, buffer)
        buffer.seek(0)
        result.append(json.load(buffer))

    # Fill the output file with the merged content
    with open(save_file, "w") as outfile:
        json.dump(result, outfile)
            
# TBD: more efficient to go straight to df w/o saving json to file

In [6]:
def load_merged_json_as_df(merged_file):
    """
    Load the merged JSON file as a flat DataFrame of lift records.

    The file is read as JSON and normalized on its ``lifts`` records: every
    entry of a ``lifts`` list becomes one row, and the ``timestamp`` of the
    enclosing element is repeated on each of those rows.

    :param merged_file: Path of the merged JSON file to read.
    :return: DataFrame with one row per ``lifts`` record plus a
        ``timestamp`` column.
    """
    with open(merged_file, "r") as f:
        snapshots = json.load(f)

    # Prefer the top-level pandas.json_normalize (pandas >= 1.0); the
    # pandas.io.json import path is deprecated.  Fall back to the name
    # imported at the top of the notebook on older pandas versions.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        normalize = json_normalize

    # json_normalize already returns a DataFrame, so no extra wrapping is needed.
    return normalize(snapshots, record_path='lifts', meta='timestamp')

In [7]:
def set_lifts_df_datatypes(df):
    """
    Return a copy of the lift table with its column dtypes set.

    ``liftID``, ``resortID``, ``liftName`` and ``status`` become
    categoricals, ``timeToRide`` becomes an integer column and
    ``timestamp`` is parsed with ``pd.to_datetime``.  The input frame is
    not modified.

    :param df: Lift table containing the columns listed above.
    :return: New DataFrame with the converted columns.
    """
    column_dtypes = dict.fromkeys(
        ["liftID", "resortID", "liftName", "status"], 'category'
    )
    column_dtypes["timeToRide"] = "int"

    # astype returns a new frame, so the assignment below leaves `df` untouched.
    typed = df.astype(column_dtypes)
    typed["timestamp"] = pd.to_datetime(typed["timestamp"])
    return typed

In [8]:
def get_status_changes(df):
    '''Returns a dataframe that only includes the times when there was a change to a new status

    Rows are grouped by ``liftName`` (groups in sorted order, rows inside a
    group kept in their incoming order).  Within each lift a row is kept
    when it is the lift's first row or when its ``status`` differs from the
    previous row of the same lift.  Rows without a ``liftName`` are dropped.
    The result gets a fresh 0..n-1 index.

    NOTE(review): the incoming rows are presumably already in chronological
    order per lift; nothing here sorts by timestamp - confirm with callers.

    Implemented with a stable sort and shifted comparisons instead of
    ``groupby(...).apply``, so the result does not depend on whether apply
    hands the grouping column to the function.
    '''
    named = df[df['liftName'].notna()]
    # mergesort is stable: rows of one lift stay in their original order.
    ordered = named.sort_values('liftName', kind='mergesort')

    first_of_lift = ordered['liftName'].ne(ordered['liftName'].shift())
    status_changed = ordered['status'].ne(ordered['status'].shift())

    return ordered[first_of_lift | status_changed].reset_index(drop=True)

In [9]:
# Merge every "lifts.json" object from S3 into one local file, then load
# that file as a DataFrame with its column dtypes set.
merge_json(merged_file, suffix="lifts.json")
lifts_df = set_lifts_df_datatypes(load_merged_json_as_df(merged_file))

In [10]:
# Keep only the rows where a lift's status differs from its previous row,
# then display the resulting frame.
lifts_status_changes_df = get_status_changes(lifts_df)
lifts_status_changes_df

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,3,13,7th Heaven Express,O,6,2020-01-04 14:19:09.425451-08:00
1,3,13,7th Heaven Express,X,6,2020-01-04 14:49:08.722630-08:00
2,3,13,7th Heaven Express,O,6,2020-01-05 12:49:10.222813-08:00
3,3,13,7th Heaven Express,X,6,2020-01-05 14:49:08.549014-08:00
4,3,13,7th Heaven Express,O,6,2020-01-06 09:45:27.018996-08:00
...,...,...,...,...,...,...
420,72,13,Whistler Village Gondola Upper,O,11,2020-01-08 08:30:26.621928-08:00
421,72,13,Whistler Village Gondola Upper,X,11,2020-01-08 15:15:27.215231-08:00
422,72,13,Whistler Village Gondola Upper,H,11,2020-01-09 07:45:26.717896-08:00
423,72,13,Whistler Village Gondola Upper,O,11,2020-01-09 08:30:26.730631-08:00


In [51]:
# Sort so that each lift's rows are contiguous and in chronological order.
df = lifts_status_changes_df.sort_values(by=['liftID','timestamp'])
# time_diff = time until the same lift's next status change.
# diff(1) is computed per lift, so each lift's first row is NaT.  The
# shift(-1) is applied to the whole column (not per lift); that is only
# correct because of the sort above: the last row of each lift picks up the
# next lift's leading NaT instead of a value from another lift.
df['time_diff'] = df.groupby('liftID')['timestamp'].diff(1).shift(-1)
display(df)


Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp,time_diff
0,3,13,7th Heaven Express,O,6,2020-01-04 14:19:09.425451-08:00,00:29:59.297179
1,3,13,7th Heaven Express,X,6,2020-01-04 14:49:08.722630-08:00,22:00:01.500183
2,3,13,7th Heaven Express,O,6,2020-01-05 12:49:10.222813-08:00,01:59:58.326201
3,3,13,7th Heaven Express,X,6,2020-01-05 14:49:08.549014-08:00,18:56:18.469982
4,3,13,7th Heaven Express,O,6,2020-01-06 09:45:27.018996-08:00,05:00:00.822932
...,...,...,...,...,...,...,...
420,72,13,Whistler Village Gondola Upper,O,11,2020-01-08 08:30:26.621928-08:00,06:45:00.593303
421,72,13,Whistler Village Gondola Upper,X,11,2020-01-08 15:15:27.215231-08:00,16:29:59.502665
422,72,13,Whistler Village Gondola Upper,H,11,2020-01-09 07:45:26.717896-08:00,00:45:00.012735
423,72,13,Whistler Village Gondola Upper,O,11,2020-01-09 08:30:26.730631-08:00,06:45:00.177602


In [52]:
df.dtypes

liftID                                      category
resortID                                    category
liftName                                    category
status                                      category
timeToRide                                     int64
timestamp     datetime64[ns, pytz.FixedOffset(-480)]
time_diff                            timedelta64[ns]
dtype: object

In [53]:
# Duration of each status in seconds.  total_seconds() keeps missing
# durations (NaT) as NaN; casting the timedelta column to int would turn
# them into garbage values or raise.
df['time_diff_seconds'] = df['time_diff'].dt.total_seconds()

In [54]:
df

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp,time_diff,time_diff_seconds
0,3,13,7th Heaven Express,O,6,2020-01-04 14:19:09.425451-08:00,00:29:59.297179,1.799297e+03
1,3,13,7th Heaven Express,X,6,2020-01-04 14:49:08.722630-08:00,22:00:01.500183,7.920150e+04
2,3,13,7th Heaven Express,O,6,2020-01-05 12:49:10.222813-08:00,01:59:58.326201,7.198326e+03
3,3,13,7th Heaven Express,X,6,2020-01-05 14:49:08.549014-08:00,18:56:18.469982,6.817847e+04
4,3,13,7th Heaven Express,O,6,2020-01-06 09:45:27.018996-08:00,05:00:00.822932,1.800082e+04
...,...,...,...,...,...,...,...,...
420,72,13,Whistler Village Gondola Upper,O,11,2020-01-08 08:30:26.621928-08:00,06:45:00.593303,2.430059e+04
421,72,13,Whistler Village Gondola Upper,X,11,2020-01-08 15:15:27.215231-08:00,16:29:59.502665,5.939950e+04
422,72,13,Whistler Village Gondola Upper,H,11,2020-01-09 07:45:26.717896-08:00,00:45:00.012735,2.700013e+03
423,72,13,Whistler Village Gondola Upper,O,11,2020-01-09 08:30:26.730631-08:00,06:45:00.177602,2.430018e+04


In [55]:
# Export the frame to CSV.
# date_format='%c' writes timestamps in the locale's date/time format;
# otherwise Tableau will mix up month and day.
# Alternatively, can export to json:
# lifts_status_changes_df.to_json(DATA_DIR + "lifts_status_changes.json", orient='table')
df.to_csv(DATA_DIR + "lifts_status_changes.csv", date_format='%c')


In [9]:
# add:
# 
# daily: for each chair calculate most open status of the day: O > H > X
# Days since each chair was last seen open with timestamp of most recent open time.
# snowfall since last open
# save data for other mountains

# Storage options testing

In [11]:
# Storage option 1: pickle the frame as-is into DATA_DIR.
df.to_pickle(DATA_DIR + "df_test.pkl")

In [32]:
# Storage option: parquet via the fastparquet engine.
from fastparquet import write

# Parquet engines don't handle shifted (fixed-offset) timezones, so the
# timestamps are converted to UTC before writing.  TZ is kept so that the
# original timezone can be restored after reading the file back.
# NOTE: this overwrites df['timestamp'] in place, so cells that run after
# this one see UTC timestamps.
import pytz
TZ = pytz.timezone('America/Vancouver')
df['timestamp'] = df.timestamp.dt.tz_convert(pytz.utc)

# Note: May need python-snappy as a requirement to run on AWS Lambda
df.to_parquet(DATA_DIR + "df_test.parquet", engine='fastparquet')


In [35]:
# Read the parquet file back and convert the timestamps back to the
# correct timezone (TZ).
load_df = pd.read_parquet(DATA_DIR + "df_test.parquet").assign(
    timestamp=lambda frame: frame.timestamp.dt.tz_convert(TZ)
)


In [37]:
#TBD convert back to correct datatypes
load_df.dtypes

liftID                                    int64
resortID                                  int64
liftName                                 object
status                                   object
timeToRide                                int64
timestamp     datetime64[ns, America/Vancouver]
dtype: object

In [16]:
# Storage option: plain CSV with pandas' default date formatting.
df.to_csv(DATA_DIR + "df_test.csv")

## Delta Lake Testing

Requires an Apache Spark instance. For future use, one could possibly be set up to work with AWS Lambda using Amazon EMR (https://aws.amazon.com/emr/features/spark/); this still needs to be confirmed.

Otherwise, use Databricks (similar to the QxMD project).

## json comparison

In [51]:
a = json.loads("""
{
    "timestamp": "2020-01-03 00:19:09.631011-08:00", "lifts": [{"liftID": 69, "resortID": 13, "liftName": "Blackcomb Gondola Lower", "status": "X", "timeToRide": "7"}, {"liftID": 70, "resortID": 13, "liftName": "Blackcomb Gondola Upper", "status": "X", "timeToRide": "7"}, {"liftID": 5, "resortID": 13, "liftName": "Excalibur Gondola Lower", "status": "X", "timeToRide": "3"}, {"liftID": 71, "resortID": 13, "liftName": "Excalibur Gondola Upper", "status": "X", "timeToRide": "5"}, {"liftID": 8, "resortID": 13, "liftName": "Excelerator Express", "status": "X", "timeToRide": "6"}, {"liftID": 6, "resortID": 13, "liftName": "Magic Chair", "status": "X", "timeToRide": "6"}, {"liftID": 4, "resortID": 13, "liftName": "Jersey Cream Express", "status": "X", "timeToRide": "5"}, {"liftID": 9, "resortID": 13, "liftName": "Catskinner Express", "status": "X", "timeToRide": "4"}, {"liftID": 22, "resortID": 13, "liftName": "Peak 2 Peak Gondola", "status": "X", "timeToRide": "12"}, {"liftID": 10, "resortID": 13, "liftName": "Crystal Ridge Express", "status": "X", "timeToRide": "7"}, {"liftID": 7, "resortID": 13, "liftName": "Glacier Express", "status": "X", "timeToRide": "6"}, {"liftID": 3, "resortID": 13, "liftName": "7th Heaven Express", "status": "X", "timeToRide": "6"}, {"liftID": 12, "resortID": 13, "liftName": "Showcase T-Bar", "status": "X", "timeToRide": "3"}, {"liftID": 11, "resortID": 13, "liftName": "Horstman T-Bar", "status": "X", "timeToRide": "4"}, {"liftID": 14, "resortID": 13, "liftName": "Coca-Cola Tube Park", "status": "X", "timeToRide": "4"}, {"liftID": 33, "resortID": 13, "liftName": "Whistler Village Gondola Lower", "status": "X", "timeToRide": "5"}, {"liftID": 72, "resortID": 13, "liftName": "Whistler Village Gondola Upper", "status": "X", "timeToRide": "11"}, {"liftID": 45, "resortID": 13, "liftName": "Fitzsimmons Express", "status": "X", "timeToRide": "6"}, {"liftID": 40, "resortID": 13, "liftName": "Garbanzo Express", "status": "X", "timeToRide": "7"}, 
{"liftID": 34, "resortID": 13, "liftName": "Creekside Gondola", "status": "X", "timeToRide": "7"}, {"liftID": 36, "resortID": 13, "liftName": "Big Red Express", "status": "X", "timeToRide": "8"}, {"liftID": 35, "resortID": 13, "liftName": "Emerald 6 Express", "status": "X", "timeToRide": "6"}, {"liftID": 22, "resortID": 13, "liftName": "Peak 2 Peak Gondola", "status": "X", "timeToRide": "12"}, {"liftID": 39, "resortID": 13, "liftName": "Olympic Chair", "status": "X", "timeToRide": "5"}, {"liftID": 44, "resortID": 13, "liftName": "Franz's Chair", "status": "X", "timeToRide": "8"}, {"liftID": 43, "resortID": 13, "liftName": "Peak Express", "status": "X", "timeToRide": "3"}, {"liftID": 37, "resortID": 13, "liftName": "Harmony 6 Express", "status": "X", "timeToRide": "6"}, {"liftID": 42, "resortID": 13, "liftName": "Symphony Express", "status": "X", "timeToRide": "7"}, {"liftID": 41, "resortID": 13, "liftName": "T-Bars", "status": "X", "timeToRide": "5"}]
}
""")

# Same snapshot as `a`, with Blackcomb Gondola Lower (the first lift)
# changed to status "O".  Built from `a` through a JSON round-trip (an
# independent deep copy) instead of repeating the whole literal, so the
# single difference between the two fixtures is explicit.
b = json.loads(json.dumps(a))
b["lifts"][0]["status"] = "O"

In [52]:
a == b

False

In [93]:
# Same pipeline as for the real data, run on the "test.json" objects:
# merge from S3, load, set dtypes, then reduce to status changes.
merge_json(merged_test_file, suffix="test.json")
df_test = set_lifts_df_datatypes(load_merged_json_as_df(merged_test_file))
lifts_status_changes_df_test = get_status_changes(df_test)

In [94]:
# Stack the test status changes under the existing ones.  DataFrame.append
# is deprecated (and removed in pandas 2.0); pd.concat is its replacement
# and likewise keeps each frame's original index.
pd.concat([lifts_status_changes_df, lifts_status_changes_df_test])

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,3,13,7th Heaven Express,O,6,2020-01-04 14:19:09.425451-08:00
1,3,13,7th Heaven Express,X,6,2020-01-04 14:49:08.722630-08:00
2,3,13,7th Heaven Express,O,6,2020-01-05 12:49:10.222813-08:00
3,3,13,7th Heaven Express,X,6,2020-01-05 14:49:08.549014-08:00
4,3,13,7th Heaven Express,O,6,2020-01-06 09:45:27.018996-08:00
...,...,...,...,...,...,...
272,72,13,Whistler Village Gondola Upper,O,11,2020-01-06 08:30:29.146383-08:00
273,72,13,Whistler Village Gondola Upper,X,11,2020-01-06 15:15:26.008153-08:00
274,72,13,Whistler Village Gondola Upper,H,11,2020-01-07 07:45:26.653351-08:00
275,72,13,Whistler Village Gondola Upper,O,11,2020-01-07 08:45:26.226551-08:00


In [None]:
# Get current lift status info json
# retrieve prior lift status info json
# if current == prior end
    # else:
        # merge_json(current and prior)
        # get status changes
        # append status changes to S3 lift history parquet table (create if it doesn't exist)
            # append via https://github.com/pandas-dev/pandas/issues/20638#issuecomment-386754025 ?
        # save current lift status info json as prior
        

In [107]:
import os

# `scrape` is imported with ./snowbot_AWS_lambda/ as the working directory.
# Remember where we started and restore it in `finally`, so a failing
# import does not leave the notebook stranded in the sub-directory (and so
# we do not rely on ".." being the original directory).
_notebook_dir = os.getcwd()
os.chdir("./snowbot_AWS_lambda/")
try:
    from scrape import get_data
finally:
    os.chdir(_notebook_dir)

In [108]:
!pwd

/Users/paul/dev/snowbot


In [111]:
get_data()['lifts']

'{"timestamp": "2020-01-07 15:57:45.271748-08:00", "lifts": [{"liftID": 5, "resortID": 13, "liftName": "Excalibur Gondola Lower", "status": "O", "timeToRide": "3"}, {"liftID": 6, "resortID": 13, "liftName": "Magic Chair", "status": "O", "timeToRide": "6"}, {"liftID": 14, "resortID": 13, "liftName": "Coca-Cola Tube Park", "status": "O", "timeToRide": "4"}, {"liftID": 69, "resortID": 13, "liftName": "Blackcomb Gondola Lower", "status": "X", "timeToRide": "7"}, {"liftID": 70, "resortID": 13, "liftName": "Blackcomb Gondola Upper", "status": "X", "timeToRide": "7"}, {"liftID": 71, "resortID": 13, "liftName": "Excalibur Gondola Upper", "status": "X", "timeToRide": "5"}, {"liftID": 8, "resortID": 13, "liftName": "Excelerator Express", "status": "X", "timeToRide": "6"}, {"liftID": 4, "resortID": 13, "liftName": "Jersey Cream Express", "status": "X", "timeToRide": "5"}, {"liftID": 9, "resortID": 13, "liftName": "Catskinner Express", "status": "X", "timeToRide": "4"}, {"liftID": 22, "resortID": 

In [115]:
import botocore

# Check whether 'lifts_prior.json' exists in the bucket: load() raises a
# ClientError when the object cannot be loaded, and a "404" error code is
# treated as "object missing".
try:
    s3.Object(BUCKET_NAME, 'lifts_prior.json').load()
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
        # The object does not exist.
        print("doesn't exist")
        # TBD create the file
    else:
        # Something else has gone wrong.
        raise
else:
    # The object does exist.
    # TBD compare jsons
    pass  # placeholder: a branch holding only comments is a syntax error

doesn't exists


In [None]:
# Write the file from S3 into a local temp file
with open('temp', 'wb') as tfw:
    bucket.download_fileobj(f, tfw)