In [94]:
import json, glob, boto3, os
import pdb
import pandas as pd
from pandas.io.json import json_normalize

In [95]:
# from https://alexwlchan.net/2019/07/listing-s3-keys/
def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    Generate objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional).
    :param suffix: Only fetch objects whose keys end with
        this suffix (optional).
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    kwargs = {'Bucket': bucket}

    # We can pass the prefix directly to the S3 API.  If the user has passed
    # a tuple or list of prefixes, we go through them one by one.
    if isinstance(prefix, str):
        prefixes = (prefix, )
    else:
        prefixes = prefix

    for key_prefix in prefixes:
        kwargs["Prefix"] = key_prefix

        for page in paginator.paginate(**kwargs):
            try:
                contents = page["Contents"]
            except KeyError:
                return

            for obj in contents:
                key = obj["Key"]
                if key.endswith(suffix):
                    yield obj


def get_matching_s3_keys(bucket, prefix="", suffix=""):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    for obj in get_matching_s3_objects(bucket, prefix, suffix):
        yield obj["Key"]

In [96]:
session = boto3.Session()
BUCKET_NAME = 'snowbot-pv'

# S3 Connect
s3 = session.resource('s3')

bucket = s3.Bucket(BUCKET_NAME)

In [97]:
DATA_DIR = "./data/"
MERGED_FILENAME = "merged_file.json"
merged_file = DATA_DIR + MERGED_FILENAME

TEST_FILENAME = "test_file.json"
merged_test_file = DATA_DIR + TEST_FILENAME


In [98]:
def merge_matching_jsons(save_file, suffix=""):
    
    result = []

    for f in get_matching_s3_keys(BUCKET_NAME, suffix=suffix):

        # Write the file from S3 into a local temp file
        with open('temp', 'wb') as tfw:
            bucket.download_fileobj(f, tfw)

        # Append the local temp file into the result list
        with open('temp', 'rb') as tfr:          
            result.append(json.load(tfr))

    os.remove("temp")

    # Fill the output file with the merged content
    with open(save_file, "w") as outfile:
         json.dump(result, outfile)
            
# TBD: more efficient to go straight to df w/o saving json to file

In [99]:
def load_merged_json_as_df(merged_file):
    #load the merged json as a dataframe
    with open(merged_file, "r") as f:
        d = json.load(f)
        df = pd.DataFrame.from_dict(json_normalize(d, record_path='lifts', meta='timestamp'))
        return df

In [100]:
def set_lifts_df_datatypes(df):

    # set datatypes for lift table
    df = df.astype({
        "liftID": 'category',
        "resortID": 'category',
        "liftName": 'category',
        "status": 'category',
        "timeToRide": "int"
    })
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    
    return df

In [295]:
def get_status_changes(df):
    '''Returns a dataframe that only includes the times when there was a change to a new status'''
    
    def calc_status_change(df):
        firstrow = df.loc[df['timestamp'].idxmin()]
        change_rows = df[df.status.ne(df.status.shift())]
        keep_df = firstrow.to_frame().T.append(change_rows)
        
        # Remove so that we don't need to write another column to S3 as we scrape?
        # Just calculate it when plotting and predicting?
        # keep_df['time_diff'] = keep_df['timestamp'].diff(1).shift(-1)
        
        return keep_df
        
    
    df = df.groupby('liftName', group_keys=False)\
           .apply(calc_status_change)\
           .reset_index(drop=True)
    return df

# TBD: may need to convert timestam to days (e.g. for Tableau)

In [214]:
merge_matching_jsons(suffix="lifts.json", save_file=merged_file)
lifts_df = load_merged_json_as_df(merged_file)
lifts_df = set_lifts_df_datatypes(lifts_df)

In [363]:
lifts_status_changes_df = get_status_changes(lifts_df)
lifts_status_changes_df

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,3,13,7th Heaven Express,X,6,2020-01-03 00:19:09.631011-08:00
1,3,13,7th Heaven Express,O,6,2020-01-04 14:19:09.425451-08:00
2,3,13,7th Heaven Express,X,6,2020-01-04 14:49:08.722630-08:00
3,3,13,7th Heaven Express,O,6,2020-01-05 12:49:10.222813-08:00
4,3,13,7th Heaven Express,X,6,2020-01-05 14:49:08.549014-08:00
...,...,...,...,...,...,...
512,72,13,Whistler Village Gondola Upper,O,11,2020-01-09 08:30:26.730631-08:00
513,72,13,Whistler Village Gondola Upper,X,11,2020-01-09 15:15:26.908233-08:00
514,72,13,Whistler Village Gondola Upper,H,11,2020-01-10 07:45:26.349643-08:00
515,72,13,Whistler Village Gondola Upper,O,11,2020-01-10 08:30:26.179537-08:00


**NOTE:** `timeToRide` is just the time is takes to ride the lift, not the current wait time:

In [298]:
lifts_df.groupby("liftName")['timeToRide'].unique()

liftName
7th Heaven Express                 [6]
Big Red Express                    [8]
Blackcomb Gondola Lower            [7]
Blackcomb Gondola Upper            [7]
Catskinner Express                 [4]
Coca-Cola Tube Park                [4]
Creekside Gondola                  [7]
Crystal Ridge Express              [7]
Emerald 6 Express                  [6]
Excalibur Gondola Lower            [3]
Excalibur Gondola Upper            [5]
Excelerator Express                [6]
Fitzsimmons Express                [6]
Franz's Chair                      [8]
Garbanzo Express                   [7]
Glacier Express                    [6]
Harmony 6 Express                  [6]
Horstman T-Bar                     [4]
Jersey Cream Express               [5]
Magic Chair                        [6]
Olympic Chair                      [5]
Peak 2 Peak Gondola               [12]
Peak Express                       [3]
Showcase T-Bar                     [3]
Symphony Express                   [7]
T-Bars          

In [464]:
df = lifts_status_changes_df.sort_values(by=['liftID','timestamp'])
df['time_diff'] = df.groupby('liftID')['timestamp'].diff(1).shift(-1)

In [430]:
def get_missing_time_diffs_df(df):
    return df.loc[(df['time_diff'].isnull()) & (
        df['timestamp'] > df['timestamp'].min()), 'timestamp']

In [471]:
missing_time_diffs_idx = df.loc[(df['time_diff'].isnull()) & (
        df['timestamp'] > df['timestamp'].min()), 'timestamp'].index.values

In [473]:
missing_time_diffs_idx

array([ 15, 388, 224, 412, 361, 274, 112, 174, 129, 454, 488, 154, 199,
        42, 436, 352, 462, 325, 298,  67,  89, 249, 516])

In [475]:
df.loc[missing_time_diffs_idx, 'time_diff'] = df['timestamp'].max() - df.loc[missing_time_diffs_idx, 'timestamp']

In [461]:
df['time_diff'][missing_time_diffs_idx] = df['timestamp'].max() - df['timestamp'][missing_time_diffs_idx]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [479]:
#get_missing_time_diffs_df(df) = (df['timestamp'].max() - get_missing_time_diffs_df(df))

df['time_diff_seconds'] = df['time_diff'].astype('int')/1000000000

In [476]:
df

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp,time_diff
0,3,13,7th Heaven Express,X,6,2020-01-03 00:19:09.631011-08:00,1 days 13:59:59.794440
1,3,13,7th Heaven Express,O,6,2020-01-04 14:19:09.425451-08:00,0 days 00:29:59.297179
2,3,13,7th Heaven Express,X,6,2020-01-04 14:49:08.722630-08:00,0 days 22:00:01.500183
3,3,13,7th Heaven Express,O,6,2020-01-05 12:49:10.222813-08:00,0 days 01:59:58.326201
4,3,13,7th Heaven Express,X,6,2020-01-05 14:49:08.549014-08:00,0 days 18:56:18.469982
...,...,...,...,...,...,...,...
512,72,13,Whistler Village Gondola Upper,O,11,2020-01-09 08:30:26.730631-08:00,0 days 06:45:00.177602
513,72,13,Whistler Village Gondola Upper,X,11,2020-01-09 15:15:26.908233-08:00,0 days 16:29:59.441410
514,72,13,Whistler Village Gondola Upper,H,11,2020-01-10 07:45:26.349643-08:00,0 days 00:44:59.829894
515,72,13,Whistler Village Gondola Upper,O,11,2020-01-10 08:30:26.179537-08:00,0 days 06:45:00.093875


In [None]:
df.diff()
df.shift()

In [382]:
df.loc[(df['time_diff'].isnull()) & (df['timestamp'] != df['timestamp'].min())]

liftID        23
resortID      23
liftName      23
status        23
timeToRide    23
timestamp     23
time_diff      0
dtype: int64

In [388]:
df['timestamp'].max() - df.loc[df['time_diff'].isnull(), 'timestamp']

15    0 days 03:59:59.313599
388   0 days 03:29:59.797863
224          0 days 00:00:00
412   0 days 01:29:59.706931
361   0 days 03:59:59.313599
274   0 days 03:29:59.797863
112   0 days 03:29:59.797863
174   0 days 03:29:59.797863
363   7 days 18:26:16.440264
456   7 days 18:26:16.440264
129   0 days 00:29:59.867162
454   0 days 03:29:59.797863
488   0 days 03:29:59.797863
154   0 days 03:29:59.797863
199   0 days 03:29:59.797863
42    0 days 03:29:59.797863
362   7 days 18:26:16.440264
436   0 days 03:29:59.797863
352   0 days 03:29:59.797863
462   1 days 03:59:59.244357
457   7 days 18:26:16.440264
455   7 days 18:26:16.440264
325   0 days 03:29:59.797863
298   0 days 05:29:59.539052
67    0 days 03:29:59.797863
89    0 days 03:29:59.797863
249   0 days 03:29:59.797863
516   0 days 03:29:59.797863
Name: timestamp, dtype: timedelta64[ns]

In [366]:
df.loc[df['time_diff'].isnull(), 'time_diff'] = df.loc[df['time_diff'].isnull(), 'timestamp']

15    NaT
388   NaT
224   NaT
412   NaT
361   NaT
274   NaT
112   NaT
174   NaT
363   NaT
456   NaT
129   NaT
454   NaT
488   NaT
154   NaT
199   NaT
42    NaT
362   NaT
436   NaT
352   NaT
462   NaT
457   NaT
455   NaT
325   NaT
298   NaT
67    NaT
89    NaT
249   NaT
516   NaT
Name: time_diff, dtype: timedelta64[ns]

In [400]:
display(df)

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp,time_diff,time_diff_seconds
0,3,13,7th Heaven Express,X,6,2020-01-03 00:19:09.631011-08:00,1 days 13:59:59.794440,1.367998e+05
1,3,13,7th Heaven Express,O,6,2020-01-04 14:19:09.425451-08:00,0 days 00:29:59.297179,1.799297e+03
2,3,13,7th Heaven Express,X,6,2020-01-04 14:49:08.722630-08:00,0 days 22:00:01.500183,7.920150e+04
3,3,13,7th Heaven Express,O,6,2020-01-05 12:49:10.222813-08:00,0 days 01:59:58.326201,7.198326e+03
4,3,13,7th Heaven Express,X,6,2020-01-05 14:49:08.549014-08:00,0 days 18:56:18.469982,6.817847e+04
...,...,...,...,...,...,...,...,...
512,72,13,Whistler Village Gondola Upper,O,11,2020-01-09 08:30:26.730631-08:00,0 days 06:45:00.177602,2.430018e+04
513,72,13,Whistler Village Gondola Upper,X,11,2020-01-09 15:15:26.908233-08:00,0 days 16:29:59.441410,5.939944e+04
514,72,13,Whistler Village Gondola Upper,H,11,2020-01-10 07:45:26.349643-08:00,0 days 00:44:59.829894,2.699830e+03
515,72,13,Whistler Village Gondola Upper,O,11,2020-01-10 08:30:26.179537-08:00,0 days 06:45:00.093875,2.430009e+04


In [300]:
df.dtypes

liftID                                        object
resortID                                      object
liftName                                      object
status                                        object
timeToRide                                    object
timestamp     datetime64[ns, pytz.FixedOffset(-480)]
time_diff                            timedelta64[ns]
dtype: object

In [480]:
# Uses locale date formatting, otherwise Tableau will mix up month and day
# alternatively, can export to json:
# lifts_status_changes_df.to_json(DATA_DIR + "lifts_status_changes.json", orient='table')
df.to_csv(DATA_DIR + "lifts_status_changes.csv", date_format='%c')


In [9]:
# add:
# 
# daily: for each chair calculate most open status of the day: O > H > X
# Days since each chair was last seen open with timestamp of most recent open time.
# snowfall since last open
# save data for other mountains

# Storage options testing

In [11]:
df.to_pickle(DATA_DIR + "df_test.pkl")

In [32]:
from fastparquet import write

# parquet engines don't handle shifted timezones
import pytz
TZ = pytz.timezone('America/Vancouver')
df['timestamp'] = df.timestamp.dt.tz_convert(pytz.utc)

# Note: May need snappy-python as a req to run on AWS Lambda
df.to_parquet(DATA_DIR + "df_test.parquet", engine='fastparquet')


In [35]:
load_df = pd.read_parquet(DATA_DIR + "df_test.parquet")
load_df['timestamp'] = load_df.timestamp.dt.tz_convert(TZ) # convert back to correct timezone


In [37]:
#TBD convert back to correct datatypes
load_df.dtypes

liftID                                    int64
resortID                                  int64
liftName                                 object
status                                   object
timeToRide                                int64
timestamp     datetime64[ns, America/Vancouver]
dtype: object

In [16]:
df.to_csv(DATA_DIR + "df_test.csv")

Test file size results:
- json: 800 Kb?
- csv: 474 Kb
- pickle: 145 Kb
- parquet: 15 Kb

## Delta Lake Testing

Requires apache spark instance.  For future use, could set one up to work with lambda using https://aws.amazon.com/emr/features/spark/?

Otherwise databricks (similar to QxMD project)

## json comparison

In [51]:
a = json.loads("""
{
    "timestamp": "2020-01-03 00:19:09.631011-08:00", "lifts": [{"liftID": 69, "resortID": 13, "liftName": "Blackcomb Gondola Lower", "status": "X", "timeToRide": "7"}, {"liftID": 70, "resortID": 13, "liftName": "Blackcomb Gondola Upper", "status": "X", "timeToRide": "7"}, {"liftID": 5, "resortID": 13, "liftName": "Excalibur Gondola Lower", "status": "X", "timeToRide": "3"}, {"liftID": 71, "resortID": 13, "liftName": "Excalibur Gondola Upper", "status": "X", "timeToRide": "5"}, {"liftID": 8, "resortID": 13, "liftName": "Excelerator Express", "status": "X", "timeToRide": "6"}, {"liftID": 6, "resortID": 13, "liftName": "Magic Chair", "status": "X", "timeToRide": "6"}, {"liftID": 4, "resortID": 13, "liftName": "Jersey Cream Express", "status": "X", "timeToRide": "5"}, {"liftID": 9, "resortID": 13, "liftName": "Catskinner Express", "status": "X", "timeToRide": "4"}, {"liftID": 22, "resortID": 13, "liftName": "Peak 2 Peak Gondola", "status": "X", "timeToRide": "12"}, {"liftID": 10, "resortID": 13, "liftName": "Crystal Ridge Express", "status": "X", "timeToRide": "7"}, {"liftID": 7, "resortID": 13, "liftName": "Glacier Express", "status": "X", "timeToRide": "6"}, {"liftID": 3, "resortID": 13, "liftName": "7th Heaven Express", "status": "X", "timeToRide": "6"}, {"liftID": 12, "resortID": 13, "liftName": "Showcase T-Bar", "status": "X", "timeToRide": "3"}, {"liftID": 11, "resortID": 13, "liftName": "Horstman T-Bar", "status": "X", "timeToRide": "4"}, {"liftID": 14, "resortID": 13, "liftName": "Coca-Cola Tube Park", "status": "X", "timeToRide": "4"}, {"liftID": 33, "resortID": 13, "liftName": "Whistler Village Gondola Lower", "status": "X", "timeToRide": "5"}, {"liftID": 72, "resortID": 13, "liftName": "Whistler Village Gondola Upper", "status": "X", "timeToRide": "11"}, {"liftID": 45, "resortID": 13, "liftName": "Fitzsimmons Express", "status": "X", "timeToRide": "6"}, {"liftID": 40, "resortID": 13, "liftName": "Garbanzo Express", "status": "X", "timeToRide": "7"}, {"liftID": 34, "resortID": 13, "liftName": "Creekside Gondola", "status": "X", "timeToRide": "7"}, {"liftID": 36, "resortID": 13, "liftName": "Big Red Express", "status": "X", "timeToRide": "8"}, {"liftID": 35, "resortID": 13, "liftName": "Emerald 6 Express", "status": "X", "timeToRide": "6"}, {"liftID": 22, "resortID": 13, "liftName": "Peak 2 Peak Gondola", "status": "X", "timeToRide": "12"}, {"liftID": 39, "resortID": 13, "liftName": "Olympic Chair", "status": "X", "timeToRide": "5"}, {"liftID": 44, "resortID": 13, "liftName": "Franz's Chair", "status": "X", "timeToRide": "8"}, {"liftID": 43, "resortID": 13, "liftName": "Peak Express", "status": "X", "timeToRide": "3"}, {"liftID": 37, "resortID": 13, "liftName": "Harmony 6 Express", "status": "X", "timeToRide": "6"}, {"liftID": 42, "resortID": 13, "liftName": "Symphony Express", "status": "X", "timeToRide": "7"}, {"liftID": 41, "resortID": 13, "liftName": "T-Bars", "status": "X", "timeToRide": "5"}]
}
""")

# Changed Blackcomb Gondola Lower status to "O"
b = json.loads("""
{
    "timestamp": "2020-01-03 00:19:09.631011-08:00", "lifts": [{"liftID": 69, "resortID": 13, "liftName": "Blackcomb Gondola Lower", "status": "O", "timeToRide": "7"}, {"liftID": 70, "resortID": 13, "liftName": "Blackcomb Gondola Upper", "status": "X", "timeToRide": "7"}, {"liftID": 5, "resortID": 13, "liftName": "Excalibur Gondola Lower", "status": "X", "timeToRide": "3"}, {"liftID": 71, "resortID": 13, "liftName": "Excalibur Gondola Upper", "status": "X", "timeToRide": "5"}, {"liftID": 8, "resortID": 13, "liftName": "Excelerator Express", "status": "X", "timeToRide": "6"}, {"liftID": 6, "resortID": 13, "liftName": "Magic Chair", "status": "X", "timeToRide": "6"}, {"liftID": 4, "resortID": 13, "liftName": "Jersey Cream Express", "status": "X", "timeToRide": "5"}, {"liftID": 9, "resortID": 13, "liftName": "Catskinner Express", "status": "X", "timeToRide": "4"}, {"liftID": 22, "resortID": 13, "liftName": "Peak 2 Peak Gondola", "status": "X", "timeToRide": "12"}, {"liftID": 10, "resortID": 13, "liftName": "Crystal Ridge Express", "status": "X", "timeToRide": "7"}, {"liftID": 7, "resortID": 13, "liftName": "Glacier Express", "status": "X", "timeToRide": "6"}, {"liftID": 3, "resortID": 13, "liftName": "7th Heaven Express", "status": "X", "timeToRide": "6"}, {"liftID": 12, "resortID": 13, "liftName": "Showcase T-Bar", "status": "X", "timeToRide": "3"}, {"liftID": 11, "resortID": 13, "liftName": "Horstman T-Bar", "status": "X", "timeToRide": "4"}, {"liftID": 14, "resortID": 13, "liftName": "Coca-Cola Tube Park", "status": "X", "timeToRide": "4"}, {"liftID": 33, "resortID": 13, "liftName": "Whistler Village Gondola Lower", "status": "X", "timeToRide": "5"}, {"liftID": 72, "resortID": 13, "liftName": "Whistler Village Gondola Upper", "status": "X", "timeToRide": "11"}, {"liftID": 45, "resortID": 13, "liftName": "Fitzsimmons Express", "status": "X", "timeToRide": "6"}, {"liftID": 40, "resortID": 13, "liftName": "Garbanzo Express", "status": "X", "timeToRide": "7"}, {"liftID": 34, "resortID": 13, "liftName": "Creekside Gondola", "status": "X", "timeToRide": "7"}, {"liftID": 36, "resortID": 13, "liftName": "Big Red Express", "status": "X", "timeToRide": "8"}, {"liftID": 35, "resortID": 13, "liftName": "Emerald 6 Express", "status": "X", "timeToRide": "6"}, {"liftID": 22, "resortID": 13, "liftName": "Peak 2 Peak Gondola", "status": "X", "timeToRide": "12"}, {"liftID": 39, "resortID": 13, "liftName": "Olympic Chair", "status": "X", "timeToRide": "5"}, {"liftID": 44, "resortID": 13, "liftName": "Franz's Chair", "status": "X", "timeToRide": "8"}, {"liftID": 43, "resortID": 13, "liftName": "Peak Express", "status": "X", "timeToRide": "3"}, {"liftID": 37, "resortID": 13, "liftName": "Harmony 6 Express", "status": "X", "timeToRide": "6"}, {"liftID": 42, "resortID": 13, "liftName": "Symphony Express", "status": "X", "timeToRide": "7"}, {"liftID": 41, "resortID": 13, "liftName": "T-Bars", "status": "X", "timeToRide": "5"}]
}
""")

In [52]:
a == b

False

In [104]:
merge_matching_jsons(suffix="test.json", save_file=merged_test_file)
df_test = load_merged_json_as_df(merged_test_file)
df_test = set_lifts_df_datatypes(df_test)
lifts_status_changes_df_test = get_status_changes(df_test)

In [105]:
lifts_status_changes_df.append(lifts_status_changes_df_test)

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,3,13,7th Heaven Express,O,6,2020-01-04 14:19:09.425451-08:00
1,3,13,7th Heaven Express,X,6,2020-01-04 14:49:08.722630-08:00
2,3,13,7th Heaven Express,O,6,2020-01-05 12:49:10.222813-08:00
3,3,13,7th Heaven Express,X,6,2020-01-05 14:49:08.549014-08:00
4,3,13,7th Heaven Express,O,6,2020-01-06 09:45:27.018996-08:00
...,...,...,...,...,...,...
482,72,13,Whistler Village Gondola Upper,X,11,2020-01-09 15:15:26.908233-08:00
483,72,13,Whistler Village Gondola Upper,H,11,2020-01-10 07:45:26.349643-08:00
484,72,13,Whistler Village Gondola Upper,O,11,2020-01-10 08:30:26.179537-08:00
485,72,13,Whistler Village Gondola Upper,X,11,2020-01-10 15:15:26.273412-08:00


In [None]:
# Get current lift status info json
# retrieve prior lift status info json
# if current == prior end
    # else:
        # merge_json(current and prior)
        # get status changes
        # append status changes to S3 lift history parquet table (create if it doesn't exist)
            # append via https://github.com/pandas-dev/pandas/issues/20638#issuecomment-386754025 ?
        # save current lift status info json as prior
        

In [1]:
import os

os.chdir("./snowbot_AWS_lambda/")

from scrape import get_data

os.chdir("..")

In [2]:
!pwd

/Users/paul/dev/snowbot


In [158]:
def jsons_to_df(jsons):
    # repeats. functionize first occurace?
    return pd.DataFrame.from_dict(json_normalize(result, record_path='lifts', meta='timestamp'))

In [162]:
jsons_to_df([json_content, lifts_current_json])
    

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,5,13,Excalibur Gondola Lower,O,3,2020-01-10 16:16:41.775318-08:00
1,6,13,Magic Chair,O,6,2020-01-10 16:16:41.775318-08:00
2,14,13,Coca-Cola Tube Park,O,4,2020-01-10 16:16:41.775318-08:00
3,69,13,Blackcomb Gondola Lower,X,7,2020-01-10 16:16:41.775318-08:00
4,70,13,Blackcomb Gondola Upper,X,7,2020-01-10 16:16:41.775318-08:00
5,71,13,Excalibur Gondola Upper,X,5,2020-01-10 16:16:41.775318-08:00
6,8,13,Excelerator Express,X,6,2020-01-10 16:16:41.775318-08:00
7,4,13,Jersey Cream Express,X,5,2020-01-10 16:16:41.775318-08:00
8,9,13,Catskinner Express,X,4,2020-01-10 16:16:41.775318-08:00
9,22,13,Peak 2 Peak Gondola,X,12,2020-01-10 16:16:41.775318-08:00


In [163]:
get_status_changes(df)

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,3,13,7th Heaven Express,X,6,2020-01-10 16:16:41.775318-08:00
1,36,13,Big Red Express,X,8,2020-01-10 16:16:41.775318-08:00
2,69,13,Blackcomb Gondola Lower,X,7,2020-01-10 16:16:41.775318-08:00
3,70,13,Blackcomb Gondola Upper,X,7,2020-01-10 16:16:41.775318-08:00
4,9,13,Catskinner Express,X,4,2020-01-10 16:16:41.775318-08:00
5,14,13,Coca-Cola Tube Park,O,4,2020-01-10 16:16:41.775318-08:00
6,34,13,Creekside Gondola,X,7,2020-01-10 16:16:41.775318-08:00
7,10,13,Crystal Ridge Express,X,7,2020-01-10 16:16:41.775318-08:00
8,35,13,Emerald 6 Express,X,6,2020-01-10 16:16:41.775318-08:00
9,5,13,Excalibur Gondola Lower,O,3,2020-01-10 16:16:41.775318-08:00


In [None]:
json.loads()
    
    
json.dump(result)

In [160]:
# Get current lift status info json
lifts_current = get_data()['lifts'] # String.  
lifts_current_json = json.loads(lifts_current)

In [161]:
# retrieve prior lift status info json

import botocore


def save_prior(json_data):
    bucket.put_object(Key="lifts_prior.json", Body=bytes(json.dumps(json_data).encode('UTF-8')))


lifts_prior_object = s3.Object(BUCKET_NAME, 'lifts_prior.json')

try:
    lifts_prior_object.load()
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":

        print("Prior doesn't exist")
        save_prior(lifts_current_json)  # Create the prior file
        print("Created PRIOR_FILENAME_TBD")
    else:
        # Something else has gone wrong.
        raise
else:
    # The prior exists
    file_content = lifts_prior_object.get()['Body'].read().decode('utf-8')
    json_content = json.loads(file_content)
    print("loaded prior json data from S3")
    
    # compare jsons without their timestamps
    if json_content['lifts'] == lifts_current_json['lifts']:
        print("No differences between current and prior data were found.")
    else:
        pass
        

loaded prior json data from S3


In [131]:

json_content['lifts'] != lifts_current_json['lifts']

False

In [132]:
json_content

{'timestamp': '2020-01-10 16:16:41.775318-08:00',
 'lifts': [{'liftID': 5,
   'resortID': 13,
   'liftName': 'Excalibur Gondola Lower',
   'status': 'O',
   'timeToRide': '3'},
  {'liftID': 6,
   'resortID': 13,
   'liftName': 'Magic Chair',
   'status': 'O',
   'timeToRide': '6'},
  {'liftID': 14,
   'resortID': 13,
   'liftName': 'Coca-Cola Tube Park',
   'status': 'O',
   'timeToRide': '4'},
  {'liftID': 69,
   'resortID': 13,
   'liftName': 'Blackcomb Gondola Lower',
   'status': 'X',
   'timeToRide': '7'},
  {'liftID': 70,
   'resortID': 13,
   'liftName': 'Blackcomb Gondola Upper',
   'status': 'X',
   'timeToRide': '7'},
  {'liftID': 71,
   'resortID': 13,
   'liftName': 'Excalibur Gondola Upper',
   'status': 'X',
   'timeToRide': '5'},
  {'liftID': 8,
   'resortID': 13,
   'liftName': 'Excelerator Express',
   'status': 'X',
   'timeToRide': '6'},
  {'liftID': 4,
   'resortID': 13,
   'liftName': 'Jersey Cream Express',
   'status': 'X',
   'timeToRide': '5'},
  {'liftID': 9,


In [133]:
lifts_current_json

{'timestamp': '2020-01-10 16:21:18.282773-08:00',
 'lifts': [{'liftID': 5,
   'resortID': 13,
   'liftName': 'Excalibur Gondola Lower',
   'status': 'O',
   'timeToRide': '3'},
  {'liftID': 6,
   'resortID': 13,
   'liftName': 'Magic Chair',
   'status': 'O',
   'timeToRide': '6'},
  {'liftID': 14,
   'resortID': 13,
   'liftName': 'Coca-Cola Tube Park',
   'status': 'O',
   'timeToRide': '4'},
  {'liftID': 69,
   'resortID': 13,
   'liftName': 'Blackcomb Gondola Lower',
   'status': 'X',
   'timeToRide': '7'},
  {'liftID': 70,
   'resortID': 13,
   'liftName': 'Blackcomb Gondola Upper',
   'status': 'X',
   'timeToRide': '7'},
  {'liftID': 71,
   'resortID': 13,
   'liftName': 'Excalibur Gondola Upper',
   'status': 'X',
   'timeToRide': '5'},
  {'liftID': 8,
   'resortID': 13,
   'liftName': 'Excelerator Express',
   'status': 'X',
   'timeToRide': '6'},
  {'liftID': 4,
   'resortID': 13,
   'liftName': 'Jersey Cream Express',
   'status': 'X',
   'timeToRide': '5'},
  {'liftID': 9,


In [68]:
type(json_content)

dict

In [70]:
type(lifts_current)

str

In [None]:
# Write the file from S3 into a local temp file
with open('temp', 'wb') as tfw:
    bucket.download_fileobj(f, tfw)