In [1]:
import json, glob, boto3, os
import pdb
import pandas as pd

# Testing out json processing

In [2]:
session = boto3.Session()
BUCKET_NAME = 'snowbot-pv'

# S3 Connect
s3 = session.resource('s3')

bucket = s3.Bucket(BUCKET_NAME)

In [3]:
# parquet engines don't handle shifted timezones
TZ = pytz.timezone('America/Vancouver')

DATA_DIR = "../data/"
MERGED_FILENAME = "merged_file.json"
merged_file = DATA_DIR + MERGED_FILENAME

TEST_FILENAME = "test_file.json"
merged_test_file = DATA_DIR + TEST_FILENAME


In [230]:
# from https://alexwlchan.net/2019/07/listing-s3-keys/
def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    Generate objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional).
    :param suffix: Only fetch objects whose keys end with
        this suffix (optional).
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    kwargs = {'Bucket': bucket}

    # We can pass the prefix directly to the S3 API.  If the user has passed
    # a tuple or list of prefixes, we go through them one by one.
    if isinstance(prefix, str):
        prefixes = (prefix, )
    else:
        prefixes = prefix

    for key_prefix in prefixes:
        kwargs["Prefix"] = key_prefix

        for page in paginator.paginate(**kwargs):
            try:
                contents = page["Contents"]
            except KeyError:
                return

            for obj in contents:
                key = obj["Key"]
                if key.endswith(suffix):
                    yield obj


def get_matching_s3_keys(bucket, prefix="", suffix=""):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    for obj in get_matching_s3_objects(bucket, prefix, suffix):
        yield obj["Key"]


def merge_matching_jsons(save_file, suffix=""):

    result = []

    for f in get_matching_s3_keys(BUCKET_NAME, suffix=suffix):

        # Write the file from S3 into a local temp file
        with open('temp', 'wb') as tfw:
            bucket.download_fileobj(f, tfw)

        # Append the local temp file into the result list
        with open('temp', 'rb') as tfr:
            result.append(json.load(tfr))

    os.remove("temp")

    # Fill the output file with the merged content
    with open(save_file, "w") as outfile:
        json.dump(result, outfile)

# TBD: more efficient to go straight to df w/o saving json to file


def set_lifts_df_datatypes(df):

    # Important to set categories because when writing incrementally to parquet, some increments
    # may not include all statuses.  Manually setting the categories avoids errors due to
    # different catergory indexing between increments.
    status_cat_dtype = pd.api.types.CategoricalDtype(
        categories=['X', 'H', 'O'], ordered=True)

    # set datatypes for lift table
    df = df.astype({
        "liftID": 'category',
        "resortID": 'category',
        "liftName": 'category',
        "status": status_cat_dtype,
        "timeToRide": "int"
    })
    df["timestamp"] = pd.to_datetime(df["timestamp"])

    return df


def jsons_to_df(jsons, record_path):
    df = pd.DataFrame.from_dict(pd.json_normalize(
        jsons, record_path=record_path, meta='timestamp'))
    df = set_lifts_df_datatypes(df)
    return df


def load_merged_json_as_df(merged_file, record_path):
    # load the merged json as a dataframe
    with open(merged_file, "r") as f:
        d = json.load(f)
        df = jsons_to_df(d, record_path)
        return df


def get_status_changes(df, keep_oldest=False):
    """
    Filter out rows that do not represent a status change.

    Parameters
    ----------
    df : pandas.DataFrame
        Includes 'status' and timestamp columns.  Lists status of every lift for each timestamp.
    keep_oldest : boolean
        Indicates if the returned DataFrame should keep the oldest status for each lift even if
        a lift has no status changes.  This is so that the earliest status for each lift is not
        lost, and all lifts are listed the returned DataFrame even if their status has not
        changed.  Use `False` when there is just one DataFrame to process.  Use `True` is cases
        where the status chages will be appended to an existing dataframe that already has at
        least one row for each lift.

    Returns
    -------
    pandas.DataFrame
        Only includes the rows from the original dataframe where there was a change to a new
        status.
    """

    def calc_status_change(df, keep_oldest=keep_oldest):
        change_rows = df[df.status.ne(df.status.shift())]

        if keep_oldest:
            firstrow = df.loc[df['timestamp'].idxmin()]
            keep_df = firstrow.to_frame().T.append(change_rows)
        else:
            keep_df = change_rows

        # Remove so that we don't need to write another column to S3 as we scrape?
        # Just calculate it when plotting and predicting?
        # keep_df['time_diff'] = keep_df['timestamp'].diff(1).shift(-1)

        return keep_df

    df = df.groupby('liftName', group_keys=False)\
           .apply(calc_status_change)\
           .reset_index(drop=True)

    df = set_lifts_df_datatypes(df)

    return df

# TBD: may need to convert timestamp to days (e.g. for Tableau)

### Test code

In [None]:
test_df = load_merged_json_as_df(merged_test_file, 'lifts')
test_df.sort_values(by=['liftID', 'timestamp'])

In [6]:
# TBD: test that len is 1 when keep_oldest=False and larger otherwise
get_status_changes(test_df, keep_oldest=False).sort_values(by=['liftID', 'timestamp'])

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,69,13,Blackcomb Gondola Lower,O,7,2020-01-03 00:19:09.631011-08:00


# Process lift json fies

In [None]:
merge_matching_jsons(suffix="lifts.json", save_file=merged_file)

In [250]:
lifts_df = load_merged_json_as_df(merged_file, 'lifts')

lifts_status_changes_df = get_status_changes(lifts_df, keep_oldest=True)

**NOTE:** `timeToRide` is just the time is takes to ride the lift, not the current wait time:

In [251]:
lifts_df.groupby("liftName")['timeToRide'].unique()

liftName
7th Heaven Express                 [6]
Big Red Express                    [8]
Blackcomb Gondola Lower            [7]
Blackcomb Gondola Upper            [7]
Catskinner Express                 [4]
Coca-Cola Tube Park                [4]
Creekside Gondola                  [7]
Crystal Ridge Express              [7]
Emerald 6 Express                  [6]
Excalibur Gondola Lower            [3]
Excalibur Gondola Upper            [5]
Excelerator Express                [6]
Fitzsimmons Express                [6]
Franz's Chair                      [8]
Garbanzo Express                   [7]
Glacier Express                    [6]
Harmony 6 Express                  [6]
Horstman T-Bar                     [4]
Jersey Cream Express               [5]
Magic Chair                        [6]
Olympic Chair                      [5]
Peak 2 Peak Gondola               [12]
Peak Express                       [3]
Showcase T-Bar                     [3]
Symphony Express                   [7]
T-Bars          

In [276]:
lifts_df

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,69,13,Blackcomb Gondola Lower,X,7,2020-01-03 00:19:09.631011-08:00
1,70,13,Blackcomb Gondola Upper,X,7,2020-01-03 00:19:09.631011-08:00
2,5,13,Excalibur Gondola Lower,X,3,2020-01-03 00:19:09.631011-08:00
3,71,13,Excalibur Gondola Upper,X,5,2020-01-03 00:19:09.631011-08:00
4,8,13,Excelerator Express,X,6,2020-01-03 00:19:09.631011-08:00
...,...,...,...,...,...,...
84559,22,13,Peak 2 Peak Gondola,X,12,2020-02-03 10:30:26.557898-08:00
84560,44,13,Franz's Chair,X,8,2020-02-03 10:30:26.557898-08:00
84561,43,13,Peak Express,X,3,2020-02-03 10:30:26.557898-08:00
84562,42,13,Symphony Express,X,7,2020-02-03 10:30:26.557898-08:00


In [16]:
def get_status_durations(lifts_df):
    '''Calculate values and add columns for the time difference between the
    timestamp for the current status and the timestamp for the next status
    for each lift:
    `time_diff` column: Gives the duration that the lift was in the status indicated in the `status` column.
    `time_diff_seconds` column: `time_diff` converted to seconds.
    
    lifts_status_changes_df should be TBD
    '''
    # TBD: optimize if needed via # 3 under:
    # https://towardsdatascience.com/pandas-tips-and-tricks-33bcc8a40bb9
    df = lifts_df.sort_values(by=['liftID', 'timestamp'])
    df['time_diff'] = df.groupby('liftID')['timestamp'].diff(1).shift(-1)

    # Fill in the durations which will be missing for the most recent status changes
    missing_time_diffs_idx = df.loc[(df['time_diff'].isnull()) & (
        df['timestamp'] >= df['timestamp'].min()), 'timestamp'].index.values

    df.loc[missing_time_diffs_idx, 'time_diff'] = df['timestamp'].max(
    ) - df.loc[missing_time_diffs_idx, 'timestamp']

    # Convert to seconds
    df['time_diff_seconds'] = df['time_diff'].dt.total_seconds()

    return df

In [253]:
df = get_status_durations(lifts_status_changes_df)

# Uses local date formatting, otherwise Tableau will mix up month and day
# alternatively, can export to json:
# lifts_status_changes_df.to_json(DATA_DIR + "lifts_status_changes.json", orient='table')
df.to_csv(DATA_DIR + "lifts_status_changes.csv", date_format='%c')


In [17]:
# add:
# 
# daily: for each chair calculate most open status of the day: O > H > X
# Days since each chair was last seen open with timestamp of most recent open time.
# snowfall since last open
# save data for other mountains

# Storage options testing

In [None]:
df.to_pickle(DATA_DIR + "df_test.pkl")

In [None]:
from fastparquet import write

# parquet engines don't handle shifted timezones
import pytz
TZ = pytz.timezone('America/Vancouver')
df['timestamp'] = df.timestamp.dt.tz_convert(pytz.utc)

In [None]:
# Note: May need snappy-python as a req to run on AWS Lambda
df.to_parquet(DATA_DIR + "df_test.parquet", engine='fastparquet')

In [None]:
load_df = pd.read_parquet(DATA_DIR + "df_test.parquet")
load_df['timestamp'] = load_df.timestamp.dt.tz_convert(TZ) # convert back to correct timezone


In [None]:
#TBD convert back to correct datatypes
load_df.dtypes

In [None]:
df.to_csv(DATA_DIR + "df_test.csv")

Test file size results:
- json: 800 Kb?
- csv: 474 Kb
- pickle: 145 Kb
- parquet: 15 Kb

## Delta Lake Testing

Requires apache spark instance.  For future use, could set one up to work with lambda using https://aws.amazon.com/emr/features/spark/?

Otherwise databricks (similar to QxMD project)

# json comparison and parquet to S3

In [119]:
from fastparquet import write, ParquetFile
import os
import pytz
import s3fs
import botocore

os.chdir("../src/data/snowbot_AWS_lambda/")
from scrape import get_data
os.chdir("../../../notebooks")

CPU times: user 15 µs, sys: 254 µs, total: 269 µs
Wall time: 696 µs


In [8]:
fs = s3fs.S3FileSystem()
myopen = fs.open
nop = lambda *args, **kwargs: None

HISTORY_FNAME = 'wb_lifts_history.parquet'
PRIOR_STATUS_FNAME = 'lifts_prior.json'

In [10]:
def write_dataframe_to_parquet_on_s3(df, fname):
    """ Write a dataframe to a Parquet file on S3.  Creates a new parquet file if one
    doesn't already exist.
    """

    def write_parquet(df, fname, app=True):

        output_file = f"s3://{BUCKET_NAME}/{fname}"
        write(output_file,
              df,
              # partition_on=['timestamp'],
              file_scheme='hive',
              append=app,  # need to remove or catch exception to work when file doesn't exist
              open_with=myopen,
              mkdirs=nop)
        print(f"Writing {len(df)} records to {fname}.")

    # Unshift the timezone because parquet engines don't handle shifted timezones
    df.loc[:, 'timestamp'] = df.loc[:, 'timestamp'].dt.tz_convert(pytz.utc)

    s3_object = s3.Object(BUCKET_NAME, fname)

    if not list(bucket.objects.filter(Prefix=fname)):
        print(f"File {fname} not found.  Creating new file.")
        # Keep oldest status for each lift because creating new file
        df = get_status_changes(df, keep_oldest=True)
        write_parquet(df, fname, app=False)

    else:
        print(f"File {fname} found.")
        df = get_status_changes(df, keep_oldest=False)
        write_parquet(df, fname, app=True)

## Generalized version
For all data from the EpicMix API

In [220]:
resortID=1
if resortID:
    print("yes")

yes


In [232]:
import requests
from datetime import datetime

# Filter for a specific resort
def filter_resort(data, resortID=None):
    if resortID:
        return data["resortID"] == resortID
    else:
        return data

def get_data_all_TEST():
    """new version: Defaults to all resorts.  Otion to filter"""
    API_URL = 'http://www.epicmix.com/vailresorts/sites/epicmix/api/mobile/'
    DATA_LIST = {'lifts': 'lifts', 'weather': 'snowconditions', 'terrain': 'terrains'}  # keys are used in the requests, the values and used in the response
    json_data = dict()

    for d, name in DATA_LIST.items():
        res = requests.get(API_URL + d + '.ashx')
        res.raise_for_status()
        data = json.loads(res.text)[name]
        data = list(filter(filter_resort, data))
        json_data[d] = json.dumps({'timestamp': str(datetime.now(TZ)), d: data})

    return json_data

In [235]:
HISTORY_SUFFIX = '_history_DEV.parquet'
PRIOR_SUFFIX = '_prior_DEV.json'


class ParquetWriter():
    """Identifies new data and writes it to Parquet file on S3."""

    def __init__(self):
        # Get current data
        self.data_current_all = get_data_all_TEST()  # String.

    def write_new_data_all(self):
        """Writes new data for each type (i.e. 'lifts', 'weather', 'terrian')
        of data returned by the API.
        """
        # self.table is the type of data
        for self.table in self.data_current_all:
            self.current_json = json.loads(self.data_current_all[self.table])
            self.prior_fname = self.table + PRIOR_SUFFIX
            self.prior_object = s3.Object(BUCKET_NAME, self.prior_fname)
            self.write_new_data()

    def write_new_data(self):
        """If new data since the last update of Parquet file is found, add it to the Parquet
        file.  Save the current data as json to serve as the prior in the next comparison.
        """

        # Get prior data json
        try:
            self.prior_object.load()
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print("Prior doesn't exist")
                # Create the prior file
                self.save_prior_data()
                print(f"Created {self.prior_fname}")
            else:
                # Something else has gone wrong.
                raise
        else:
            # The prior data file exists
            self.get_prior_data()
            if self.data_changed():

                # Get a df with the status chages between the prior and current json data
                df = jsons_to_df([self.prior_json, self.current_json], record_path=self.table)
                write_dataframe_to_parquet_on_s3(df, self.table + HISTORY_SUFFIX)

                # save current data json as prior
                self.save_prior_data()
                print(
                    f"Replaced data in {self.prior_object.key} with current data.")

    def get_prior_data(self):
        prior = self.prior_object.get()[
            'Body'].read().decode('utf-8')
        self.prior_json = json.loads(prior)
        print(f"Loaded prior {self.table} json data from S3")

    def data_changed(self):
        """Compare current data json with prior data json without their timestamps.  The timestamps
        on the current json will always be more recent even when none of the other data has changed.
        """
        if self.prior_json[self.table] == self.current_json[self.table]:
            print(
                f"No differences between current and prior {self.table} data were found.")
            return False
        else:
            print(
                f"Found differences between current and prior {self.table} data.")
            return True

    def save_prior_data(self):
        """Save the current data as prior data on S3."""
        bucket.put_object(Key=self.prior_fname,
                          Body=bytes(json.dumps(self.current_json).encode('UTF-8')))

In [236]:
pr = ParquetWriter()
pr.write_new_data_all()

Loaded prior lifts json data from S3
Found differences between current and prior lifts data.
File lifts_history_DEV.parquet found.


  index_cols = [{'name': index_cols.name, 'start': index_cols._start,
  'stop': index_cols._stop, 'step': index_cols._step,
  'stop': index_cols._stop, 'step': index_cols._step,


Writing 22 records to lifts_history_DEV.parquet.
Replaced data in lifts_prior_DEV.json with current data.
Loaded prior weather json data from S3
Found differences between current and prior weather data.


KeyError: 'Only a column name can be used for the key in a dtype mappings argument.'

In [206]:
parq_df = load_dataframe_from_parquet_on_s3('lifts' + HISTORY_SUFFIX)

lifts_status_changes_parq_df = get_status_durations(parq_df)
lifts_status_changes_parq_df.to_csv(DATA_DIR + "lifts_status_changes_parq.csv", date_format='%c')
lifts_status_changes_parq_df

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp,time_diff,time_diff_seconds
0,3,13,7th Heaven Express,O,6,2020-02-04 14:38:49.276014-08:00,00:09:56.128553,596.128553
18,4,13,Jersey Cream Express,O,5,2020-02-04 14:38:49.276014-08:00,00:09:56.128553,596.128553
9,5,13,Excalibur Gondola Lower,O,3,2020-02-04 14:38:49.276014-08:00,00:09:56.128553,596.128553
19,6,13,Magic Chair,O,6,2020-02-04 14:38:49.276014-08:00,00:09:56.128553,596.128553
15,7,13,Glacier Express,O,6,2020-02-04 14:38:49.276014-08:00,00:09:56.128553,596.128553
11,8,13,Excelerator Express,O,6,2020-02-04 14:38:49.276014-08:00,00:09:56.128553,596.128553
4,9,13,Catskinner Express,O,4,2020-02-04 14:38:49.276014-08:00,00:09:56.128553,596.128553
7,10,13,Crystal Ridge Express,O,7,2020-02-04 14:38:49.276014-08:00,00:09:56.128553,596.128553
17,11,13,Horstman T-Bar,X,4,2020-02-04 14:38:49.276014-08:00,00:09:56.128553,596.128553
23,12,13,Showcase T-Bar,O,3,2020-02-04 14:38:49.276014-08:00,00:09:56.128553,596.128553


In [207]:
parq_df.dtypes

liftID                                 category
resortID                               category
liftName                               category
status                                 category
timeToRide                                int64
timestamp     datetime64[ns, America/Vancouver]
dtype: object

In [164]:
s3.Object(BUCKET_NAME, 'lifts' + PRIOR_SUFFIX).delete()
s3.Object(BUCKET_NAME, 'terrain' + PRIOR_SUFFIX).delete()
s3.Object(BUCKET_NAME, 'weather' + PRIOR_SUFFIX).delete()

{'ResponseMetadata': {'RequestId': '776FA24CA4BB91C5',
  'HostId': 'AoNqwBsoH++T97P4H/+75E0tIrsQ22yNWJAt0mrCHf2pNROay9afsm5ZB6jToAzx4YOtebd06yc=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'AoNqwBsoH++T97P4H/+75E0tIrsQ22yNWJAt0mrCHf2pNROay9afsm5ZB6jToAzx4YOtebd06yc=',
   'x-amz-request-id': '776FA24CA4BB91C5',
   'date': 'Tue, 04 Feb 2020 22:10:42 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

## Original Version
Handles lifts only

In [47]:
class ParquetWriter():
    """Identifies new data and writes it to Parquet file on S3."""

    def __init__(self):
        # Get current lift status info json
        lifts_current = get_data()['lifts']  # String.
        self.lifts_current_json = json.loads(lifts_current)

        self.lifts_prior_object = s3.Object(BUCKET_NAME, PRIOR_STATUS_FNAME)

    def write_new_data(self):
        """If new data since the last update of Parquet file is found, add it to the Parquet
        file.  Save the current data as json to serve as the prior in the next comparison.
        """

        # Get prior lift status info json
        try:
            self.lifts_prior_object.load()
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print("Prior doesn't exist")
                # Create the prior file
                self.save_prior_data()
                print(f"Created {PRIOR_STATUS_FNAME}")
            else:
                # Something else has gone wrong.
                raise
        else:
            # The prior exists
            self.get_prior_data()
            if self.data_changed():

                # Get a df with the status chages between the prior and current json data
                df = jsons_to_df([self.lifts_prior_json, self.lifts_current_json])
                write_dataframe_to_parquet_on_s3(df, HISTORY_FNAME)

                # save current lift status info json as prior
                self.save_prior_data()
                print(
                    f"Replaced data in {self.lifts_prior_object.key} with current data.")

    def get_prior_data(self):
        lifts_prior = self.lifts_prior_object.get()[
            'Body'].read().decode('utf-8')
        self.lifts_prior_json = json.loads(lifts_prior)
        print("Loaded prior json data from S3")

    def data_changed(self):
        """Compare current data json with prior data json without their timestamps.  The timestamps
        on the current json will always be more recent even when none of the lift statuses have changed.
        """
        if self.lifts_prior_json['lifts'] == self.lifts_current_json['lifts']:
            print("No differences between current and prior data were found.")
            return False
        else:
            print("Found differences between current and prior data.")
            return True

    def save_prior_data(self):
        """Save the current data as prior data on S3."""
        bucket.put_object(Key=PRIOR_STATUS_FNAME,
                          Body=bytes(json.dumps(self.lifts_current_json).encode('UTF-8')))

In [117]:
ParquetWriter().write_new_data()

Loaded prior json data from S3
Found differences between current and prior data.
File wb_lifts_history.parquet found.


  index_cols = [{'name': index_cols.name, 'start': index_cols._start,
  'stop': index_cols._stop, 'step': index_cols._step,
  'stop': index_cols._stop, 'step': index_cols._step,


Writing 14 records to wb_lifts_history.parquet.
Replaced data in lifts_prior.json with current data.


**Warnings**

See https://github.com/dask/fastparquet/issues/477 for fastparquet warnings about `RangeIndex._start, RangeIndex._stop, RangeIndex._step`


    /Users/paul/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:90: FutureWarning: pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead
    /Users/paul/anaconda3/lib/python3.7/site-packages/fastparquet/writer.py:655: FutureWarning: RangeIndex._start is deprecated and will be removed in a future version. Use RangeIndex.start instead
      index_cols = [{'name': index_cols.name, 'start': index_cols._start,
    /Users/paul/anaconda3/lib/python3.7/site-packages/fastparquet/writer.py:656: FutureWarning: RangeIndex._stop is deprecated and will be removed in a future version. Use RangeIndex.stop instead
      'stop': index_cols._stop, 'step': index_cols._step,
    /Users/paul/anaconda3/lib/python3.7/site-packages/fastparquet/writer.py:656: FutureWarning: RangeIndex._step is deprecated and will be removed in a future version. Use RangeIndex.step instead
      'stop': index_cols._stop, 'step': index_cols._step,

In [182]:
def s3_object_exists(fname):
    """Check if an s3 object exists.  Returns `True` if the object exists."""
    try:
        s3.Object(BUCKET_NAME, fname)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print(f"{fname} doesn't exist")
        else:
            raise
    return True


def load_dataframe_from_parquet_on_s3(fname):
    """ Load a dataframe from a Parquet file on S3. """
    if s3_object_exists(fname):
        read_file = f"s3://{BUCKET_NAME}/{fname}"
        pf = ParquetFile(read_file, open_with=myopen)
        df = pf.to_pandas()

        # Reshift the timezone because parquet engines don't handle shifted timezones
        df.loc[:, 'timestamp'] = df.loc[:, 'timestamp'].dt.tz_convert(TZ)

        return df

In [200]:
parq_df = load_dataframe_from_parquet_on_s3(HISTORY_FNAME)

lifts_status_changes_parq_df = get_status_durations(parq_df)
lifts_status_changes_parq_df.to_csv(DATA_DIR + "lifts_status_changes_parq.csv", date_format='%c')
lifts_status_changes_parq_df

In [202]:
lifts_status_changes_parq_df.sort_values('timestamp')

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp,time_diff,time_diff_seconds
0,3,13,7th Heaven Express,O,6,2020-02-03 14:21:57.064742-08:00,00:47:48.055160,2868.055160
23,12,13,Showcase T-Bar,O,3,2020-02-03 14:21:57.064742-08:00,00:47:48.055160,2868.055160
26,41,13,T-Bars,O,5,2020-02-03 14:21:57.064742-08:00,00:47:48.055160,2868.055160
2,69,13,Blackcomb Gondola Lower,O,7,2020-02-03 14:21:57.064742-08:00,01:15:33.492083,4533.492083
21,22,13,Peak 2 Peak Gondola,X,12,2020-02-03 14:21:57.064742-08:00,18:49:17.304856,67757.304856
...,...,...,...,...,...,...,...,...
73,42,13,Symphony Express,O,7,2020-02-04 10:51:07.675750-08:00,03:47:41.600264,13661.600264
74,45,13,Fitzsimmons Express,X,6,2020-02-04 11:11:03.627551-08:00,03:30:00.349858,12600.349858
75,14,13,Coca-Cola Tube Park,O,4,2020-02-04 12:41:10.947905-08:00,01:59:53.029504,7193.029504
76,42,13,Symphony Express,X,7,2020-02-04 14:38:49.276014-08:00,00:02:14.701395,134.701395


In [124]:
parq_df.status.cat.categories

Index(['X', 'H', 'O'], dtype='object')

In [125]:
print(*parq_df.status)

O O O O O O O O O O O O X X O O O X O O O X O O O X O O O X X X X X X X X X X X X X X X X X X X X O O O O O O O O O O O O O O O O O O O O O O O O O X O


In [186]:
parq_df.sort_values(["liftName", "timestamp"])

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,3,13,7th Heaven Express,O,6,2020-02-03 14:21:57.064742-08:00
29,3,13,7th Heaven Express,X,6,2020-02-03 15:09:45.119902-08:00
67,3,13,7th Heaven Express,O,6,2020-02-04 09:21:14.155051-08:00
1,36,13,Big Red Express,O,8,2020-02-03 14:21:57.064742-08:00
35,36,13,Big Red Express,X,8,2020-02-03 15:37:30.556825-08:00
...,...,...,...,...,...,...
47,33,13,Whistler Village Gondola Lower,X,5,2020-02-03 15:37:30.556825-08:00
65,33,13,Whistler Village Gondola Lower,O,5,2020-02-04 09:11:14.369598-08:00
28,72,13,Whistler Village Gondola Upper,O,11,2020-02-03 14:21:57.064742-08:00
48,72,13,Whistler Village Gondola Upper,X,11,2020-02-03 15:37:30.556825-08:00


### Issue when running get_status_durations(parq_df)
Resulting in error:

    ~/anaconda3/lib/python3.7/site-packages/pandas/core/arrays/categorical.py in from_codes(cls, codes, categories, ordered, dtype)
        705 
        706         if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
    --> 707             raise ValueError("codes need to be between -1 and " "len(categories)-1")
        708 
        709         return cls(codes, dtype=dtype, fastpath=True)

    ValueError: codes need to be between -1 and len(categories)-1


Same error seen when running `parq_df[['status']].sort_values(by=['status'])`

This was caused by missing categories (`H`) in the `status` column (and maybe others)

#### Code to inspect issue:

In [228]:
# Test for issue
parq_df[['status']].sort_values(by=['status'])

Unnamed: 0,status
0,H
59,H
58,H
57,H
56,H
...,...
38,O
39,O
40,O
42,O


In [280]:
parq_df.status.cat.categories

Index(['X', 'H', 'O'], dtype='object')

In [281]:
print(*parq_df.status.cat.codes)

2 2 2 2 2 0 2 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2 0 2 0 2 0 2 2 2


In [205]:
len(parq_df.status.cat.codes)

65

In [185]:
# Should be false
parq_df.status.cat.codes.max() >= len(parq_df.status.dtype.categories)

False

In [186]:
# Should be false
parq_df.liftName.cat.codes.min() < -1

False

In [None]:
for c in parq_df.columns:
    print(parq_df[c].cat.categories)

In [36]:
parq_df.dtypes

liftID                   category
resortID                 category
liftName                 category
status                   category
timeToRide                  int64
timestamp     datetime64[ns, UTC]
dtype: object

In [113]:
lifts_status_changes_df.dtypes

liftID                                      category
resortID                                    category
liftName                                    category
status                                      category
timeToRide                                     int64
timestamp     datetime64[ns, pytz.FixedOffset(-480)]
dtype: object

In [106]:
parq_df["timestamp"] = pd.to_datetime(pd.Series(np.asarray(parq_df["timestamp"])))

In [73]:
read_file = f"s3://{BUCKET_NAME}/{fname}.parquet"
pf = ParquetFile(read_file, open_with=myopen)

# Check the categories for a specific row group
pf.grab_cats(columns='status', row_group_index=1)

{'status': array(['H', 'O', 'X'], dtype=object)}

In [66]:
# If partitioning by column, gives known values for each column
pf.cats

{}

#### Possible solutions
1. Remove partitioning by date column when writing to parquet
2. Set status categories manually via `set_categories`. (and any other columns with the same issue.  See https://github.com/dask/dask/issues/2944
3. Leave problem columns as text-based when writing and loading from parquet

# Testing timestamps for file loading

In [9]:
read_file = f"s3://{BUCKET_NAME}/{fname}.parquet"
pf = ParquetFile(read_file, open_with=myopen)
test = pf.to_pandas()["timestamp"]

In [100]:
# If needed: to convert for categorical datetime to regular datetime
df["timestamp"] = pd.to_datetime(pd.Series(np.asarray(df["timestamp"])))

	To accept the future behavior, pass 'dtype=object'.
	To keep the old behavior, pass 'dtype="datetime64[ns]"'.
  


In [101]:
test.dt = test.dt.tz_convert(tz= 'America/Vancouver')

/Users/paul/anaconda3/lib/python3.7/site-packages/pandas/core/series.py:597: FutureWarning: Converting timezone-aware DatetimeArray to timezone-naive ndarray with 'datetime64[ns]' dtype. In the future, this will return an ndarray with 'object' dtype where each element is a 'pandas.Timestamp' with the correct 'tz'.
	To accept the future behavior, pass 'dtype=object'.
	To keep the old behavior, pass 'dtype="datetime64[ns]"'.


more info: https://pandas-docs.github.io/pandas-docs-travis/whatsnew/v0.24.0.html#converting-timezone-aware-series-and-index-to-numpy-arrays

In [102]:
load_dataframe_from_parquet_on_s3(fname).dtypes

liftID        category
resortID      category
liftName      category
status        category
timeToRide       int64
timestamp     category
dtype: object

### Testing local parquet saves

In [None]:
def save_parquet(df, fname):
    # parquet engines don't handle shifted timezones
    df.loc[:, 'timestamp'] = df.loc[:, 'timestamp'].dt.tz_convert(pytz.utc)

    # Note: May need snappy-python as a req to run on AWS Lambda
    df.to_parquet(DATA_DIR + fname + '.parquet',
                  engine='fastparquet',
                  partition_on=['timestamp'],
                  file_scheme='mixed')

In [91]:
save_parquet(df[0:3].copy(), 'wb_lifts_history')

In [92]:
df.iloc[20:22, :].copy().to_parquet(DATA_DIR + 'wb_lifts_history' + '.parquet',
              engine='fastparquet',
              partition_on=['timestamp'],
              file_scheme='mixed',
              append=True)
# Catch exception that is doesn't exist here

In [None]:
# todo: change time_diff to "duration"
# test on lambda
# make datatype dict for and general set datatypes function


# Utilities

In [118]:
def del_lifts_history():
    s3.Object(BUCKET_NAME, HISTORY_FNAME).delete()

def del_lifts_prior():
    s3.Object(BUCKET_NAME, PRIOR_STATUS_FNAME).delete()

In [121]:
del_lifts_history()
del_lifts_prior()