# Module Code

In [1]:
%load_ext autoreload
# always reload modules so that as you change code in src, it gets loaded
%autoreload 2

import json, glob, boto3, os
import pdb
import pandas as pd
from fastparquet import write, ParquetFile
import pytz
import s3fs
import botocore
from collections import Iterable
from typing import List, Union
from copy import deepcopy
import warnings
import time
from src.data import get_data

  from pandas.core.index import CategoricalIndex, RangeIndex, Index, MultiIndex


In [3]:
session = boto3.Session()
BUCKET_NAME = 'snowbot-pv'

# S3 Connect
s3 = session.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)

In [4]:
# parquet engines don't handle shifted timezones
TZ = pytz.timezone('America/Vancouver')

DATA_DIR = "../data/"
MERGED_JSON_FILENAME = "merged_file.json"
merged_json_file = DATA_DIR + MERGED_JSON_FILENAME

# Used for weather data in jsons_to_df()
weather_meta_fields = [
    'newSnow', 'last24Hours', 'last48Hours', 'last7Days', 'midMountainBase',
    'resortID'
]
weather_record_path = ['weather', 'weatherForecast']
weather_meta = [['weather', i] for i in weather_meta_fields]
weather_meta.append('timestamp')

# Used for lift and terrain status in jsons_to_df()
# Important to set categories because when writing incrementally to parquet, some increments
# may not include all statuses.  Manually setting the categories avoids errors due to
# different catergory indexing between increments.
status_cat_dtype = pd.api.types.CategoricalDtype(categories=['X', 'H', 'O'],
                                                 ordered=True)
groomed_cat_dtype = pd.api.types.CategoricalDtype(categories=['No', 'Yes'],
                                                  ordered=True)

# Column dtypes that are to be set for each dataframe
df_dtypes = {
    "lifts": {
        'liftID': 'int8',
        'resortID': 'int8',
        'liftName': 'object',
        'status': status_cat_dtype,
        'timeToRide': 'int8'
    },
    'terrain': {
        'runID': 'int16',
        'resortID': 'int8',
        'groomed': groomed_cat_dtype,
        'runName': 'object',
        'runType': 'int8',
        'status': status_cat_dtype,
        'terrainName': 'object'
    },
    'weather': {
        'resortID': 'int8',
        'forecast.dayDescription': 'object',
        'forecast.daycode': 'int8',
        'forecast.forecastString': 'object',
        'forecast.iconName': 'object',
        'forecast.summaryDescription': 'object',
        'forecast.temperatureHigh': 'int8',
        'forecast.temperatureLow': 'int8',
        'weather.last24Hours': 'int8',
        'weather.last48Hours': 'int8',
        'weather.last7Days': 'int8',
        'weather.midMountainBase': 'int16',
        'weather.newSnow': 'int8'
    }
}


def flatten(items):
    """Yield items from any nested iterable"""
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            for sub_x in flatten(x):
                yield sub_x
        else:
            yield x


# The columns that serve to identify records for each topic
topic_ID_col_names = {
    'lifts': ['resortID', 'liftName'],
    'terrain': ['resortID', 'runID', 'terrainName'],
    'weather': 'resortID',
    'all_topics': 'timestamp'
}
# All of the column names that serve to identify records in at least one of the topics
all_ID_col_names = set(flatten(topic_ID_col_names.values()))

In [5]:
# from https://alexwlchan.net/2019/07/listing-s3-keys/
def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    Generate objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional).
    :param suffix: Only fetch objects whose keys end with
        this suffix (optional).
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    kwargs = {'Bucket': bucket}

    # We can pass the prefix directly to the S3 API.  If the user has passed
    # a tuple or list of prefixes, we go through them one by one.
    if isinstance(prefix, str):
        prefixes = (prefix, )
    else:
        prefixes = prefix

    for key_prefix in prefixes:
        kwargs["Prefix"] = key_prefix

        for page in paginator.paginate(**kwargs):
            try:
                contents = page["Contents"]
            except KeyError:
                return

            for obj in contents:
                key = obj["Key"]
                if key.endswith(suffix):
                    yield obj


def get_matching_s3_keys(bucket, prefix="", suffix=""):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    for obj in get_matching_s3_objects(bucket, prefix, suffix):
        yield obj["Key"]


def merge_matching_jsons_on_s3(save_file, prefix="", suffix=""):
    """Merges json files on S3 that match the suffix into a new json and save it
    as the save_file on S3."""

    result = []

    for f in get_matching_s3_keys(BUCKET_NAME, prefix=prefix, suffix=suffix):
        # TBD: more efficient to go straight to df w/o saving json to file?

        # Write the file from S3 into a local temp file
        with open('temp', 'wb') as tfw:
            bucket.download_fileobj(f, tfw)

        # Append the local temp file into the result list
        with open('temp', 'rb') as tfr:
            result.append(json.load(tfr))

    os.remove("temp")

    # Fill the output file with the merged content
    with open(save_file, "w") as outfile:
        json.dump(result, outfile)


def set_df_datatypes(df, topic):
    """Set the datatypes for a df according to the topic that
    it represents."""
    df = df.astype(df_dtypes[topic])
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    return df


def jsons_to_df(jsons, record_path, meta='timestamp'):
    """Convert a json containing one or more timestamps to a dataframe."""
    if record_path == 'weather':
        # Deal with the nested object that the weather data uses to store the weather forecast
        df = pd.json_normalize(jsons, record_path=weather_record_path,
                               meta=weather_meta, record_prefix='forecast.')
        df.rename(columns={"weather.resortID": "resortID"}, inplace=True)
    else:
        df = pd.json_normalize(jsons, record_path=record_path,
                               meta=meta)

    df = set_df_datatypes(df, record_path)
    return df


def load_json_as_df(merged_json_file, record_path):
    """Load json file containing one or more timestamps as a dataframe."""
    with open(merged_json_file, "r") as f:
        d = json.load(f)
        df = jsons_to_df(d, record_path)
        return df


def get_data_changes(df, topic, keep_oldest=False):
    """
    Filter out rows that do not represent changed data.

    Parameters
    ----------
    df : pandas.DataFrame
        Includes 'timestamp' identifying and data columns.  Lists data for each timestamp.
    keep_oldest : boolean
        Indicates if the returned DataFrame should keep the oldest record for each entity (i.e.
        lift, resort, tor terrain) even if an entity has no data changes.  This is so that the
        earliest data for each entity is not lost, and all entities are listed the returned DataFrame
        even if their data has not changed.  Use `False` when there is just one DataFrame to process.
        Use `True` is cases where the data changes will be appended to an existing dataframe that
        already has at least one row for each entity.

    Returns
    -------
    pandas.DataFrame
        Only includes the rows from the original dataframe where there was a change to new values
        in the data columns.
    """
    ID_columns = topic_ID_col_names[topic]
    data_columns = [c for c in df.columns if c not in all_ID_col_names]

    def filter_for_data_changes(df, keep_oldest=keep_oldest):
        """Filter out rows where data is unchanged for adjacent timestamps.
        Required to handle cases when there are > 2 rows per entity.
        """
        # TBD: optimize via slice_shift(), which doesn't copy data,
        # instead of shift()?
        keep_idx = df[data_columns].ne(df[data_columns].shift()).any(
            axis=1).values[1:]  # True for rows with data changes
        changed_rows = df.reset_index(drop=True).drop(index=0)[keep_idx]

        if keep_oldest:
            firstrow = df.loc[df['timestamp'].idxmin()]
            keep_df = firstrow.to_frame().T.append(changed_rows)
        else:
            keep_df = changed_rows

        return keep_df

    # Drop any rows that are complete duplicates so that conditional evaluation will
    # work.  This is required for Peak 2 Peak Gondola because it is duplicated in the
    # lifts data.  Maybe others as well.
    df.drop_duplicates(inplace=True)

    # 1 means that there were up to 2 rows found per group
    if df.groupby(ID_columns, group_keys=False).cumcount().max() < 2:
        # Most efficient method.  Only works if there are 2 or less rows per entity.
        subset = df.columns.drop('timestamp')
        df = df.sort_values('timestamp')

        if keep_oldest:
            df = df.drop_duplicates(subset=subset, keep='first')
        else:
            df = df.drop_duplicates(subset=subset, keep=False)
            df = df.drop_duplicates(subset=ID_columns, keep='last')

    else:
        # Less efficient method.  Required if there are > 2 rows per entity.
        df = df.sort_values('timestamp').groupby(ID_columns, group_keys=False)\
               .apply(filter_for_data_changes)\
               .reset_index(drop=True)

    records_are_unique(df, include_timestamp_in_colnames(ID_columns))
    
    # TBD: may not be neccessary if this will already have be done on the input df
    # Remove to optimize?
    df = set_df_datatypes(df, topic)

    return df


def records_are_unique(df: pd.DataFrame, record_id_cols: List[str]) -> bool:
    """Check if records in df can be uniquely identified using record_id_cols
    and raise warning if they are not."""
    df_indexed = df.set_index(record_id_cols)
    are_unique = df_indexed.index.is_unique
    if not are_unique:
        warnings.warn(
            f"\nSome records can not be uniquely identified using {record_id_cols}"
            f"\n{df_indexed[df_indexed.index.duplicated(keep = False)]}"
        )
    return are_unique


def include_timestamp_in_colnames(col_names: Union[List[str], str]) -> List[str]:
    """Returns a list of strings which includes 'timestamp' in addition to the list
    or sting given for `col_names`.
    
    >>> include_timestamp_in_colnames(topic_ID_col_names['terrain'])
    ['resortID', 'runID', 'terrainName', 'timestamp']
    """
    col_names = deepcopy(col_names)
    if type(col_names) == str : col_names = [col_names]
    col_names.extend(['timestamp'])
    return col_names

# Process lift json fies

In [6]:
def get_status_durations(lifts_df):
    '''Calculate values and add columns for the time difference between the
    timestamp for the current status and the timestamp for the next status
    for each lift:
    `time_diff` column: Gives the duration that the lift was in the status indicated in the `status` column.
    `time_diff_seconds` column: `time_diff` converted to seconds.
    '''
    # TBD: optimize if needed via # 3 under:
    # https://towardsdatascience.com/pandas-tips-and-tricks-33bcc8a40bb9
    record_id_cols = include_timestamp_in_colnames(topic_ID_col_names['lifts'])
    df = lifts_df.sort_values(by=record_id_cols)
    df['time_diff'] = df.groupby(topic_ID_col_names['lifts'])['timestamp'].diff(1).shift(-1)

    # Fill in the durations which will be missing for the most recent status changes
    missing_time_diffs_idx = df.loc[(df['time_diff'].isnull()) & (
        df['timestamp'] >= df['timestamp'].min()), 'timestamp'].index.values

    df.loc[missing_time_diffs_idx, 'time_diff'] = df['timestamp'].max(
    ) - df.loc[missing_time_diffs_idx, 'timestamp']

    # Convert to seconds
    df['time_diff_seconds'] = df['time_diff'].dt.total_seconds()

    return df

## Whistler Lifts

In [None]:
merge_matching_jsons_on_s3(suffix="lifts.json", save_file=merged_json_file)

In [None]:
whis_lifts_df = load_json_as_df(merged_json_file, 'lifts')

whis_lifts_status_changes_df = get_data_changes(whis_lifts_df, 'lifts', keep_oldest=True)

In [None]:
lifts_status_changes_df

**NOTE:** `timeToRide` is just the time is takes to ride the lift, not the current wait time:

In [None]:
whis_lifts_df.groupby("liftName")['timeToRide'].unique()

In [None]:
whis_lifts_df

In [None]:
whis_lifts_df = get_status_durations(whis_lifts_status_changes_df)

# Uses local date formatting, otherwise Tableau will mix up month and day
# alternatively, can export to json:
# lifts_status_changes_df.to_json(DATA_DIR + "lifts_status_changes.json", orient='table')
whis_lifts_df.to_csv(DATA_DIR + "whis_lifts_status_changes.csv", date_format='%c')


## Loop through Whistler JSON files for all topics

In [None]:
for topic in ['lifts', 'terrain', 'weather']:
    merge_matching_jsons_on_s3(
        suffix=topic + ".json", save_file=merged_json_file)
    df = load_json_as_df(merged_json_file, topic)
    status_changes_df = get_data_changes(df, topic, keep_oldest=True)

    if topic == 'lifts':
        get_status_durations(status_changes_df).to_csv(
            DATA_DIR + 'whis_lifts_status_changes.csv', date_format='%c')
    else:
        status_changes_df.to_csv(
            DATA_DIR + 'whis_' + topic + '_status_changes.csv', date_format='%c')

# Storage options testing

In [None]:
df.to_pickle(DATA_DIR + "df_test.pkl")

In [None]:
from fastparquet import write

# parquet engines don't handle shifted timezones
import pytz
TZ = pytz.timezone('America/Vancouver')
df['timestamp'] = df.timestamp.dt.tz_convert(pytz.utc)

In [None]:
# Note: May need snappy-python as a req to run on AWS Lambda
df.to_parquet(DATA_DIR + "df_test.parquet", engine='fastparquet')

In [None]:
load_df = pd.read_parquet(DATA_DIR + "df_test.parquet")
load_df['timestamp'] = load_df.timestamp.dt.tz_convert(TZ) # convert back to correct timezone


In [None]:
#TBD convert back to correct datatypes
load_df.dtypes

In [None]:
df.to_csv(DATA_DIR + "df_test.csv")

Test file size results:
- json: 800 Kb?
- csv: 474 Kb
- pickle: 145 Kb
- parquet: 15 Kb

## Delta Lake Notes

Requires apache spark instance.  For future use, could set one up to work with lambda using https://aws.amazon.com/emr/features/spark/?

Otherwise databricks (similar to QxMD project)

# Parquet on S3

For all topics from the EpicMix API.  Compare most recent topic data from json on S3 and if the data has changes, append the changes to parquet file on S3.

### Module Code

In [7]:
import requests
from datetime import datetime
fs = s3fs.S3FileSystem()
myopen = fs.open
nop = lambda *args, **kwargs: None

HISTORY_SUFFIX = '_history_DEV.parquet'
PRIOR_SUFFIX = '_prior_DEV.json'


def write_dataframe_to_parquet_on_s3(df, topic, fname):
    """ Write a dataframe to a Parquet file on S3.  Creates a new parquet file if one
    doesn't already exist.
    """

    def write_parquet(df, fname, app=True):

        output_file = f"s3://{BUCKET_NAME}/{fname}"
        write(output_file,
              df,
              # partition_on=['timestamp'],
              file_scheme='hive',
              append=app,  # need to remove or catch exception to work when file doesn't exist
              open_with=myopen,
              mkdirs=nop)
        print(f"Writing {len(df)} records to {fname}.")

    # Unshift the timezone because parquet engines don't handle shifted timezones
    df.loc[:, 'timestamp'] = df.loc[:, 'timestamp'].dt.tz_convert(pytz.utc)

    s3_object = bucket.Object(fname)

    if not list(bucket.objects.filter(Prefix=fname)):
        print(f"File {fname} not found.  Creating new file.")
        # Keep oldest record for each entity because creating new file
        df = get_data_changes(df, topic=topic, keep_oldest=True)
        write_parquet(df, fname, app=False)

    else:
        print(f"File {fname} found on S3.")
        df = get_data_changes(df, topic=topic, keep_oldest=False)
        write_parquet(df, fname, app=True)


def filter_resort(data, resortID: int = None) -> dict:
    """Filter for a specific resort."""
    if resortID:
        return data["resortID"] == resortID
    else:
        return data


def get_data(filter_topic: Union[str, List] = None, filter_resortID: int = None) -> dict:
    """Get data from EpicMix API. Defaults to all resorts.  Option to filter for a
    specific resort or topic.
    """
    API_URL = 'http://www.epicmix.com/vailresorts/sites/epicmix/api/mobile/'
    # keys are used in the requests, the values and used in the response
    DATA_LIST = {'lifts': 'lifts',
                 'weather': 'snowconditions', 'terrain': 'terrains'}
    json_data = dict()

    # Create lists to filter by topic
    if filter_topic is not None:
        filtered_data_list = {k: v for k,
                              v in DATA_LIST.items() if k in filter_topic}
    else:
        filtered_data_list = DATA_LIST

    for d, name in filtered_data_list.items():
        res = requests.get(API_URL + d + '.ashx')
        res.raise_for_status()
        data = json.loads(res.text)[name]
        data = list(filter(lambda x: filter_resort(x, filter_resortID), data))
        json_data[d] = json.dumps(
            {'timestamp': str(datetime.now(TZ)), d: data})

    return json_data


def s3_object_exists(fname):
    """Check if an s3 object exists.  Returns `True` if the object exists."""
    try:
        bucket.Object(fname)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print(f"{fname} doesn't exist")
        else:
            raise
    return True


def load_dataframe_from_parquet_on_s3(fname):
    """ Load a dataframe from a Parquet file on S3. """
    if s3_object_exists(fname):
        read_file = f"s3://{BUCKET_NAME}/{fname}"
        pf = ParquetFile(read_file, open_with=myopen)
        df = pf.to_pandas()

        # Reshift the timezone because parquet engines don't handle shifted timezones
        df.loc[:, 'timestamp'] = df.loc[:, 'timestamp'].dt.tz_convert(TZ)

        return df


class ApiData():
    def __init__(self, topic: str, current_json: str):
        self.topic = topic
        self.current_json = current_json
        # May not exist yet
        self.prior_fname = topic + PRIOR_SUFFIX
        self.prior_object = bucket.Object(self.prior_fname)
        self.check_prior_object()

    def check_prior_object(self):
        """Get prior data json"""
        try:
            self.prior_object.load()
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print(f"Prior json for {self.topic} doesn't exist")
                self.prior_exists = False
            else:
                # Something else has gone wrong.
                raise
        else:
            self.prior_exists = True
        return self.prior_exists

    def get_prior_data_json(self):
        """Get prior data json from S3."""
        if self.prior_exists == True:
            prior = self.prior_object.get()['Body'].read().decode('utf-8')
            self.prior_json = json.loads(prior)
            print(f"Loaded prior {self.topic} json data from S3")
            return self.prior_json
        else:
            print(f"Prior json for {self.topic} doesn't exist")

    def data_changed(self):
        """Compare current data json with prior data json without their timestamps.  The timestamps
        on the current json will always be more recent even when none of the other data has changed.
        """
        if self.prior_json[self.topic] == self.current_json[self.topic]:
            print(
                f"No differences between current and prior {self.topic} data were found.")
            return False
        else:
            print(
                f"Found differences between current and prior {self.topic} data.")
            # Check for changed keys using the first example in each list
            self.compare_dict_keys_recusive(
                self.prior_json[self.topic][0], self.current_json[self.topic][0])
            return True

    def save_prior_data(self):
        """Save the current data as prior data on S3."""
        bucket.put_object(Key=self.prior_fname,
                          Body=bytes(json.dumps(self.current_json).encode('UTF-8')))

    @staticmethod
    def compare_dict_keys_recusive(prior_dict: dict, curr_dict: dict) -> None:
        """Compare keys of two dictionaries and raise warning if they have been changed.
        Recursively checks keys of nested dictionaries.  Nested dictionaries are expected
        to be stored within lists.
        """
        if prior_dict.keys() - curr_dict.keys():
            warnings.warn(
                f'keys were changed:'
                f'\nOriginal keys: {prior_dict.keys()}'
                f'\n\tRemoved keys: {prior_dict.keys() - curr_dict.keys()}'
                f'\n\tAdded keys: {curr_dict.keys() - prior_dict.keys()}\n'
            )
        # Compare keys of nested dicts (using the prior dict)
        for k, v in prior_dict.items():
            if isinstance(v, list):
                ApiData.compare_dict_keys_recusive(v[0], curr_dict[k][0])


class ParquetWriter():
    """Identifies new data and writes it to Parquet file on S3."""

    def __init__(self):
        # Get current data
        self.data_current_all = get_data()  # String.

    def write_new_data_all(self):
        """Writes new data for each type (i.e. 'lift', 'weather', 'terrian')
        of data returned by the API.
        """
        for topic in self.data_current_all:
            current_json = json.loads(self.data_current_all[topic])
            data = ApiData(topic, current_json)
            self.write_new_data(data)

    def write_new_data(self, ApiData):
        """If current data has changed since the last update of Parquet file is, add it
        to the Parquet file.  Save the current data as json to serve as the prior for
        the next comparison.
        """

        if ApiData.prior_exists:
            ApiData.get_prior_data_json()
            if ApiData.data_changed():
                # Get a df with the chages between the prior and current json data
                df = jsons_to_df(
                    [ApiData.prior_json, ApiData.current_json], record_path=ApiData.topic)
                write_dataframe_to_parquet_on_s3(
                    df, ApiData.topic, ApiData.topic + HISTORY_SUFFIX)

                # save current data json as prior
                ApiData.save_prior_data()
                print(
                    f"Replaced data in {ApiData.prior_object.key} with current data.")
        else:
            print(f"Prior json for {ApiData.topic} doesn't exist")
            # Create the prior file
            ApiData.save_prior_data()
            print(f"Created {ApiData.prior_fname}")
        print('\n')

In [10]:
%%time
# Update parquet files once
pr = ParquetWriter()
pr.write_new_data_all()

Loaded prior lifts json data from S3
Found differences between current and prior lifts data.
File lifts_history_DEV.parquet found on S3.


Some records can not be uniquely identified using ['resortID', 'liftName', 'timestamp']
                                                                liftID status  \
resortID liftName             timestamp                                         
15       Silver Queen Express 2020-02-26 15:49:42.527076+00:00      11      X   
                              2020-02-26 15:49:42.527076+00:00      11      O   

                                                                timeToRide  
resortID liftName             timestamp                                     
15       Silver Queen Express 2020-02-26 15:49:42.527076+00:00           7  
                              2020-02-26 15:49:42.527076+00:00           7  
  index_cols = [{'name': index_cols.name, 'start': index_cols._start,
  'stop': index_cols._stop, 'step': index_cols._step,
  'stop': index_cols._stop, 'step': index_cols._step,


Writing 107 records to lifts_history_DEV.parquet.
Replaced data in lifts_prior_DEV.json with current data.


Loaded prior weather json data from S3
Found differences between current and prior weather data.
File weather_history_DEV.parquet found on S3.
Writing 15 records to weather_history_DEV.parquet.
Replaced data in weather_prior_DEV.json with current data.


Loaded prior terrain json data from S3
Found differences between current and prior terrain data.
File terrain_history_DEV.parquet found on S3.
Writing 498 records to terrain_history_DEV.parquet.
Replaced data in terrain_prior_DEV.json with current data.


CPU times: user 2.81 s, sys: 207 ms, total: 3.02 s
Wall time: 7.03 s


In [37]:
# Update parquet loop
while True:
    print('\n\n\n' + time.ctime() + ':\n---------------------')
    pr = ParquetWriter()
    pr.write_new_data_all()
    time.sleep(1800)




Thu Feb 20 21:09:50 2020:
---------------------
Loaded prior lifts json data from S3
Found differences between current and prior lifts data.
File lifts_history_DEV.parquet found on S3.


Some records can not be uniquely identified using ['resortID', 'liftName', 'timestamp']
                                                                liftID status  \
resortID liftName             timestamp                                         
15       Silver Queen Express 2020-02-21 05:09:51.508695+00:00      11      X   
                              2020-02-21 05:09:51.508695+00:00      11      O   

                                                                timeToRide  
resortID liftName             timestamp                                     
15       Silver Queen Express 2020-02-21 05:09:51.508695+00:00           7  
                              2020-02-21 05:09:51.508695+00:00           7  
  index_cols = [{'name': index_cols.name, 'start': index_cols._start,
  'stop': index_cols._stop, 'step': index_cols._step,
  'stop': index_cols._stop, 'step': index_cols._step,


Writing 118 records to lifts_history_DEV.parquet.
Replaced data in lifts_prior_DEV.json with current data.


Loaded prior weather json data from S3
Found differences between current and prior weather data.
File weather_history_DEV.parquet found on S3.
Writing 15 records to weather_history_DEV.parquet.
Replaced data in weather_prior_DEV.json with current data.


Loaded prior terrain json data from S3
Found differences between current and prior terrain data.
File terrain_history_DEV.parquet found on S3.
Writing 98 records to terrain_history_DEV.parquet.
Replaced data in terrain_prior_DEV.json with current data.





Thu Feb 20 21:39:59 2020:
---------------------
Loaded prior lifts json data from S3
No differences between current and prior lifts data were found.


Loaded prior weather json data from S3
Found differences between current and prior weather data.
File weather_history_DEV.parquet found on S3.
Writing 4 records to weather_history_DEV.parquet.
Replaced data in weather_prior_DEV

Some records can not be uniquely identified using ['resortID', 'liftName', 'timestamp']
                                                                liftID status  \
resortID liftName             timestamp                                         
15       Silver Queen Express 2020-02-21 14:40:00.192507+00:00      11      X   
                              2020-02-21 14:40:00.192507+00:00      11      O   

                                                                timeToRide  
resortID liftName             timestamp                                     
15       Silver Queen Express 2020-02-21 14:40:00.192507+00:00           7  
                              2020-02-21 14:40:00.192507+00:00           7  
  index_cols = [{'name': index_cols.name, 'start': index_cols._start,
  'stop': index_cols._stop, 'step': index_cols._step,
  'stop': index_cols._stop, 'step': index_cols._step,


Writing 33 records to lifts_history_DEV.parquet.
Replaced data in lifts_prior_DEV.json with current data.


Loaded prior weather json data from S3
Found differences between current and prior weather data.
File weather_history_DEV.parquet found on S3.
Writing 15 records to weather_history_DEV.parquet.
Replaced data in weather_prior_DEV.json with current data.


Loaded prior terrain json data from S3
Found differences between current and prior terrain data.
File terrain_history_DEV.parquet found on S3.
Writing 168 records to terrain_history_DEV.parquet.
Replaced data in terrain_prior_DEV.json with current data.





Fri Feb 21 07:10:07 2020:
---------------------
Loaded prior lifts json data from S3
Found differences between current and prior lifts data.
File lifts_history_DEV.parquet found on S3.


Some records can not be uniquely identified using ['resortID', 'liftName', 'timestamp']
                                                                liftID status  \
resortID liftName             timestamp                                         
15       Silver Queen Express 2020-02-21 15:10:08.127436+00:00      11      X   
                              2020-02-21 15:10:08.127436+00:00      11      O   

                                                                timeToRide  
resortID liftName             timestamp                                     
15       Silver Queen Express 2020-02-21 15:10:08.127436+00:00           7  
                              2020-02-21 15:10:08.127436+00:00           7  


Writing 4 records to lifts_history_DEV.parquet.
Replaced data in lifts_prior_DEV.json with current data.


Loaded prior weather json data from S3
Found differences between current and prior weather data.
File weather_history_DEV.parquet found on S3.
Writing 9 records to weather_history_DEV.parquet.
Replaced data in weather_prior_DEV.json with current data.


Loaded prior terrain json data from S3
Found differences between current and prior terrain data.
File terrain_history_DEV.parquet found on S3.
Writing 38 records to terrain_history_DEV.parquet.
Replaced data in terrain_prior_DEV.json with current data.





Fri Feb 21 07:40:17 2020:
---------------------
Loaded prior lifts json data from S3
Found differences between current and prior lifts data.
File lifts_history_DEV.parquet found on S3.


Some records can not be uniquely identified using ['resortID', 'liftName', 'timestamp']
                                                                liftID status  \
resortID liftName             timestamp                                         
15       Silver Queen Express 2020-02-21 15:40:18.383650+00:00      11      X   
                              2020-02-21 15:40:18.383650+00:00      11      O   

                                                                timeToRide  
resortID liftName             timestamp                                     
15       Silver Queen Express 2020-02-21 15:40:18.383650+00:00           7  
                              2020-02-21 15:40:18.383650+00:00           7  


Writing 49 records to lifts_history_DEV.parquet.
Replaced data in lifts_prior_DEV.json with current data.


Loaded prior weather json data from S3
Found differences between current and prior weather data.
File weather_history_DEV.parquet found on S3.
Writing 3 records to weather_history_DEV.parquet.
Replaced data in weather_prior_DEV.json with current data.


Loaded prior terrain json data from S3
Found differences between current and prior terrain data.
File terrain_history_DEV.parquet found on S3.
Writing 40 records to terrain_history_DEV.parquet.
Replaced data in terrain_prior_DEV.json with current data.




KeyboardInterrupt: 

**Warnings**

See https://github.com/dask/fastparquet/issues/477 for fastparquet warnings about `RangeIndex._start, RangeIndex._stop, RangeIndex._step`


    /Users/paul/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:90: FutureWarning: pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead
    /Users/paul/anaconda3/lib/python3.7/site-packages/fastparquet/writer.py:655: FutureWarning: RangeIndex._start is deprecated and will be removed in a future version. Use RangeIndex.start instead
      index_cols = [{'name': index_cols.name, 'start': index_cols._start,
    /Users/paul/anaconda3/lib/python3.7/site-packages/fastparquet/writer.py:656: FutureWarning: RangeIndex._stop is deprecated and will be removed in a future version. Use RangeIndex.stop instead
      'stop': index_cols._stop, 'step': index_cols._step,
    /Users/paul/anaconda3/lib/python3.7/site-packages/fastparquet/writer.py:656: FutureWarning: RangeIndex._step is deprecated and will be removed in a future version. Use RangeIndex.step instead
      'stop': index_cols._stop, 'step': index_cols._step,

## Load lifts data from parquet and save as .csv

In [None]:
parq_df = load_dataframe_from_parquet_on_s3('lifts' + HISTORY_SUFFIX)
# Needed because the `ordered` attribute of categorical dtypes gets reset to
# False after loading from the parquet file
parq_df = set_df_datatypes(parq_df, 'lifts')
lifts_status_changes_parq_df = get_status_durations(parq_df)
lifts_status_changes_parq_df.to_csv(
    DATA_DIR + "lifts_status_changes_parq.csv", date_format='%c')

# Utilities

In [14]:
def save_parquet(df, fname):
    """Save df to (local) parquet file.

    >>> save_parquet(df[0:3].copy(), 'wb_lifts_history')"""
    # parquet engines don't handle shifted timezones
    df.loc[:, 'timestamp'] = df.loc[:, 'timestamp'].dt.tz_convert(pytz.utc)

    # Note: May need snappy-python as a req to run on AWS Lambda
    df.to_parquet(DATA_DIR + fname + '.parquet',
                  engine='fastparquet',
                  partition_on=['timestamp'],
                  file_scheme='mixed')


def load_prior_json_from_s3(topic: str) -> dict:
    """>>> load_prior_json_from_s3('weather')"""
    prior_object = bucket.Object(topic + PRIOR_SUFFIX)
    prior = prior_object.get()['Body'].read().decode('utf-8')
    return json.loads(prior)


def dataframe_difference(df1, df2, which=None):
    """Find rows which are different between two DataFrames.
    Based on https://hackersandslackers.com/compare-rows-pandas-dataframes/"""
    comparison_df = df1.merge(df2,
                              indicator=True,
                              how='outer')
    if which is None:
        diff_df = comparison_df[comparison_df['_merge'] != 'both']
    else:
        diff_df = comparison_df[comparison_df['_merge'] == which]
    return diff_df

### S3 Object Deletion

In [None]:
# Delete prior jsons
bucket.Object('lifts' + PRIOR_SUFFIX).delete()
bucket.Object('terrain' + PRIOR_SUFFIX).delete()
bucket.Object('weather' + PRIOR_SUFFIX).delete()

In [None]:
# Delete parquet history files
bucket.objects.filter(Prefix='lifts' + HISTORY_SUFFIX + '/').delete()
bucket.objects.filter(Prefix='terrain' + HISTORY_SUFFIX + '/').delete()
bucket.objects.filter(Prefix='weather' + HISTORY_SUFFIX + '/').delete()

In [None]:
def del_lifts_history():
    bucket.Object(HISTORY_FNAME).delete()

def del_lifts_prior():
    bucket.Object(PRIOR_STATUS_FNAME).delete()

In [None]:
del_lifts_history()
del_lifts_prior()

# Testing

In [15]:
TEST_DATA_DIR = "../data/test/"
TEST_VALIDATION_DATA_DIR = TEST_DATA_DIR + "valid/"
lifts_json_test_file = TEST_DATA_DIR + "lifts_test.json"
terrain_json_test_file = TEST_DATA_DIR + "terrain_test.json"
weather_json_test_file = TEST_DATA_DIR + "weather_test.json"
merged_lifts_json_test_file = TEST_DATA_DIR + "merged_lifts_test.json"
merged_terrain_json_test_file = TEST_DATA_DIR + "merged_terrain_test.json"
merged_weather_json_test_file = TEST_DATA_DIR + "merged_weather_test.json"
merged_whis_lifts_json_test_file = TEST_DATA_DIR + "merged_whis_lifts_test.json"

### Find column combinations to identify entities

We can uniquely identify each run via a combination of: `resortID`, `runID` and `terrainName`:

In [None]:
from itertools import combinations

# Move to EDA notebook
# Do we really need to use all the terrain columns in topic_ID_col_names to uniquely identify each run?
ID_cols = ['resortID', 'runID', 'runName', 'terrainName']
df = load_json_as_df(terrain_json_test_file, 'terrain')

for combo in combinations(ID_cols, len(ID_cols)-1):
    print(f"Combo: {combo}\tDuplicates: {df.duplicated(combo).sum()}")
    
print(f"Combo: {ID_cols}\tDuplicates: {df.duplicated(ID_cols).sum()}")

We will always get one duplicate lift entry because the Peak 2 Peak Gondola is returned twice in the API data:

In [None]:
ID_cols = ['resortID', 'liftName', 'liftID']
df = load_json_as_df(lifts_json_test_file, 'lifts')

In [None]:
for combo in combinations(ID_cols, len(ID_cols)-1):
    print(f"Combo: {combo}\tDuplicates: {df.duplicated(combo).sum()}")

print(f"Combo: {ID_cols}\tDuplicates: {df.duplicated(ID_cols).sum()}")

There are run IDs that repeat for the same resort (e.g. for Vail resortID == 1, runID == 10)

### Tests

In [11]:
import unittest
import pytest

In [21]:
# new

# TBD add test descriptions?


class TestNotebook(unittest.TestCase):

    topic_names = ['lifts', 'terrain', 'weather']

    def test_get_data(self):
        self.assertEqual(
            list(get_data(filter_topic=['lifts']).keys()), ['lifts'],
            'Returned dictionary was not filtered for the right topic'
        )
        self.assertEqual(
            list(get_data(filter_topic=['lifts', 'terrain']).keys()),
            ['lifts', 'terrain'],
            'Returned dictionary was not filtered for the right topics'
        )
        self.assertEqual(
            set(get_data().keys()), {'lifts', 'weather', 'terrain'},
            'Returned dictionary was not filtered for all topics'
        )

    def test_get_data_changes(self):
        """This will also test that the datatypes match those used in the validation dataframes
        that are loaded from files."""

        # 'lifts' needs to be listed twice under 'topic' because there are two tests
        # run on lift data.  The test using merged_whis_lifts_json_test_file has more than
        # 2 timepoints in order to test the special code that is used to handle that case.
        tests_df = pd.DataFrame({
            'test_file': [merged_whis_lifts_json_test_file,
                          merged_lifts_json_test_file,
                          merged_terrain_json_test_file,
                          merged_weather_json_test_file],
            'topic': ['lifts', 'lifts', 'terrain', 'weather'],
            'validation_fname_prefix': ['get_data_changes_merged_whis_lifts',
                                        'get_data_changes_merged_lifts',
                                        'get_data_changes_merged_terrain',
                                        'get_data_changes_merged_weather']
        })

        for row in tests_df.iterrows():

            test_file = row[1]['test_file']
            df = load_json_as_df(test_file, row[1]['topic'])
            df = df.sample(frac=1)  # Shuffle the data

            for keep_oldest in [True, False]:
                if keep_oldest == True:
                    validation_fname_suffix = '_keep_oldest_valid.pkl'
                else:
                    validation_fname_suffix = '_drop_oldest_valid.pkl'

                tested_df = get_data_changes(
                    df, row[1]['topic'], keep_oldest=keep_oldest)

                valid_file = TEST_VALIDATION_DATA_DIR + \
                    row[1]['validation_fname_prefix'] + validation_fname_suffix
                valid_df = pd.read_pickle(valid_file)

                #  Sort and reindex before comparison because we are not testing the indexes
                # or row orders that the functions return.
                tested_df.sort_values(
                    tested_df.columns.to_list(), ignore_index=True, inplace=True)
                valid_df.sort_values(
                    valid_df.columns.to_list(), ignore_index=True, inplace=True)

                pd.testing.assert_frame_equal(
                    tested_df, valid_df,
                    f"Result from {test_file} did not match validation dataframe {valid_file}."
                )

    def test_ID_col_names(self):
        """Make sure that records can be uniquely identified by using ID columns for each topic
        (in combination with timestamp)"""
        files = [merged_lifts_json_test_file,
                 merged_terrain_json_test_file, merged_weather_json_test_file]

        for file, topic in zip(files, self.topic_names):

            df = load_json_as_df(file, topic)

            # Drop any rows that are complete duplicates. This is required for the Peak 2 Peak
            # Gondola because it is duplicated in the lifts data.  Maybe others as well.
            df.drop_duplicates(inplace=True)

            record_id_cols = include_timestamp_in_colnames(
                topic_ID_col_names[topic])
            self.assertTrue(records_are_unique(df, record_id_cols),
                            f"{record_id_cols} are not sufficient to uniquely identify the {topic} records.")

    def test_ApiData_compare_dict_keys_recusive(self):

        # Creating some test data
        def change_dict_keys():
            """Replaces one of the keys in each outer dict"""
            files = [merged_lifts_json_test_file,
                     merged_terrain_json_test_file, merged_weather_json_test_file]

            for file, topic in zip(files, self.topic_names):
                with open(file, "r") as f:
                    topic_dict = json.load(f)[0][topic][0]
                    first_key = list(topic_dict.keys())[0]
                    # Rename one of the dictionary keys
                    topic_dict['newkey'] = topic_dict.pop(first_key)
                    yield topic_dict

        changed_dicts = list(change_dict_keys())
        # Change key in the nested weather dict
        changed_dicts[2]['weatherForecast'][0]['newkey'] = changed_dicts[2]['weatherForecast'][0].pop(
            'temperatureLow')

        # Run the tests
        for topic, changed_dict in zip(self.topic_names, changed_dicts):
            # TBD: use a saved file for test case
            prior_dict = load_prior_json_from_s3(topic)[topic][0]
            with self.assertWarns(UserWarning, msg='Warning should be raised for dicts with different keys'):
                ApiData.compare_dict_keys_recusive(prior_dict, changed_dict)
                
        # Make sure no warning is raised if the dict keys are the same
        with pytest.warns(None) as record:
            ApiData.compare_dict_keys_recusive(prior_dict, prior_dict)
        assert len(record) == 0, 'No warning should be raised if dicts have the same keys.'


# To add:
# test records_are_unique() raises warning
    # Assert df has no NaN or NaTs:
    # assert parq_df.isnull().sum().sum() == 0
unittest.main(argv=[''], verbosity=2, exit=False)

test_ApiData_compare_dict_keys_recusive (__main__.TestNotebook) ... ok
test_ID_col_names (__main__.TestNotebook)
Make sure that records can be uniquely identified by using ID columns for each topic ... ok
test_get_data (__main__.TestNotebook) ... ok
test_get_data_changes (__main__.TestNotebook)
This will also test that the datatypes match those used in the validation dataframes ... ok

----------------------------------------------------------------------
Ran 4 tests in 3.036s

OK


<unittest.main.TestProgram at 0x11c904240>

**To Do: testing**
- no duplicate rows
- no duplicate information in adjacent rows by time

### Test that the status categories are complete for lifts and terrain

In [None]:
topic = 'terrain'
test_df = load_dataframe_from_parquet_on_s3(topic + HISTORY_SUFFIX)

In [None]:
assert test_df.status.dtype == status_cat_dtype
assert test_df.groomed.dtype == groomed_cat_dtype

This may happen because the ordered attribute is not preserved after loading from Parquet:

In [None]:
test_df.status.dtype

In [None]:
status_cat_dtype

In [None]:
groomed_cat_dtype

Fixed via `set_df_datatypes` 

In [None]:
test_df = set_df_datatypes(test_df, topic)
assert test_df.status.dtype == status_cat_dtype
assert test_df.groomed.dtype == groomed_cat_dtype

It looks like this could be an issue with fasparquet to_pandas() because the `_metadata` file does store `"ordered": true`.  Can adapt code from fastparquet test_read.py to submit issue if needed:

    def test_grab_cats(tempdir):
        s = pd.Series(['a', 'c', 'b']*20)
        df = pd.DataFrame({'a': s, 'b': s.astype('category'),
                           'c': s.astype('category').cat.as_ordered()})
        fastparquet.write(tempdir, df, file_scheme='hive')
        pf = fastparquet.ParquetFile(tempdir)
        cats = pf.grab_cats(['b', 'c'])
        assert (cats['b'] == df.b.cat.categories).all()
        assert (cats['c'] == df.c.cat.categories).all()
        
Refer to `test_statistics()` also

## Issue when running get_status_durations(parq_df)
Resulting in error:

    ~/anaconda3/lib/python3.7/site-packages/pandas/core/arrays/categorical.py in from_codes(cls, codes, categories, ordered, dtype)
        705 
        706         if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
    --> 707             raise ValueError("codes need to be between -1 and " "len(categories)-1")
        708 
        709         return cls(codes, dtype=dtype, fastpath=True)

    ValueError: codes need to be between -1 and len(categories)-1


Same error seen when running `parq_df[['status']].sort_values(by=['status'])`

This was caused by missing categories (`H`) in the `status` column (and maybe others)

### Code to inspect issue:

In [None]:
# Test for issue by sorting:
# will raise `ValueError: codes need to be between -1 and len(categories)-1`
parq_df[['status']].sort_values(by=['status'])

In [None]:
parq_df.status.cat.categories

In [None]:
# All the category codes present in the column
print(*parq_df.status.cat.codes.unique())

In [None]:
len(parq_df.status.cat.codes)

In [None]:
# Should be false
parq_df.status.cat.codes.max() >= len(parq_df.status.dtype.categories)

In [None]:
# Should be false
parq_df.liftName.cat.codes.min() < -1

In [None]:
for c in parq_df.columns:
    print(parq_df[c].cat.categories)

In [110]:
parq_df["timestamp"] = pd.to_datetime(pd.Series(np.asarray(parq_df["timestamp"])))

### Working on the issue for other categorical columns


In [196]:
parq_fname = '../data/test/lifts_history_diff_category_values.parquet'
pf = ParquetFile(read_file, open_with=myopen)
parq_df = pf.to_pandas()

Error will be encountered when sorting by a column that has the issue:

In [173]:
for c in ['liftID', 'resortID', 'liftName', 'status', 'timeToRide', 'timestamp']:
    parq_df = pf.to_pandas([c])
    print(f"sorting for {c}...")
    parq_df.sort_values(c)
    #print(parq_df)

sorting for liftID...
sorting for resortID...
sorting for liftName...


ValueError: codes need to be between -1 and len(categories)-1

The highest category code in the dataframe column should be greater than or equal to the number of category codes used in the column. 

`pd.Categorical.from_codes()` expects codes as sequential numbers starting from 0 up to `len(categories) - 1`

In [23]:
# The highest category code in the dataframe column
parq_df.liftName.cat.codes.max()

318

In [46]:
# The number of category codes used in the dataframe column
len(parq_df.liftName.cat.codes.unique())

319

In [160]:
# The number of category values used in the dataframe column
len(parq_df.liftName.dtype.categories.values)

318

The problem is that Dask, **fastparquet**, pyarrow, and **pandas** don't currently have a way to specify the categorical dtype of a column split across many files. Each file (parition) is treated independently. This results in categorials with unknown categories in the Pandas DataFrame. If we know that the categories are all the same, we're able to read in the first files categories and assign those to the entire DataFrame. But this is a bit fragile, as it relies on an assumption not necessarily guaranteed by the file structure.  If, for example, a new lifName is added, then the new partitions will contain a new category for the lift name but the old partitions will not.  https://tomaugspurger.github.io/sklearn-dask-tabular.html

In [174]:
# Columns marked as categorical in the extra metadata
# (meaning the data must have come from pandas).
pf.categories

{'liftID': 50, 'resortID': 15, 'liftName': 320, 'status': 3}

In [170]:
print(pf.schema.text)

- schema: 
| - liftID: INT64, OPTIONAL
| - resortID: INT64, OPTIONAL
| - liftName: BYTE_ARRAY, UTF8, OPTIONAL
| - status: BYTE_ARRAY, UTF8, OPTIONAL
| - timeToRide: BYTE_ARRAY, UTF8, OPTIONAL
  - timestamp: INT64, TIMESTAMP_MICROS, OPTIONAL


In [171]:
# Check the number of categories in the ParquetFile
import numpy as np
pf_cats = np.unique(pf.grab_cats('liftName')['liftName'])
len(pf_cats)

319

In [72]:
# Check the number of categories in the dataframe made from the ParquetFile
df_cats = np.unique(parq_df.liftName.dtype.categories.values)
len(df_cats)

318

In [153]:
# Which categories were only in the ParquetFile?
set(pf_cats).difference(set(df_cats))

{'#12 Magic Carpet 1',
 '#8 Red Cliffs Tow',
 'Golden Peak T-Bar #16',
 'Quicksilver Gondola - North',
 'Quicksilver Gondola - South',
 'Train Rider Lift'}

In [152]:
# Which categories were only in the dataframe made from the ParquetFile?
set(df_cats).difference(set(pf_cats))

{'#12 Magic Carpet 1 (Ski School)',
 'Chairlift #10',
 'Chairlift #6',
 'Quicksilver Gondola',
 'Trail Rider'}

In [148]:
# Check to see if the same categories are used across all the ParquetFile partitions
prev_cats = None

for rg_idx in range(len(pf.row_groups) - 1):
    cats = pf.grab_cats('liftName', rg_idx)['liftName']
    if prev_cats is not None:
        if np.array_equal(cats, prev_cats):
            print(f"Row group {rg_idx}'s categories are equal to row group {rg_idx-1}'s")
        else:
            print(f"\nRow group {rg_idx}'s categories are NOT equal to row group {rg_idx-1}'s")
            print(f"Unique values in row group {rg_idx} that are not in row group {rg_idx-1}:")
            print(f"\t{np.setdiff1d(cats, prev_cats)}\n")
    prev_cats = cats

Row group 1's categories are equal to row group 0's
Row group 2's categories are equal to row group 1's
Row group 3's categories are equal to row group 2's
Row group 4's categories are equal to row group 3's
Row group 5's categories are equal to row group 4's
Row group 6's categories are equal to row group 5's
Row group 7's categories are equal to row group 6's
Row group 8's categories are equal to row group 7's
Row group 9's categories are equal to row group 8's
Row group 10's categories are equal to row group 9's
Row group 11's categories are equal to row group 10's
Row group 12's categories are equal to row group 11's

Row group 13's categories are NOT equal to row group 12's
Unique values in row group 13 that are not in row group 12:
	['#12 Magic Carpet 1 (Ski School)']


Row group 14's categories are NOT equal to row group 13's
Unique values in row group 14 that are not in row group 13:
	[]

Row group 15's categories are equal to row group 14's
Row group 16's categories are equal 

In [None]:
# Things tried that crashed the kernel
for i in parq_df['liftName'].items():
    print(i)

parq_df['liftName'].to_string()
parq_df['liftName'].tolist()
parq_df['liftName'].astype('str')
set(parq_df.liftName.values)

### Possible solutions
1. Remove partitioning by date column when writing to parquet
2. **Set status categories manually via `set_categories`. (and any other columns with the same issue.  See https://github.com/dask/dask/issues/2944**
3. **Leave other columns as int and objects when writing and loading from parquet**
4. Manually specify number of categories when converting to df.  E.g. `parq_df = pf.to_pandas(['liftName'], categories={'liftName': 319})` (didn't work)
5. Catch exception and load column as string instead?
6. repartition and reload?... probably can't be done via fastparquet because the same issue will be encountered when trying to save the loaded dataframe to a new parquet file.
7. Process each row group to rebuild the list of category values from the values that appear in the rows?  Using ParquetFile.iter_row_groups() convert the col dtype to object for each row-group dataframe, then join them into one dataframe and convert the column dtype to categorical.
8. Maintain a list of category values for each column.  Before adding any data to history parquet file, check to make sure that there are not any new category values for the column, and if there are, add them to the categorical dtype

# Testing timestamps for file loading
Just notes from figuring out how to do this

In [None]:
read_file = f"s3://{BUCKET_NAME}/{fname}.parquet"
pf = ParquetFile(read_file, open_with=myopen)
test = pf.to_pandas()["timestamp"]

In [None]:
# If needed: to convert for categorical datetime to regular datetime
df["timestamp"] = pd.to_datetime(pd.Series(np.asarray(df["timestamp"])))

In [None]:
test.dt = test.dt.tz_convert(tz= 'America/Vancouver')

In [157]:
len(parq_df.liftName.dtype.categories)

318

/Users/paul/anaconda3/lib/python3.7/site-packages/pandas/core/series.py:597: FutureWarning: Converting timezone-aware DatetimeArray to timezone-naive ndarray with 'datetime64[ns]' dtype. In the future, this will return an ndarray with 'object' dtype where each element is a 'pandas.Timestamp' with the correct 'tz'.
	To accept the future behavior, pass 'dtype=object'.
	To keep the old behavior, pass 'dtype="datetime64[ns]"'.


more info: https://pandas-docs.github.io/pandas-docs-travis/whatsnew/v0.24.0.html#converting-timezone-aware-series-and-index-to-numpy-arrays

In [72]:
#
len(parq_df.liftName.cat.codes.unique())

319

In [None]:
load_dataframe_from_parquet_on_s3(fname).dtypes

# Example Code

## Load live whistler parquet data

In [151]:
%time parq_df = load_dataframe_from_parquet_on_s3('wb_lifts_history.parquet')

lifts_status_changes_parq_df = get_status_durations(parq_df)
lifts_status_changes_parq_df.to_csv(DATA_DIR + "lifts_status_changes_parq.csv", date_format='%c')
lifts_status_changes_parq_df

CPU times: user 1.9 s, sys: 71.1 ms, total: 1.97 s
Wall time: 19.2 s


Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp,time_diff,time_diff_seconds
0,3,13,7th Heaven Express,O,6,2020-02-03 14:21:57.064742-08:00,00:47:48.055160,2868.055160
29,3,13,7th Heaven Express,X,6,2020-02-03 15:09:45.119902-08:00,18:11:29.035149,65489.035149
67,3,13,7th Heaven Express,O,6,2020-02-04 09:21:14.155051-08:00,05:49:50.203591,20990.203591
79,3,13,7th Heaven Express,X,6,2020-02-04 15:11:04.358642-08:00,18:39:59.624647,67199.624647
140,3,13,7th Heaven Express,O,6,2020-02-05 09:51:03.983289-08:00,05:19:59.834930,19199.834930
...,...,...,...,...,...,...,...,...
1110,72,13,Whistler Village Gondola Upper,O,11,2020-02-19 08:21:03.877305-08:00,07:19:59.983405,26399.983405
1145,72,13,Whistler Village Gondola Upper,X,11,2020-02-19 15:41:03.860710-08:00,15:50:00.289791,57000.289791
1157,72,13,Whistler Village Gondola Upper,H,11,2020-02-20 07:31:04.150501-08:00,00:49:59.863014,2999.863014
1180,72,13,Whistler Village Gondola Upper,O,11,2020-02-20 08:21:04.013515-08:00,07:19:59.546428,26399.546428


## Other

In [None]:
# load a json from S3
terrain_prior_json = load_prior_json_from_s3('terrain')

# Convert json to a dataframe normally wer can use jsons_to_df()
terrain_prior_df = pd.json_normalize(
    data=terrain_prior_json,
    record_path=['terrain'],
    meta='timestamp'
)

terrain_prior_df.columns

In [None]:
# Download a parquet file from S3
for f in get_matching_s3_keys(BUCKET_NAME, prefix='lifts_history_DEV'):
    print(f)
    s3.meta.client.download_file(BUCKET_NAME, f, '../data/test/' + f)

In [144]:
# Load parquet file from S3 into dataframe
read_file = f"s3://{BUCKET_NAME}/{'lifts' + HISTORY_SUFFIX}"
pf = ParquetFile(read_file, open_with=myopen)
parq_df = pf.to_pandas()

In [357]:
# Update testing validation files when df_dtypes has been changed
for f, topic in zip(['get_data_changes_merged_whis_lifts',
                     'get_data_changes_merged_lifts',
                     'get_data_changes_merged_terrain',
                     'get_data_changes_merged_weather'],
                    ['lifts', 'lifts', 'terrain', 'weather']):
    for suffix in ['_drop_oldest', '_keep_oldest']:
        df = pd.read_pickle(TEST_VALIDATION_DATA_DIR +
                            f + suffix + '_valid.pkl')
        df = set_df_datatypes(df, topic)
        df.to_pickle(TEST_VALIDATION_DATA_DIR + f + suffix + '_valid.pkl')

In [155]:

topic = 'terrain'
test_df = load_dataframe_from_parquet_on_s3(topic + HISTORY_SUFFIX)

# Sort by timestamp and ID columns
record_id_cols = include_timestamp_in_colnames(topic_ID_col_names[topic])
print(f"sorted by {record_id_cols}")
test_df.sort_values(record_id_cols)

sorted by ['resortID', 'runID', 'terrainName', 'timestamp']


Unnamed: 0_level_0,runID,resortID,groomed,runName,runType,status,terrainName,timestamp
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
127,10,1,No,Apres Vous,3,O,Back Bowls,2020-02-19 15:23:08.416876-08:00
171,10,1,Yes,Big Rock Park,2,O,Blue Sky Basin,2020-02-19 15:23:08.416876-08:00
153,10,1,No,Bolshoi Ballroom,3,O,China Bowl,2020-02-19 15:23:08.416876-08:00
0,10,1,No,Blue Ox,3,O,Golden Peak,2020-02-19 15:23:08.416876-08:00
87,10,1,No,Baccarat,2,O,Lionshead,2020-02-19 15:23:08.416876-08:00
...,...,...,...,...,...,...,...,...
3876,1751,17,No,Whispering Pines,2,O,Main Face,2020-02-20 11:41:19.176181-08:00
1914,1752,17,Yes,Zip,1,O,Main Face,2020-02-19 15:23:08.416876-08:00
3877,1752,17,No,Whistler,2,O,Main Face,2020-02-20 11:41:19.176181-08:00
1915,1753,17,No,Rum Run,2,X,Main Face,2020-02-19 15:23:08.416876-08:00


In [156]:
# For example code: Show data for whistler
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(test_df.query('resortID == 13'))

Unnamed: 0_level_0,runID,resortID,groomed,runName,runType,status,terrainName,timestamp
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1316,1315,13,No,Robertson's,3,O,Harmony,2020-02-19 15:23:08.416876-08:00
1315,1314,13,No,Rhapsody Bowl,2,O,Harmony,2020-02-19 15:23:08.416876-08:00
1314,1313,13,Yes,Pika's Traverse,1,O,Harmony,2020-02-19 15:23:08.416876-08:00
1313,1312,13,No,McConkey's - Lower,3,O,Harmony,2020-02-19 15:23:08.416876-08:00
1312,1311,13,No,Low Roll,3,O,Harmony,2020-02-19 15:23:08.416876-08:00
1311,1310,13,No,Little Whistler,3,O,Harmony,2020-02-19 15:23:08.416876-08:00
1310,139,13,No,Krumholz,2,O,Harmony,2020-02-19 15:23:08.416876-08:00
1309,138,13,Yes,Harmony Ridge,2,O,Harmony,2020-02-19 15:23:08.416876-08:00
1308,137,13,No,Harmony Piste,2,O,Harmony,2020-02-19 15:23:08.416876-08:00
1307,136,13,No,Harmony Outruns,3,O,Harmony,2020-02-19 15:23:08.416876-08:00


In [157]:
# For example code: Filter dataframe by date
test_df[test_df['timestamp'] > '2020-02-08']

Unnamed: 0_level_0,runID,resortID,groomed,runName,runType,status,terrainName,timestamp
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10,1,No,Blue Ox,3,O,Golden Peak,2020-02-19 15:23:08.416876-08:00
1316,1315,13,No,Robertson's,3,O,Harmony,2020-02-19 15:23:08.416876-08:00
1315,1314,13,No,Rhapsody Bowl,2,O,Harmony,2020-02-19 15:23:08.416876-08:00
1314,1313,13,Yes,Pika's Traverse,1,O,Harmony,2020-02-19 15:23:08.416876-08:00
1313,1312,13,No,McConkey's - Lower,3,O,Harmony,2020-02-19 15:23:08.416876-08:00
...,...,...,...,...,...,...,...,...
3381,134,13,Yes,Green Line,1,O,Jersey Cream,2020-02-20 12:11:24.968548-08:00
3380,133,13,Yes,GMC Race Centre,2,O,Jersey Cream,2020-02-20 12:11:24.968548-08:00
3388,1311,13,Yes,Zig Zag,2,O,Jersey Cream,2020-02-20 12:11:24.968548-08:00
2864,91,9,Yes,Outlaw Terrain Park,5,O,Terrain Parks,2020-02-20 13:11:34.345545-08:00


In [158]:
# Example of entities that have the same runName and resortID, but different terrainNames:
test_terrain_df = load_dataframe_from_parquet_on_s3('terrain' + HISTORY_SUFFIX)
test_terrain_df.query('runName == "Bear Paw"')

Unnamed: 0_level_0,runID,resortID,groomed,runName,runType,status,terrainName,timestamp
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1287,132,13,No,Bear Paw,3,O,Big Red - Franz's - Garbanzo,2020-02-19 15:23:08.416876-08:00
319,21,2,No,Bear Paw,1,O,Resort Skiways (Access to/from Homesites and L...,2020-02-19 15:23:08.416876-08:00
304,21,2,No,Bear Paw,1,O,Arrowhead,2020-02-19 15:23:08.416876-08:00
3250,132,13,Yes,Bear Paw,3,O,Big Red - Franz's - Garbanzo,2020-02-20 11:41:19.176181-08:00


In [159]:
# Check categories of categorical df column
parq_df.status.cat.categories

Index(['X', 'H', 'O'], dtype='object')

In [160]:
# Print all values for a categorical df column
print(*parq_df.status)

O O O O O O O O O O O O X X O O O X O O O X O O O X O O O X X X X X X X X X X X X X X X X X X X X O O O O O O O O O O O O O O O O O O O O O O O O O X O X X X X X X X X X X X X X X X X X X X X X X X X X X H H H H H H H H H H H H H H H H H O O O O O O O O O O O O O O O O O O O O O O O O O X O X X X X X X X X X X X X X X X X X X X X X X X X X H H H H H H H H H H H H H H H H O O O O O O O O O O O O O O O H O O O O O O O O O O O X X X X X X X X X X X X X X X X X X X X X X X X X X H H H H H H H H H H H H H H H H H O O O O O O O O O O O O O O O O O O O O O O O O O O H H X X O O H X X X X X X X X X X X X X X X X X X X X X X X X X H H H H H H H H H H H H H H H H O O O O O O O O O O O O O O O O O O O O O O O O O H O O X X X X X X X X X X X X X X X X X X X X X X X X X X H H H H H H H H H H H H H H H H H O O O O O O O O O O O O O O O O O O O O O O O O O O X X X X X X X X X X X X X X X X X X X X X X X X X X H H H H H H H H H H H H H H H O O O O O O O O O O O O O O O O O O O O O O O O O O X X X X X 

# Notes
- 

## To do
- live_data class?  Subclass for each subject?
- change time_diff to "duration"

For next phase:
- daily: for each chair calculate most open status of the day: O > H > X
- Days since each chair was last seen open with timestamp of most recent open time.
- snowfall since last open
- save data for other mountains