# Module Code

In [3]:
import glob
import io
import json
import os
import pdb

try:
    # `Iterable` lives in collections.abc; the alias in `collections` was removed in Python 3.10
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable

import boto3
import botocore
import pandas as pd
import pytz
import s3fs
from fastparquet import write, ParquetFile

  from pandas.core.index import CategoricalIndex, RangeIndex, Index, MultiIndex


In [4]:
# boto3 session created with default arguments; every S3 handle below hangs off it.
session = boto3.Session()
BUCKET_NAME = 'snowbot-pv'

# S3 Connect
# `s3` (resource) and `bucket` are module-level globals used by the helpers further down.
s3 = session.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)

In [172]:
# parquet engines don't handle shifted timezones, so timestamps are converted to UTC
# before writing and converted back to TZ after reading.
TZ = pytz.timezone('America/Vancouver')

# Local working paths for the merged JSON file.
DATA_DIR = "../data/"
MERGED_JSON_FILENAME = "merged_file.json"
merged_json_file = DATA_DIR + MERGED_JSON_FILENAME

# Used for weather data in jsons_to_df()
# Top-level weather fields carried onto every forecast row as metadata.
weather_meta_fields = [
    'newSnow', 'last24Hours', 'last48Hours', 'last7Days', 'midMountainBase',
    'resortID'
]
# Path to the nested list of forecast records inside each weather json.
weather_record_path = ['weather', 'weatherForecast']
# json_normalize `meta` spec: one ['weather', <field>] path per meta field, plus the timestamp.
weather_meta = [['weather', i] for i in weather_meta_fields]
weather_meta.append('timestamp')

# Used for lift and terrain status in jsons_to_df()
# Important to set categories because when writing incrementally to parquet, some increments
# may not include all statuses.  Manually setting the categories avoids errors due to
# different category indexing between increments.
# Both dtypes are ordered, so X < H < O and No < Yes.
status_cat_dtype = pd.api.types.CategoricalDtype(categories=['X', 'H', 'O'],
                                                 ordered=True)
groomed_cat_dtype = pd.api.types.CategoricalDtype(categories=['No', 'Yes'],
                                                  ordered=True)

# Column dtypes that are to be set for each dataframe
# Keyed by table name ('lifts', 'terrain', 'weather'); consumed by set_df_datatypes().
df_dtypes = {
    "lifts": {
        'liftID': 'category',
        'resortID': 'category',
        'liftName': 'category',
        'status': status_cat_dtype,
        'timeToRide': 'object'
    },
    'terrain': {
        'runID': 'category',
        'resortID': 'category',
        'groomed': groomed_cat_dtype,
        'runName': 'category',
        'runType': 'category',
        'status': status_cat_dtype,
        'terrainName': 'category'
    },
    'weather': {
        'resortID': 'category',
        'forecast.dayDescription': 'object',
        'forecast.daycode': 'category',
        'forecast.forecastString': 'object',
        'forecast.iconName': 'object',
        'forecast.summaryDescription': 'object',
        'forecast.temperatureHigh': 'object',
        'forecast.temperatureLow': 'object',
        'weather.last24Hours': 'object',
        'weather.last48Hours': 'object',
        'weather.last7Days': 'object',
        'weather.midMountainBase': 'object',
        'weather.newSnow': 'object'
    }
}


def flatten(items):
    """Lazily walk ``items`` depth-first and yield every leaf value.

    Strings and bytes count as leaves, so they are yielded whole instead of
    being split into characters.
    """
    for element in items:
        is_leaf = isinstance(element, (str, bytes)) or not isinstance(element, Iterable)
        if is_leaf:
            yield element
            continue
        yield from flatten(element)


# The columns that serve to identify records in each table
# Values are either a single column name or a list of names; 'all_tables' holds the
# identifying column shared by every table.
table_ID_col_names = {
    'lifts': 'liftName',
    'terrain': ['resortID', 'runID', 'runName', 'terrainName'],
    'weather': 'resortID',
    'all_tables': 'timestamp'
}
# All of the column names that serve to identify records in at least one of the tables
# (flatten() collapses the mix of strings and lists above into one flat set).
all_ID_col_names = set(flatten(table_ID_col_names.values()))

In [173]:
# Merged-json file loaded by the test cells below.
MERGED_JSON_TEST_FILENAME = "test_file.json"
merged_json_test_file = DATA_DIR + MERGED_JSON_TEST_FILENAME

In [174]:
# from https://alexwlchan.net/2019/07/listing-s3-keys/
def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    Generate objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional).  May also be a tuple or list of prefixes,
        which are listed one after another.
    :param suffix: Only fetch objects whose keys end with
        this suffix (optional).
    :return: Generator of the object dicts returned by ``list_objects_v2``.
    """
    client = boto3.client("s3")
    paginator = client.get_paginator("list_objects_v2")

    # We can pass the prefix directly to the S3 API.  If the user has passed
    # a tuple or list of prefixes, we go through them one by one.
    if isinstance(prefix, str):
        prefixes = (prefix, )
    else:
        prefixes = prefix

    for key_prefix in prefixes:
        for page in paginator.paginate(Bucket=bucket, Prefix=key_prefix):
            # A page has no "Contents" when nothing matches this prefix.  Skip it
            # and carry on with the next prefix instead of ending the generator.
            for obj in page.get("Contents", ()):
                if obj["Key"].endswith(suffix):
                    yield obj


def get_matching_s3_keys(bucket, prefix="", suffix=""):
    """
    Generate the keys in an S3 bucket.

    Thin wrapper around get_matching_s3_objects() that yields only each
    object's "Key".

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    matches = get_matching_s3_objects(bucket, prefix, suffix)
    yield from (entry["Key"] for entry in matches)


def merge_matching_jsons_on_s3(save_file, prefix="", suffix=""):
    """Merge the json files in the S3 bucket whose keys match ``prefix`` and
    ``suffix`` into a single json list and write it to the local path
    ``save_file``.

    Each object is downloaded into an in-memory buffer, so no scratch file is
    left in (or deleted from) the working directory, and an empty match simply
    produces an empty list.

    :param save_file: Local path of the merged json file to write.
    :param prefix: Only merge keys that start with this prefix (optional).
    :param suffix: Only merge keys that end with this suffix (optional).
    """
    merged = []

    for key in get_matching_s3_keys(BUCKET_NAME, prefix=prefix, suffix=suffix):
        buffer = io.BytesIO()
        bucket.download_fileobj(key, buffer)
        merged.append(json.loads(buffer.getvalue()))

    # Fill the output file with the merged content
    with open(save_file, "w") as outfile:
        json.dump(merged, outfile)

# TBD: more efficient to go straight to df w/o saving json to file


def set_df_datatypes(df, table):
    """Return a copy of ``df`` cast to the column dtypes registered for
    ``table`` in ``df_dtypes``, with the ``timestamp`` column parsed as
    datetimes."""
    return df.astype(df_dtypes[table]).assign(
        timestamp=lambda typed: pd.to_datetime(typed["timestamp"]))


def jsons_to_df(jsons, record_path, meta='timestamp'):
    """Flatten a list of jsons into a typed dataframe.

    ``record_path`` names the table ('weather' gets special handling because
    its forecast records are nested one level deeper); ``meta`` is passed to
    ``pd.json_normalize`` for the other tables.
    """
    if record_path != 'weather':
        flat = pd.json_normalize(jsons, record_path=record_path, meta=meta)
        return set_df_datatypes(flat, record_path)

    # Weather: explode the nested forecast list and carry the parent fields along as meta
    flat = pd.json_normalize(jsons,
                             record_path=weather_record_path,
                             meta=weather_meta,
                             record_prefix='forecast.')
    flat = flat.rename(columns={"weather.resortID": "resortID"})
    return set_df_datatypes(flat, record_path)


def load_merged_json_as_df(merged_json_file, record_path):
    """Read the merged json file from disk and convert it to a dataframe
    for the table named by ``record_path``."""
    with open(merged_json_file, "r") as handle:
        records = json.load(handle)
    return jsons_to_df(records, record_path)


# Each entity's rows are sorted by timestamp before comparison, so unsorted input is handled.
def get_data_changes(df, table, keep_oldest=False):
    """
    Filter out rows that do not represent changed data.

    Rows are grouped per entity (the ID columns registered for ``table`` in
    ``table_ID_col_names``), ordered by timestamp within each group, and a row
    is kept only when at least one data column differs from the previous row
    of the same entity.

    Parameters
    ----------
    df : pandas.DataFrame
        Includes 'timestamp', identifying and data columns.  Lists data for each timestamp.
    table : str
        Key into ``table_ID_col_names`` / ``df_dtypes`` (e.g. 'lifts', 'terrain', 'weather').
    keep_oldest : boolean
        Indicates if the returned DataFrame should keep the oldest record for each entity (i.e.
        lift, resort, or terrain) even if an entity has no data changes.  When `True`, the
        earliest row of every entity is retained so that each entity is listed at least once.
        When `False`, the earliest row of every entity is dropped and only later rows that
        differ from their predecessor are returned.

    Returns
    -------
    pandas.DataFrame
        Only includes the rows from the original dataframe where there was a change to new values
        in the data columns (plus the oldest row per entity when ``keep_oldest`` is set), with
        the dtypes for ``table`` re-applied.
    """
    ID_columns = table_ID_col_names[table]
    data_columns = [c for c in df.columns if c not in all_ID_col_names]

    def filter_for_data_changes(group):
        """Keep the rows of one entity whose data differs from the previous timestamp."""
        if group.empty:
            return group

        # Stable sort so "previous row" really means "previous timestamp"
        ordered = group.sort_values('timestamp', kind='mergesort')\
                       .reset_index(drop=True)
        data = ordered[data_columns]
        changed = data.ne(data.shift()).any(axis=1).to_numpy(copy=True)

        # The first row has no predecessor: it is the oldest record, kept only on request
        changed[0] = bool(keep_oldest)
        return ordered[changed]

    df = df.groupby(ID_columns, group_keys=False)\
           .apply(filter_for_data_changes)\
           .reset_index(drop=True)

    df = set_df_datatypes(df, table)

    return df

### Test code

In [8]:
test_df = load_merged_json_as_df(merged_json_test_file, 'lifts')
test_df.sort_values(by=['liftID', 'timestamp'])

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
11,3,13,7th Heaven Express,X,6,2020-01-03 00:19:09.631011-08:00
40,3,13,7th Heaven Express,X,6,2020-01-03 00:19:09.631011-08:00
6,4,13,Jersey Cream Express,X,5,2020-01-03 00:19:09.631011-08:00
35,4,13,Jersey Cream Express,X,5,2020-01-03 00:19:09.631011-08:00
2,5,13,Excalibur Gondola Lower,X,3,2020-01-03 00:19:09.631011-08:00
31,5,13,Excalibur Gondola Lower,X,3,2020-01-03 00:19:09.631011-08:00
5,6,13,Magic Chair,X,6,2020-01-03 00:19:09.631011-08:00
34,6,13,Magic Chair,X,6,2020-01-03 00:19:09.631011-08:00
10,7,13,Glacier Express,X,6,2020-01-03 00:19:09.631011-08:00
39,7,13,Glacier Express,X,6,2020-01-03 00:19:09.631011-08:00


In [9]:
# TBD: test that len is 1 when keep_oldest=False and larger otherwise
get_data_changes(test_df, 'lifts', keep_oldest=False).sort_values(by=['liftID', 'timestamp'])

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,69,13,Blackcomb Gondola Lower,O,7,2020-01-03 00:19:09.631011-08:00


# Process lift json files

In [10]:
merge_matching_jsons_on_s3(prefix="2020_02", suffix="lifts.json", save_file=merged_json_file)

In [11]:
# Every lift record from the merged json, one row per lift per timestamp.
lifts_df = load_merged_json_as_df(merged_json_file, 'lifts')

# Reduce to the rows where a lift's data changed, keeping each lift's oldest row.
lifts_status_changes_df = get_data_changes(lifts_df, 'lifts', keep_oldest=True)

**NOTE:** `timeToRide` is just the time it takes to ride the lift, not the current wait time:

In [12]:
lifts_df.groupby("liftName")['timeToRide'].unique()

liftName
7th Heaven Express                 [6]
Big Red Express                    [8]
Blackcomb Gondola Lower            [7]
Blackcomb Gondola Upper            [7]
Catskinner Express                 [4]
Coca-Cola Tube Park                [4]
Creekside Gondola                  [7]
Crystal Ridge Express              [7]
Emerald 6 Express                  [6]
Excalibur Gondola Lower            [3]
Excalibur Gondola Upper            [5]
Excelerator Express                [6]
Fitzsimmons Express                [6]
Franz's Chair                      [8]
Garbanzo Express                   [7]
Glacier Express                    [6]
Harmony 6 Express                  [6]
Horstman T-Bar                     [4]
Jersey Cream Express               [5]
Magic Chair                        [6]
Olympic Chair                      [5]
Peak 2 Peak Gondola               [12]
Peak Express                       [3]
Showcase T-Bar                     [3]
Symphony Express                   [7]
T-Bars          

In [13]:
lifts_df

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,69,13,Blackcomb Gondola Lower,X,7,2020-02-01 00:00:27.181139-08:00
1,70,13,Blackcomb Gondola Upper,X,7,2020-02-01 00:00:27.181139-08:00
2,5,13,Excalibur Gondola Lower,X,3,2020-02-01 00:00:27.181139-08:00
3,71,13,Excalibur Gondola Upper,X,5,2020-02-01 00:00:27.181139-08:00
4,8,13,Excelerator Express,X,6,2020-02-01 00:00:27.181139-08:00
...,...,...,...,...,...,...
15191,42,13,Symphony Express,O,7,2020-02-06 12:01:09.013398-08:00
15192,41,13,T-Bars,O,5,2020-02-06 12:01:09.013398-08:00
15193,11,13,Horstman T-Bar,X,4,2020-02-06 12:01:09.013398-08:00
15194,45,13,Fitzsimmons Express,X,6,2020-02-06 12:01:09.013398-08:00


In [14]:
def get_status_durations(lifts_df):
    '''Return a sorted copy of `lifts_df` with two duration columns added.

    For each lift (grouped by `resortID` and `liftID`) the duration is the gap
    between a row's timestamp and the next row's timestamp for the same lift:
    `time_diff` column: How long the lift stayed in the status shown in the `status` column.
    `time_diff_seconds` column: `time_diff` expressed in seconds.

    The last row of each lift has no following row, so its duration is measured
    up to the latest timestamp in the whole frame.
    '''
    lift_keys = ['resortID', 'liftID']
    ordered = lifts_df.sort_values(by=lift_keys + ['timestamp'])

    # Gap to the previous row within each lift, moved up one row so it
    # describes the row that started the interval.
    gap_to_previous = ordered.groupby(lift_keys)['timestamp'].diff(1)
    ordered['time_diff'] = gap_to_previous.shift(-1)

    # Rows still lacking a duration are each lift's most recent status
    no_duration = ordered['time_diff'].isnull() & (
        ordered['timestamp'] >= ordered['timestamp'].min())
    open_ended_labels = ordered.loc[no_duration, 'timestamp'].index.values

    latest_timestamp = ordered['timestamp'].max()
    ordered.loc[open_ended_labels, 'time_diff'] = (
        latest_timestamp - ordered.loc[open_ended_labels, 'timestamp'])

    # Convert to seconds
    ordered['time_diff_seconds'] = ordered['time_diff'].dt.total_seconds()

    return ordered

In [15]:
# Add status-duration columns to the lift status changes.
df = get_status_durations(lifts_status_changes_df)

# Uses local date formatting, otherwise Tableau will mix up month and day
# alternatively, can export to json:
# lifts_status_changes_df.to_json(DATA_DIR + "lifts_status_changes.json", orient='table')
# '%c' is the strftime code for the locale's date-and-time representation.
df.to_csv(DATA_DIR + "lifts_status_changes.csv", date_format='%c')


In [17]:
# add:
# 
# daily: for each chair calculate most open status of the day: O > H > X
# Days since each chair was last seen open with timestamp of most recent open time.
# snowfall since last open
# save data for other mountains

# Storage options testing

In [None]:
df.to_pickle(DATA_DIR + "df_test.pkl")

In [None]:
# NOTE(review): `write`, `pytz` and `TZ` are already imported/defined in earlier cells;
# this cell repeats them so it can run on its own.
from fastparquet import write

# parquet engines don't handle shifted timezones
import pytz
TZ = pytz.timezone('America/Vancouver')
# Overwrites the notebook-level `df` timestamp column with its UTC equivalent.
df['timestamp'] = df.timestamp.dt.tz_convert(pytz.utc)

In [None]:
# Note: May need snappy-python as a req to run on AWS Lambda
df.to_parquet(DATA_DIR + "df_test.parquet", engine='fastparquet')

In [None]:
load_df = pd.read_parquet(DATA_DIR + "df_test.parquet")
load_df['timestamp'] = load_df.timestamp.dt.tz_convert(TZ) # convert back to correct timezone


In [None]:
#TBD convert back to correct datatypes
load_df.dtypes

In [None]:
df.to_csv(DATA_DIR + "df_test.csv")

Test file size results:
- json: 800 Kb?
- csv: 474 Kb
- pickle: 145 Kb
- parquet: 15 Kb

## Delta Lake Notes

Requires apache spark instance.  For future use, could set one up to work with lambda using https://aws.amazon.com/emr/features/spark/?

Otherwise databricks (similar to QxMD project)

# json comparison and parquet to S3

In [175]:
# s3fs filesystem handle used for parquet reads and writes on S3.
fs = s3fs.S3FileSystem()
# Passed to fastparquet as `open_with` so files are opened through s3fs.
myopen = fs.open
# No-op passed to fastparquet as `mkdirs`.
nop = lambda *args, **kwargs: None

In [176]:
def write_dataframe_to_parquet_on_s3(df, table, fname):
    """Write the data changes in a dataframe to a Parquet file on S3.

    Creates a new parquet file if no key starting with ``fname`` exists in the
    bucket yet; otherwise appends to the existing one.  The caller's dataframe
    is not modified.

    :param df: Dataframe with a timezone-aware 'timestamp' column.
    :param table: Table name passed to get_data_changes() (e.g. 'lifts').
    :param fname: Key (within BUCKET_NAME) of the parquet file to write.
    """

    def write_parquet(df, fname, app=True):
        """Write ``df`` to ``fname`` on S3 with fastparquet, appending when ``app`` is True."""
        output_file = f"s3://{BUCKET_NAME}/{fname}"
        write(output_file,
              df,
              # partition_on=['timestamp'],
              file_scheme='hive',
              append=app,  # must be False when the file doesn't exist yet
              open_with=myopen,
              mkdirs=nop)
        print(f"Writing {len(df)} records to {fname}.")

    # Unshift the timezone because parquet engines don't handle shifted timezones.
    # The column is replaced on a new frame rather than assigned through
    # `.loc[:, ...]`: that form writes into the existing tz-aware column (keeping
    # its original timezone on recent pandas) and would also mutate the caller's frame.
    df = df.assign(timestamp=df['timestamp'].dt.tz_convert(pytz.utc))

    file_exists = bool(list(bucket.objects.filter(Prefix=fname)))

    if not file_exists:
        print(f"File {fname} not found.  Creating new file.")
        # Keep oldest record for each entity because creating new file
        df = get_data_changes(df, table=table, keep_oldest=True)
        write_parquet(df, fname, app=False)

    else:
        print(f"File {fname} found on S3.")
        df = get_data_changes(df, table=table, keep_oldest=False)
        write_parquet(df, fname, app=True)

## Generalized version
For all data from the EpicMix API

In [177]:
import requests
from datetime import datetime


def filter_resort(data, resortID: int = None) -> "bool | dict":
    """Predicate that selects records belonging to a specific resort.

    :param data: A single record with a "resortID" key.
    :param resortID: Resort to keep.  ``None`` (the default) means no filtering.
    :return: ``True``/``False`` depending on whether the record's resort matches
        ``resortID``; when ``resortID`` is ``None`` the record itself is returned
        unchanged (truthy for any non-empty record).
    """
    # Compare against None explicitly so that a resort ID of 0 is still filtered on
    if resortID is not None:
        return data["resortID"] == resortID
    return data


def get_data(filter_resortID: int = None, timeout: float = 30) -> dict:
    """Fetch the current lifts, weather and terrain data from the EpicMix API.

    Defaults to all resorts.  Option to filter for a specific resort.

    :param filter_resortID: Keep only records for this resort (optional).
    :param timeout: Seconds to wait for each HTTP request before giving up, so
        an unresponsive server cannot hang the caller indefinitely.
    :return: Dict keyed by topic ('lifts', 'weather', 'terrain').  Each value is
        a JSON string holding a 'timestamp' and the topic's list of records.
    :raises requests.HTTPError: If a request returns an error status.
    """
    API_URL = 'http://www.epicmix.com/vailresorts/sites/epicmix/api/mobile/'
    # keys are used in the requests, the values are used in the response
    DATA_LIST = {'lifts': 'lifts',
                 'weather': 'snowconditions', 'terrain': 'terrains'}
    json_data = dict()

    for d, name in DATA_LIST.items():
        res = requests.get(API_URL + d + '.ashx', timeout=timeout)
        res.raise_for_status()
        records = json.loads(res.text)[name]
        records = [r for r in records if filter_resort(r, filter_resortID)]
        # Each topic is stamped with the local (TZ) time at which it was fetched
        json_data[d] = json.dumps(
            {'timestamp': str(datetime.now(TZ)), d: records})

    return json_data

In [178]:
def s3_object_exists(fname):
    """Check if an s3 object exists.  Returns `True` if the object exists.

    ``fname`` counts as existing when it is the key of an object in the bucket,
    or when at least one key exists under ``fname/`` (parquet datasets written
    with ``file_scheme='hive'`` are stored as several keys under such a prefix).
    Returns `False` otherwise.

    :raises botocore.exceptions.ClientError: For any S3 error other than a 404.
    """
    try:
        # Constructing s3.Object() alone makes no request; load() issues the
        # request that can actually fail with a 404.
        s3.Object(BUCKET_NAME, fname).load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            raise
    else:
        return True

    # No object with exactly this key: look for a dataset stored under the prefix
    dataset_prefix = fname.rstrip('/') + '/'
    if any(True for _ in bucket.objects.filter(Prefix=dataset_prefix).limit(1)):
        return True

    print(f"{fname} doesn't exist")
    return False


def load_dataframe_from_parquet_on_s3(fname):
    """Load a dataframe from a Parquet file on S3.

    :param fname: Key (within BUCKET_NAME) of the parquet file.
    :return: The dataframe with its 'timestamp' column converted to ``TZ``, or
        ``None`` when s3_object_exists() reports that the file is missing.
    """
    if not s3_object_exists(fname):
        return None

    read_file = f"s3://{BUCKET_NAME}/{fname}"
    pf = ParquetFile(read_file, open_with=myopen)
    df = pf.to_pandas()

    # Reshift the timezone because parquet engines don't handle shifted timezones.
    # Replace the column outright: assigning through `.loc[:, ...]` writes into the
    # existing tz-aware column and can leave it in its stored timezone.
    df['timestamp'] = df['timestamp'].dt.tz_convert(TZ)

    return df

TBD:
replace: s3.Object(BUCKET_NAME, '') -> bucket.Object('')

In [258]:
# Suffixes appended to a table name to build the S3 keys of its history parquet file
# and of the json snapshot used as the "prior" in the next comparison.
HISTORY_SUFFIX = '_history_DEV.parquet'
PRIOR_SUFFIX = '_prior_DEV.json'


# NOTE(review): a class with this same name is defined again in a later cell of this
# notebook; whichever cell runs last is the definition in effect.
class ParquetWriter():
    """Identifies new data and writes it to Parquet file on S3."""

    def __init__(self):
        # Get current data
        self.data_current_all = get_data()  # dict: table name -> JSON string

    def write_new_data_all(self):
        """Writes new data for each type (i.e. 'lifts', 'weather', 'terrain')
        of data returned by the API.
        """
        # self.table is the type of data
        # The loop variable is stored on the instance so the helper methods can read it.
        for self.table in self.data_current_all:
            self.current_json = json.loads(self.data_current_all[self.table])
            self.prior_fname = self.table + PRIOR_SUFFIX
            self.prior_object = s3.Object(BUCKET_NAME, self.prior_fname)
            self.write_new_data()

    def write_new_data(self):
        """If current data has changed since the last update of the Parquet file, add it
        to the Parquet file.  Save the current data as json to serve as the prior for
        the next comparison.

        Reads self.table, self.current_json, self.prior_fname and self.prior_object,
        which write_new_data_all() sets before calling this method.
        """

        # Get prior data json
        try:
            # load() raises ClientError with code "404" when the prior json is missing
            self.prior_object.load()
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print(f"Prior json for {self.table} doesn't exist")
                # Create the prior file
                self.save_prior_data()
                print(f"Created {self.prior_fname}")
            else:
                # Something else has gone wrong.
                raise
        else:
            # The prior data file exists
            self.get_prior_data()
            if self.data_changed():

                # Get a df with the changes between the prior and current json data
                df = jsons_to_df(
                    [self.prior_json, self.current_json], record_path=self.table)
                write_dataframe_to_parquet_on_s3(
                    df, self.table, self.table + HISTORY_SUFFIX)

                # save current data json as prior
                self.save_prior_data()
                print(
                    f"Replaced data in {self.prior_object.key} with current data.")
        print('\n')

    def get_prior_data(self):
        """Get prior data json from S3 and store it in self.prior_json."""
        prior = self.prior_object.get()['Body'].read().decode('utf-8')
        self.prior_json = json.loads(prior)
        print(f"Loaded prior {self.table} json data from S3")

    def data_changed(self):
        """Compare current data json with prior data json without their timestamps.  The timestamps
        on the current json will always be more recent even when none of the other data has changed.

        Returns True when the records stored under self.table differ, False otherwise.
        """
        if self.prior_json[self.table] == self.current_json[self.table]:
            print(
                f"No differences between current and prior {self.table} data were found.")
            return False
        else:
            print(
                f"Found differences between current and prior {self.table} data.")
            return True

    def save_prior_data(self):
        """Save the current data as prior data on S3 (UTF-8 encoded json under self.prior_fname)."""
        bucket.put_object(Key=self.prior_fname,
                          Body=bytes(json.dumps(self.current_json).encode('UTF-8')))

In [204]:
data = get_data()

In [207]:
type(data)

dict

In [268]:
# refactor ParquetWriter


class api_data():
    """One topic ('lifts', 'weather' or 'terrain') of API data, together with the
    prior snapshot of that topic stored as json on S3.

    Attributes
    ----------
    topic : name of the data topic; also the key of the records inside the json.
    current_json : the freshly fetched data, already parsed into a dict.
    prior_fname / prior_object : S3 key and object handle of the prior snapshot.
    prior_exists : whether the prior snapshot was found on S3 at construction time.
    prior_json : the parsed prior snapshot, or None until it has been loaded.
    """

    def __init__(self, topic: str, current_json: dict):
        self.topic = topic
        self.current_json = current_json
        # Loaded lazily by get_prior_data_json()
        self.prior_json = None
        # May not exist yet
        self.prior_fname = topic + PRIOR_SUFFIX
        self.prior_object = s3.Object(BUCKET_NAME, self.prior_fname)
        self.check_prior_object()

    def check_prior_object(self):
        """Check whether the prior data json exists on S3.

        Sets and returns self.prior_exists.  Any S3 error other than a 404 is re-raised.
        """
        try:
            self.prior_object.load()
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print(f"Prior json for {self.topic} doesn't exist")
                self.prior_exists = False
            else:
                # Something else has gone wrong.
                raise
        else:
            self.prior_exists = True
        return self.prior_exists

    def get_prior_data_json(self):
        """Get prior data json from S3.

        Returns the parsed prior data (also stored in self.prior_json), or None
        when no prior snapshot exists.
        """
        if not self.prior_exists:
            print(f"Prior json for {self.topic} doesn't exist")
            return None

        prior = self.prior_object.get()['Body'].read().decode('utf-8')
        self.prior_json = json.loads(prior)
        print(f"Loaded prior {self.topic} json data from S3")
        return self.prior_json

    def data_changed(self):
        """Compare current data json with prior data json without their timestamps.  The timestamps
        on the current json will always be more recent even when none of the other data has changed.

        The prior data is loaded on demand if it has not been fetched yet.  When no
        prior snapshot exists there is nothing to compare against, so the data is
        reported as changed.
        """
        if getattr(self, 'prior_json', None) is None:
            self.get_prior_data_json()

        prior_json = getattr(self, 'prior_json', None)
        if prior_json is None:
            return True

        if prior_json[self.topic] == self.current_json[self.topic]:
            print(
                f"No differences between current and prior {self.topic} data were found.")
            return False

        print(
            f"Found differences between current and prior {self.topic} data.")
        return True

    def save_prior_data(self):
        """Save the current data as prior data on S3."""
        bucket.put_object(Key=self.prior_fname,
                          Body=bytes(json.dumps(self.current_json).encode('UTF-8')))


class ParquetWriter():
    """Fetches the current API data and records whatever changed since the
    previous run in the per-topic Parquet history files on S3."""

    def __init__(self):
        # Current data for every topic, as returned by get_data()
        self.data_current_all = get_data()

    def write_new_data_all(self):
        """Process every topic (i.e. 'lifts', 'weather', 'terrain') returned
        by the API, one after another.
        """
        for topic in self.data_current_all:
            parsed = json.loads(self.data_current_all[topic])
            self.write_new_data(api_data(topic, parsed))

    def write_new_data(self, api_data):
        """Append a topic's changes to its Parquet history and refresh its prior snapshot.

        When no prior snapshot exists yet, the current data is simply stored as
        the prior.  Otherwise the prior is loaded and, only if the data differs,
        the changes are written to the history file and the prior is replaced
        with the current data.
        """
        if not api_data.prior_exists:
            print(f"Prior json for {api_data.topic} doesn't exist")
            # First run for this topic: the current data becomes the prior
            api_data.save_prior_data()
            print(f"Created {api_data.prior_fname}")
            print('\n')
            return

        api_data.get_prior_data_json()
        if api_data.data_changed():
            # Dataframe holding both snapshots, so the changes between them can be extracted
            snapshots = [api_data.prior_json, api_data.current_json]
            df = jsons_to_df(snapshots, record_path=api_data.topic)
            history_fname = api_data.topic + HISTORY_SUFFIX
            write_dataframe_to_parquet_on_s3(df, api_data.topic, history_fname)

            # The current data is the prior for the next comparison
            api_data.save_prior_data()
            print(
                f"Replaced data in {api_data.prior_object.key} with current data.")
        print('\n')

In [269]:
%%time
pr = ParquetWriter()
pr.write_new_data_all()

Prior json for lifts doesn't exist
Prior json for lifts doesn't exist
Created lifts_prior_DEV.json


Prior json for weather doesn't exist
Prior json for weather doesn't exist
Created weather_prior_DEV.json


Prior json for terrain doesn't exist
Prior json for terrain doesn't exist
Created terrain_prior_DEV.json


CPU times: user 142 ms, sys: 23 ms, total: 165 ms
Wall time: 2.61 s


In [261]:
%%time
pr = ParquetWriter()
pr.write_new_data_all()

Loaded prior lifts json data from S3
No differences between current and prior lifts data were found.


Loaded prior weather json data from S3
No differences between current and prior weather data were found.


Loaded prior terrain json data from S3
No differences between current and prior terrain data were found.


CPU times: user 143 ms, sys: 13.3 ms, total: 157 ms
Wall time: 1.7 s


In [183]:
import time
# Poll the API indefinitely; the loop only ends when the cell is interrupted.
while True:
    print('\n\n\n' + time.ctime() + ':\n---------------------')
    pr = ParquetWriter()
    pr.write_new_data_all()
    # Wait 1200 seconds (20 minutes) between polls
    time.sleep(1200)




Sun Feb  9 17:14:10 2020:
---------------------
Loaded prior lifts json data from S3
Found differences between current and prior lifts data.
File lifts_history_DEV.parquet found on S3.


  index_cols = [{'name': index_cols.name, 'start': index_cols._start,
  'stop': index_cols._stop, 'step': index_cols._step,
  'stop': index_cols._stop, 'step': index_cols._step,


Writing 40 records to lifts_history_DEV.parquet.
Replaced data in lifts_prior_DEV.json with current data.


Loaded prior weather json data from S3
Found differences between current and prior weather data.
File weather_history_DEV.parquet found on S3.
Writing 14 records to weather_history_DEV.parquet.
Replaced data in weather_prior_DEV.json with current data.


Prior json for terrain doesn't exist
Created terrain_prior_DEV.json





Sun Feb  9 17:34:16 2020:
---------------------
Loaded prior lifts json data from S3
No differences between current and prior lifts data were found.


Loaded prior weather json data from S3
Found differences between current and prior weather data.
File weather_history_DEV.parquet found on S3.
Writing 2 records to weather_history_DEV.parquet.
Replaced data in weather_prior_DEV.json with current data.


Loaded prior terrain json data from S3
No differences between current and prior terrain data were found.





Sun Feb  9 17:54:21 2020:
---------------------
L

File weather_history_DEV.parquet found on S3.
Writing 4 records to weather_history_DEV.parquet.
Replaced data in weather_prior_DEV.json with current data.


Loaded prior terrain json data from S3
No differences between current and prior terrain data were found.





Sun Feb  9 22:35:29 2020:
---------------------
Loaded prior lifts json data from S3
No differences between current and prior lifts data were found.


Loaded prior weather json data from S3
Found differences between current and prior weather data.
File weather_history_DEV.parquet found on S3.
Writing 1 records to weather_history_DEV.parquet.
Replaced data in weather_prior_DEV.json with current data.


Loaded prior terrain json data from S3
No differences between current and prior terrain data were found.





Sun Feb  9 22:55:35 2020:
---------------------
Loaded prior lifts json data from S3
No differences between current and prior lifts data were found.


Loaded prior weather json data from S3
Found differences between cu

KeyboardInterrupt: 

### Testing

In [22]:
# Load the prior json snapshot of each topic from S3 for inspection.
# `prior_object` and `prior` are reused for every topic, so after this cell
# they refer to the lifts snapshot.
prior_object = s3.Object(BUCKET_NAME, 'weather' + PRIOR_SUFFIX)
prior = prior_object.get()[
            'Body'].read().decode('utf-8')
weather_prior_json = json.loads(prior)

prior_object = s3.Object(BUCKET_NAME, 'terrain' + PRIOR_SUFFIX)
prior = prior_object.get()[
            'Body'].read().decode('utf-8')
terrain_prior_json = json.loads(prior)

prior_object = s3.Object(BUCKET_NAME, 'lifts' + PRIOR_SUFFIX)
prior = prior_object.get()[
            'Body'].read().decode('utf-8')
lifts_prior_json = json.loads(prior)

In [23]:
weather_prior_json

{'timestamp': '2020-02-06 12:15:01.284480-08:00',
 'weather': [{'newSnow': '1',
   'last24Hours': '2',
   'last48Hours': '3',
   'last7Days': '13',
   'midMountainBase': '58',
   'resortID': 1,
   'weatherForecast': [{'daycode': 0,
     'dayDescription': 'Today',
     'forecastString': '',
     'iconName': 'fog',
     'summaryDescription': 'Foggy',
     'temperatureHigh': '25',
     'temperatureLow': '4'}]},
  {'newSnow': '1',
   'last24Hours': '1',
   'last48Hours': '2',
   'last7Days': '10',
   'midMountainBase': '48',
   'resortID': 2,
   'weatherForecast': [{'daycode': 0,
     'dayDescription': 'Today',
     'forecastString': '',
     'iconName': 'fog',
     'summaryDescription': 'Foggy',
     'temperatureHigh': '26',
     'temperatureLow': '4'}]},
  {'newSnow': '1',
   'last24Hours': '1',
   'last48Hours': '2',
   'last7Days': '9',
   'midMountainBase': '51',
   'resortID': 3,
   'weatherForecast': [{'daycode': 0,
     'dayDescription': 'Today',
     'forecastString': '',
     'ic

In [93]:
# TBD: check that weather_prior_json['weather'][0].keys() matches list of expected columns (in case new ones are in use)
# Add as exception handling for the other tables as well
weather_prior_json['weather'][0].keys()

dict_keys(['newSnow', 'last24Hours', 'last48Hours', 'last7Days', 'midMountainBase', 'resortID', 'weatherForecast'])

In [221]:
# NOTE(review): these names are also defined in the module-code cell near the top of the
# notebook with the same values; running this cell rebinds them.
weather_meta_fields = ['newSnow', 'last24Hours', 'last48Hours', 'last7Days', 'midMountainBase', 'resortID']
# One ['weather', <field>] path per meta field, plus the top-level timestamp.
weather_meta = [['weather', i] for i in weather_meta_fields]
weather_meta.append('timestamp')
weather_record_path=['weather', 'weatherForecast']

In [222]:
weather_meta

[['weather', 'newSnow'],
 ['weather', 'last24Hours'],
 ['weather', 'last48Hours'],
 ['weather', 'last7Days'],
 ['weather', 'midMountainBase'],
 ['weather', 'resortID'],
 'timestamp']

In [223]:
test_weather = pd.json_normalize(
    data=weather_prior_json,
    record_path=['weather', 'weatherForecast'],
    meta=weather_meta,
    record_prefix='forecast.'


)



forecast.daycode                int64
forecast.dayDescription        object
forecast.forecastString        object
forecast.iconName              object
forecast.summaryDescription    object
forecast.temperatureHigh       object
forecast.temperatureLow        object
weather.newSnow                object
weather.last24Hours            object
weather.last48Hours            object
weather.last7Days              object
weather.midMountainBase        object
weather.resortID               object
timestamp                      object
dtype: object

In [246]:
test_weather.rename(columns={"weather.resortID": "resortID"}, inplace=True)

In [249]:
set(test_weather.columns)

{'forecast.dayDescription',
 'forecast.daycode',
 'forecast.forecastString',
 'forecast.iconName',
 'forecast.summaryDescription',
 'forecast.temperatureHigh',
 'forecast.temperatureLow',
 'resortID',
 'timestamp',
 'weather.last24Hours',
 'weather.last48Hours',
 'weather.last7Days',
 'weather.midMountainBase',
 'weather.newSnow'}

In [23]:
terrain_prior_json

{'timestamp': '2020-02-04 20:37:27.159818-08:00',
 'terrain': [{'runID': 10,
   'resortID': 1,
   'groomed': 'Yes',
   'runName': 'Blue Ox',
   'runType': 3,
   'status': 'O',
   'terrainName': 'Golden Peak'},
  {'runID': 11,
   'resortID': 1,
   'groomed': 'No',
   'runName': 'Boomer',
   'runType': 1,
   'status': 'O',
   'terrainName': 'Golden Peak'},
  {'runID': 12,
   'resortID': 1,
   'groomed': 'No',
   'runName': 'Brisk Walk',
   'runType': 1,
   'status': 'O',
   'terrainName': 'Golden Peak'},
  {'runID': 13,
   'resortID': 1,
   'groomed': 'Yes',
   'runName': 'Choker Cutoff',
   'runType': 2,
   'status': 'O',
   'terrainName': 'Golden Peak'},
  {'runID': 14,
   'resortID': 1,
   'groomed': 'No',
   'runName': 'Fall Line',
   'runType': 3,
   'status': 'O',
   'terrainName': 'Golden Peak'},
  {'runID': 15,
   'resortID': 1,
   'groomed': 'No',
   'runName': 'First Step',
   'runType': 3,
   'status': 'O',
   'terrainName': 'Golden Peak'},
  {'runID': 16,
   'resortID': 1,
  

In [244]:
test_terrain = pd.json_normalize(
    data=terrain_prior_json,
    record_path=['terrain'],
    meta='timestamp'
)

test_terrain.columns

Index(['runID', 'resortID', 'groomed', 'runName', 'runType', 'status',
       'terrainName', 'timestamp'],
      dtype='object')

In [160]:
# Field names of the first record of each subject, as sets so they can be compared.
terrain = set(terrain_prior_json['terrain'][0])
lifts = set(lifts_prior_json['lifts'][0])
weather = set(weather_prior_json['weather'][0])

In [245]:
weather

{'last24Hours',
 'last48Hours',
 'last7Days',
 'midMountainBase',
 'newSnow',
 'resortID',
 'weatherForecast'}

In [165]:
terrain.intersection(lifts)

{'resortID', 'status'}

In [270]:
jsons_to_df(lifts_prior_json, 'lifts').dtypes

liftID                                      category
resortID                                    category
liftName                                    category
status                                      category
timeToRide                                     int64
timestamp     datetime64[ns, pytz.FixedOffset(-480)]
dtype: object

In [276]:
jsons_to_df(terrain_prior_json, 'terrain').dtypes

runID                                        category
resortID                                     category
groomed                                      category
runName                                      category
runType                                      category
status                                       category
terrainName                                  category
timestamp      datetime64[ns, pytz.FixedOffset(-480)]
dtype: object

In [272]:
jsons_to_df(weather_prior_json, 'weather').dtypes

forecast.daycode                                             category
forecast.dayDescription                                        string
forecast.forecastString                                        string
forecast.iconName                                              string
forecast.summaryDescription                                    string
forecast.temperatureHigh                                        int64
forecast.temperatureLow                                         int64
weather.newSnow                                                 int64
weather.last24Hours                                             int64
weather.last48Hours                                             int64
weather.last7Days                                               int64
weather.midMountainBase                                         int64
resortID                                                     category
timestamp                      datetime64[ns, pytz.FixedOffset(-480)]
dtype: object

In [273]:
test_terrain.status.unique()

array(['O', 'X'], dtype=object)

In [274]:
??pd.json_normalize

In [207]:
parq_df.dtypes

liftID                                 category
resortID                               category
liftName                               category
status                                 category
timeToRide                                int64
timestamp     datetime64[ns, America/Vancouver]
dtype: object

### Load parquet and save as .csv

In [184]:
parq_df = load_dataframe_from_parquet_on_s3('terrain' + HISTORY_SUFFIX)

In [185]:
parq_df.sort_values(['resortID', 'runID', 'runName', 'timestamp'])

Unnamed: 0,runID,resortID,groomed,runName,runType,status,terrainName,timestamp
0,10,1,No,38,3,O,Vail Village,2020-02-09 17:14:12.103517-08:00
1,10,1,No,Apres Vous,3,O,Back Bowls,2020-02-09 17:14:12.103517-08:00
2,10,1,No,Baccarat,2,O,Lionshead,2020-02-09 17:14:12.103517-08:00
3,10,1,No,Big Rock Park,2,X,Blue Sky Basin,2020-02-09 17:14:12.103517-08:00
2179,10,1,Yes,Big Rock Park,2,X,Blue Sky Basin,2020-02-10 07:28:08.371531-08:00
...,...,...,...,...,...,...,...,...
2089,1749,17,No,Whispering Pines,2,O,Main Face,2020-02-09 17:14:12.103517-08:00
2090,1750,17,No,Whistler,2,O,Main Face,2020-02-09 17:14:12.103517-08:00
2091,1751,17,Yes,Zip,1,O,Main Face,2020-02-09 17:14:12.103517-08:00
2092,1752,17,Yes,Black Out,3,X,Main Face,2020-02-09 17:14:12.103517-08:00


In [186]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(parq_df.sort_values(['resortID', 'runID', 'runName', 'timestamp']).query('resortID == 13'))

Unnamed: 0,runID,resortID,groomed,runName,runType,status,terrainName,timestamp
1248,130,13,Yes,7th Avenue,1,O,7th Heaven,2020-02-09 17:14:12.103517-08:00
1249,130,13,No,7th Avenue,1,O,7th Heaven,2020-02-10 00:15:51.145508-08:00
2144,130,13,Yes,7th Avenue,1,O,7th Heaven,2020-02-10 04:05:29.776688-08:00
1250,130,13,Yes,Adagio - Lower,2,O,Symphony Amphitheatre,2020-02-09 17:14:12.103517-08:00
1251,130,13,No,Adagio - Lower,2,O,Symphony Amphitheatre,2020-02-10 00:15:51.145508-08:00
2276,130,13,Yes,Adagio - Lower,2,O,Symphony Amphitheatre,2020-02-10 07:28:08.371531-08:00
1252,130,13,Yes,Adult Learning/ Supercarpet,1,O,Village - Olympic - Fitzsimmons,2020-02-09 17:14:12.103517-08:00
1253,130,13,No,Adult Learning/ Supercarpet,1,O,Village - Olympic - Fitzsimmons,2020-02-10 00:15:51.145508-08:00
2277,130,13,Yes,Adult Learning/ Supercarpet,1,O,Village - Olympic - Fitzsimmons,2020-02-10 07:28:08.371531-08:00
1254,130,13,No,Arthur's Choice,3,O,Crystal Zone,2020-02-09 17:14:12.103517-08:00


In [96]:
parq_df = parq_df[parq_df['timestamp'] > '2020-02-08']

In [187]:
parq_df_dup = parq_df[parq_df.duplicated(subset=['resortID', 'runID', 'runName'], keep=False)]

For testing terrain:
resortID: 21, runID: 2, runName: "Bear Paw". terrainName: "Arrowhead" and "Resort Skiways (Access to/from Homesites and Lodging)"

In [188]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(parq_df_dup.sort_values(['resortID', 'runID', 'runName', 'timestamp']))

Unnamed: 0,runID,resortID,groomed,runName,runType,status,terrainName,timestamp
3,10,1,No,Big Rock Park,2,X,Blue Sky Basin,2020-02-09 17:14:12.103517-08:00
2179,10,1,Yes,Big Rock Park,2,X,Blue Sky Basin,2020-02-10 07:28:08.371531-08:00
17,12,1,Yes,Emperor's Choice,3,O,China Bowl,2020-02-09 17:14:12.103517-08:00
2180,12,1,No,Emperor's Choice,3,O,China Bowl,2020-02-10 07:28:08.371531-08:00
39,16,1,No,Grand Review,2,X,Blue Sky Basin,2020-02-09 17:14:12.103517-08:00
2181,16,1,Yes,Grand Review,2,X,Blue Sky Basin,2020-02-10 07:28:08.371531-08:00
46,17,1,No,Morning Thunder,3,O,China Bowl,2020-02-09 17:14:12.103517-08:00
2182,17,1,Yes,Morning Thunder,3,O,China Bowl,2020-02-10 07:28:08.371531-08:00
49,18,1,No,Cold Feet,1,O,Vail Village,2020-02-09 17:14:12.103517-08:00
2183,18,1,Yes,Cold Feet,1,O,Vail Village,2020-02-10 07:28:08.371531-08:00


Hypothesis: each run can be uniquely identified by the combination of `resortID`, `runID` and `runName` (i.e. `terrainName` is not needed). The assertion below checks this:

In [202]:
# If (resortID, runID, runName) really identifies a run, adding terrainName to the
# key must not create any additional distinct combinations.  Counting distinct keys
# (rather than comparing frames de-duplicated with keep=False) also catches runs whose
# conflicting terrainName values each occur more than once in the history.
run_key = ['resortID', 'runID', 'runName']
n_runs = len(parq_df.drop_duplicates(subset=run_key))
n_run_terrain_pairs = len(parq_df.drop_duplicates(subset=run_key + ['terrainName']))
assert n_runs == n_run_terrain_pairs, (
    f"{n_run_terrain_pairs - n_runs} extra (run, terrainName) combination(s): "
    "resortID/runID/runName does not determine terrainName"
)

AssertionError: 

In [200]:
# Distinct runs keyed without (df1) and with (df2) terrainName.
df1 = parq_df.drop_duplicates(subset=['resortID', 'runID', 'runName'])
df2 = parq_df.drop_duplicates(subset=['resortID', 'runID', 'runName', 'terrainName'])

# Rows that occur in only one of the two frames: after stacking them, keep=False
# drops every row that has an identical twin.  The index is deliberately not reset,
# so the labels shown are the original parq_df labels of the differing rows.
df = pd.concat([df1, df2])
df.drop_duplicates(keep=False)

Unnamed: 0,runID,resortID,groomed,runName,runType,status,terrainName,timestamp
2171,1314,13,Yes,Tube Park ByPass,1,O,Excalibur – Blackcomb Gondola Lower,2020-02-10 04:05:29.776688-08:00


In [24]:
parq_df = load_dataframe_from_parquet_on_s3('lifts' + HISTORY_SUFFIX)
lifts_status_changes_parq_df = get_status_durations(parq_df)
lifts_status_changes_parq_df.to_csv(
    DATA_DIR + "lifts_status_changes_parq.csv", date_format='%c')

TBD: testing
- no NaN
- result is expected size
- no duplicate rows
- no duplicate information in adjacent rows by time

### S3 Object Deletion

In [266]:
# WARNING: destructive.  Deletes the '<subject>' + PRIOR_SUFFIX object of each of the
# three subjects (lifts, terrain, weather) from the BUCKET_NAME bucket.
s3.Object(BUCKET_NAME, 'lifts' + PRIOR_SUFFIX).delete()
s3.Object(BUCKET_NAME, 'terrain' + PRIOR_SUFFIX).delete()
s3.Object(BUCKET_NAME, 'weather' + PRIOR_SUFFIX).delete()

{'ResponseMetadata': {'RequestId': 'ABED6568666AEFF9',
  'HostId': 'EhFYPLGfvpuXY/NdjljaP9M7VoUo9tmYg4zJyoc85M3ejj/i3c+ku8S137afe7SwSYpkXEpWCx4=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'EhFYPLGfvpuXY/NdjljaP9M7VoUo9tmYg4zJyoc85M3ejj/i3c+ku8S137afe7SwSYpkXEpWCx4=',
   'x-amz-request-id': 'ABED6568666AEFF9',
   'date': 'Mon, 10 Feb 2020 19:15:41 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

In [267]:
# WARNING: destructive.  For each of the three subjects, deletes every object in the
# bucket whose key starts with '<subject>' + HISTORY_SUFFIX + '/'.
bucket.objects.filter(Prefix='lifts' + HISTORY_SUFFIX + '/').delete()
bucket.objects.filter(Prefix='terrain' + HISTORY_SUFFIX + '/').delete()
bucket.objects.filter(Prefix='weather' + HISTORY_SUFFIX + '/').delete()

[{'ResponseMetadata': {'RequestId': '12AC49F0361A723D',
   'HostId': 'ZGosEYms5V8EF12ROQZpGvMzqd7O4/D+JAD+5QGbYmLSUnfFsBBRi701ZXt1yFYa9Jsump3FksM=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'ZGosEYms5V8EF12ROQZpGvMzqd7O4/D+JAD+5QGbYmLSUnfFsBBRi701ZXt1yFYa9Jsump3FksM=',
    'x-amz-request-id': '12AC49F0361A723D',
    'date': 'Mon, 10 Feb 2020 19:15:49 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'weather_history_DEV.parquet/part.29.parquet'},
   {'Key': 'weather_history_DEV.parquet/part.26.parquet'},
   {'Key': 'weather_history_DEV.parquet/part.63.parquet'},
   {'Key': 'weather_history_DEV.parquet/part.87.parquet'},
   {'Key': 'weather_history_DEV.parquet/part.84.parquet'},
   {'Key': 'weather_history_DEV.parquet/part.11.parquet'},
   {'Key': 'weather_history_DEV.parquet/part.54.parquet'},
   {'Key': 'weather_history_DEV.parquet/part.5

## Original Version
Handles lifts only

In [14]:


# `scrape` is imported from its own folder by temporarily changing the working
# directory.  The starting directory is restored in `finally`, so a failed import
# does not leave the kernel in the wrong directory (which would break re-running
# this cell and every later relative path).
_notebook_dir = os.getcwd()
os.chdir("../src/data/snowbot_AWS_lambda/")
try:
    from scrape import get_data
finally:
    os.chdir(_notebook_dir)

In [47]:
HISTORY_FNAME = 'wb_lifts_history.parquet'
PRIOR_STATUS_FNAME = 'lifts_prior.json'

class ParquetWriter():
    """Detects changed lift data and appends it to the Parquet file on S3."""

    def __init__(self):
        # get_data()['lifts'] is a JSON string; parse it once up front.
        self.lifts_current_json = json.loads(get_data()['lifts'])

        # Handle to the S3 object that stores the previously seen lift data.
        self.lifts_prior_object = s3.Object(BUCKET_NAME, PRIOR_STATUS_FNAME)

    def write_new_data(self):
        """Append changed lift data to the Parquet file and refresh the stored prior.

        When no prior json exists on S3 yet, the current data is saved as the prior
        and nothing is written to the Parquet file.  Otherwise the prior is loaded
        and, only if its lift data differs from the current data, both snapshots are
        converted to a dataframe, written to HISTORY_FNAME, and the current data
        replaces the prior.

        Raises:
            botocore.exceptions.ClientError: when loading the prior object fails
                with an error code other than "404".
        """
        try:
            self.lifts_prior_object.load()
        except botocore.exceptions.ClientError as err:
            if err.response['Error']['Code'] != "404":
                # Something else has gone wrong.
                raise
            print("Prior doesn't exist")
            # Create the prior file so the next run has something to compare with.
            self.save_prior_data()
            print(f"Created {PRIOR_STATUS_FNAME}")
            return

        # The prior exists.
        self.get_prior_data()
        if not self.data_changed():
            return

        # Dataframe with the status changes between the prior and current json data.
        history_df = jsons_to_df([self.lifts_prior_json, self.lifts_current_json])
        write_dataframe_to_parquet_on_s3(history_df, HISTORY_FNAME)

        # The current data becomes the prior for the next comparison.
        self.save_prior_data()
        print(
            f"Replaced data in {self.lifts_prior_object.key} with current data.")

    def get_prior_data(self):
        """Download the prior json from S3 and parse it into `self.lifts_prior_json`."""
        response = self.lifts_prior_object.get()
        raw_text = response['Body'].read().decode('utf-8')
        self.lifts_prior_json = json.loads(raw_text)
        print("Loaded prior json data from S3")

    def data_changed(self):
        """Return True when the 'lifts' entries of the current and prior json differ.

        Only the 'lifts' values are compared, so the timestamps (which are always
        more recent on the current json) do not count as a change.
        """
        same = self.lifts_prior_json['lifts'] == self.lifts_current_json['lifts']
        if same:
            print("No differences between current and prior data were found.")
        else:
            print("Found differences between current and prior data.")
        return not same

    def save_prior_data(self):
        """Upload the current data to S3 as the prior for the next comparison."""
        payload = json.dumps(self.lifts_current_json).encode('UTF-8')
        bucket.put_object(Key=PRIOR_STATUS_FNAME, Body=payload)

In [117]:
ParquetWriter().write_new_data()

Loaded prior json data from S3
Found differences between current and prior data.
File wb_lifts_history.parquet found.


  index_cols = [{'name': index_cols.name, 'start': index_cols._start,
  'stop': index_cols._stop, 'step': index_cols._step,
  'stop': index_cols._stop, 'step': index_cols._step,


Writing 14 records to wb_lifts_history.parquet.
Replaced data in lifts_prior.json with current data.


**Warnings**

See https://github.com/dask/fastparquet/issues/477 for fastparquet warnings about `RangeIndex._start, RangeIndex._stop, RangeIndex._step`


    /Users/paul/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:90: FutureWarning: pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead
    /Users/paul/anaconda3/lib/python3.7/site-packages/fastparquet/writer.py:655: FutureWarning: RangeIndex._start is deprecated and will be removed in a future version. Use RangeIndex.start instead
      index_cols = [{'name': index_cols.name, 'start': index_cols._start,
    /Users/paul/anaconda3/lib/python3.7/site-packages/fastparquet/writer.py:656: FutureWarning: RangeIndex._stop is deprecated and will be removed in a future version. Use RangeIndex.stop instead
      'stop': index_cols._stop, 'step': index_cols._step,
    /Users/paul/anaconda3/lib/python3.7/site-packages/fastparquet/writer.py:656: FutureWarning: RangeIndex._step is deprecated and will be removed in a future version. Use RangeIndex.step instead
      'stop': index_cols._stop, 'step': index_cols._step,

In [375]:
%time parq_df = load_dataframe_from_parquet_on_s3(HISTORY_FNAME)

lifts_status_changes_parq_df = get_status_durations(parq_df)
lifts_status_changes_parq_df.to_csv(DATA_DIR + "lifts_status_changes_parq.csv", date_format='%c')
lifts_status_changes_parq_df

CPU times: user 267 ms, sys: 11.4 ms, total: 279 ms
Wall time: 2.34 s


Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp,time_diff,time_diff_seconds
0,3,13,7th Heaven Express,O,6,2020-02-03 14:21:57.064742-08:00,00:47:48.055160,2868.055160
29,3,13,7th Heaven Express,X,6,2020-02-03 15:09:45.119902-08:00,18:11:29.035149,65489.035149
67,3,13,7th Heaven Express,O,6,2020-02-04 09:21:14.155051-08:00,05:49:50.203591,20990.203591
79,3,13,7th Heaven Express,X,6,2020-02-04 15:11:04.358642-08:00,18:39:59.624647,67199.624647
140,3,13,7th Heaven Express,O,6,2020-02-05 09:51:03.983289-08:00,05:19:59.834930,19199.834930
...,...,...,...,...,...,...,...,...
66,72,13,Whistler Village Gondola Upper,O,11,2020-02-04 09:11:14.369598-08:00,06:29:50.017407,23390.017407
98,72,13,Whistler Village Gondola Upper,X,11,2020-02-04 15:41:04.387005-08:00,16:09:59.992999,58199.992999
118,72,13,Whistler Village Gondola Upper,H,11,2020-02-05 07:51:04.380004-08:00,00:29:59.550732,1799.550732
130,72,13,Whistler Village Gondola Upper,O,11,2020-02-05 08:21:03.930736-08:00,07:19:59.706996,26399.706996


In [124]:
parq_df.status.cat.categories

Index(['X', 'H', 'O'], dtype='object')

In [125]:
print(*parq_df.status)

O O O O O O O O O O O O X X O O O X O O O X O O O X O O O X X X X X X X X X X X X X X X X X X X X O O O O O O O O O O O O O O O O O O O O O O O O O X O


In [186]:
parq_df.sort_values(["liftName", "timestamp"])

Unnamed: 0,liftID,resortID,liftName,status,timeToRide,timestamp
0,3,13,7th Heaven Express,O,6,2020-02-03 14:21:57.064742-08:00
29,3,13,7th Heaven Express,X,6,2020-02-03 15:09:45.119902-08:00
67,3,13,7th Heaven Express,O,6,2020-02-04 09:21:14.155051-08:00
1,36,13,Big Red Express,O,8,2020-02-03 14:21:57.064742-08:00
35,36,13,Big Red Express,X,8,2020-02-03 15:37:30.556825-08:00
...,...,...,...,...,...,...
47,33,13,Whistler Village Gondola Lower,X,5,2020-02-03 15:37:30.556825-08:00
65,33,13,Whistler Village Gondola Lower,O,5,2020-02-04 09:11:14.369598-08:00
28,72,13,Whistler Village Gondola Upper,O,11,2020-02-03 14:21:57.064742-08:00
48,72,13,Whistler Village Gondola Upper,X,11,2020-02-03 15:37:30.556825-08:00


### Issue when running get_status_durations(parq_df)
Resulting in error:

    ~/anaconda3/lib/python3.7/site-packages/pandas/core/arrays/categorical.py in from_codes(cls, codes, categories, ordered, dtype)
        705 
        706         if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
    --> 707             raise ValueError("codes need to be between -1 and " "len(categories)-1")
        708 
        709         return cls(codes, dtype=dtype, fastpath=True)

    ValueError: codes need to be between -1 and len(categories)-1


Same error seen when running `parq_df[['status']].sort_values(by=['status'])`

This was caused by missing categories (`H`) in the `status` column (and maybe others)

#### Code to inspect issue:

In [228]:
# Test for issue
parq_df[['status']].sort_values(by=['status'])

Unnamed: 0,status
0,H
59,H
58,H
57,H
56,H
...,...
38,O
39,O
40,O
42,O


In [280]:
parq_df.status.cat.categories

Index(['X', 'H', 'O'], dtype='object')

In [281]:
print(*parq_df.status.cat.codes)

2 2 2 2 2 0 2 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2 0 2 0 2 0 2 2 2


In [205]:
len(parq_df.status.cat.codes)

65

In [185]:
# Should be false
parq_df.status.cat.codes.max() >= len(parq_df.status.dtype.categories)

False

In [186]:
# Should be false
parq_df.liftName.cat.codes.min() < -1

False

In [None]:
# Only categorical columns have the `.cat` accessor; looping over every column
# raises AttributeError on the non-categorical ones (e.g. the int64 timeToRide).
for c in parq_df.select_dtypes(include='category').columns:
    print(parq_df[c].cat.categories)

In [36]:
parq_df.dtypes

liftID                   category
resortID                 category
liftName                 category
status                   category
timeToRide                  int64
timestamp     datetime64[ns, UTC]
dtype: object

In [113]:
lifts_status_changes_df.dtypes

liftID                                      category
resortID                                    category
liftName                                    category
status                                      category
timeToRide                                     int64
timestamp     datetime64[ns, pytz.FixedOffset(-480)]
dtype: object

In [106]:
parq_df["timestamp"] = pd.to_datetime(pd.Series(np.asarray(parq_df["timestamp"])))

In [73]:
read_file = f"s3://{BUCKET_NAME}/{fname}.parquet"
pf = ParquetFile(read_file, open_with=myopen)

# Check the categories for a specific row group
pf.grab_cats(columns='status', row_group_index=1)

{'status': array(['H', 'O', 'X'], dtype=object)}

In [66]:
# If partitioning by column, gives known values for each column
pf.cats

{}

#### Possible solutions
1. Remove partitioning by date column when writing to parquet
2. **Set status categories manually via `set_categories` (and for any other columns with the same issue). See https://github.com/dask/dask/issues/2944**
3. Leave problem columns as text-based when writing and loading from parquet

# Testing timestamps for file loading

In [9]:
read_file = f"s3://{BUCKET_NAME}/{fname}.parquet"
pf = ParquetFile(read_file, open_with=myopen)
test = pf.to_pandas()["timestamp"]

In [100]:
# If needed: convert a categorical timestamp column back to a regular datetime column.
# NOTE(review): np.asarray on timezone-aware data triggers the FutureWarning shown in
# the output, which says the result is currently timezone-naive datetime64[ns] —
# confirm that dropping the timezone here is intended.
df["timestamp"] = pd.to_datetime(pd.Series(np.asarray(df["timestamp"])))

	To accept the future behavior, pass 'dtype=object'.
	To keep the old behavior, pass 'dtype="datetime64[ns]"'.
  


In [101]:
# tz_convert returns a new Series, so rebind `test` to it.  Assigning the result to
# `test.dt` would only set an attribute and leave the values of `test` unconverted.
test = test.dt.tz_convert(tz='America/Vancouver')

/Users/paul/anaconda3/lib/python3.7/site-packages/pandas/core/series.py:597: FutureWarning: Converting timezone-aware DatetimeArray to timezone-naive ndarray with 'datetime64[ns]' dtype. In the future, this will return an ndarray with 'object' dtype where each element is a 'pandas.Timestamp' with the correct 'tz'.
	To accept the future behavior, pass 'dtype=object'.
	To keep the old behavior, pass 'dtype="datetime64[ns]"'.


more info: https://pandas-docs.github.io/pandas-docs-travis/whatsnew/v0.24.0.html#converting-timezone-aware-series-and-index-to-numpy-arrays

In [102]:
load_dataframe_from_parquet_on_s3(fname).dtypes

liftID        category
resortID      category
liftName      category
status        category
timeToRide       int64
timestamp     category
dtype: object

### Testing local parquet saves

In [None]:
def save_parquet(df, fname):
    """Write `df` to a local Parquet dataset under DATA_DIR, partitioned on 'timestamp'.

    Args:
        df: dataframe with a timezone-aware 'timestamp' column.  The column is
            converted to UTC on the passed-in frame itself, so pass a copy if the
            original must stay untouched.
        fname: file name without the '.parquet' extension.
    """
    # parquet engines don't handle shifted timezones
    # Replace the column (df[...] = ...) instead of using df.loc[:, ...] = ...: newer
    # pandas versions treat the .loc form as an in-place set into the existing column,
    # which keeps the column's original timezone and silently loses the conversion.
    df['timestamp'] = df['timestamp'].dt.tz_convert(pytz.utc)

    # Note: May need snappy-python as a req to run on AWS Lambda
    df.to_parquet(DATA_DIR + fname + '.parquet',
                  engine='fastparquet',
                  partition_on=['timestamp'],
                  file_scheme='mixed')

In [91]:
save_parquet(df[0:3].copy(), 'wb_lifts_history')

In [92]:
# Append two more rows (positions 20-21 of df) to the local 'wb_lifts_history'
# parquet dataset, using the same engine and partitioning as save_parquet.
df.iloc[20:22, :].copy().to_parquet(DATA_DIR + 'wb_lifts_history' + '.parquet',
              engine='fastparquet',
              partition_on=['timestamp'],
              file_scheme='mixed',
              append=True)
# TODO: catch the exception raised here if the file doesn't exist

In [None]:
# todo: change time_diff to "duration"
# test on lambda
# make a datatype dict and a general set-datatypes function


# Utilities

In [118]:
def del_lifts_history():
    """Delete the HISTORY_FNAME object from the BUCKET_NAME bucket on S3."""
    history_object = s3.Object(BUCKET_NAME, HISTORY_FNAME)
    history_object.delete()

def del_lifts_prior():
    """Delete the PRIOR_STATUS_FNAME object from the BUCKET_NAME bucket on S3."""
    prior_object = s3.Object(BUCKET_NAME, PRIOR_STATUS_FNAME)
    prior_object.delete()

In [121]:
del_lifts_history()
del_lifts_prior()

# Notes
- Terrain data: runs need to be identified via combination of `resortID`, `runName` and `runID`
- There are run IDs that repeat for the same resort (e.g. for Vail, `resortID == 1`, `runID == 10`)
- TBD: Are the combination of `resortID`, `runID`, and `runType` always unique?

## To do
- differentiate among the 3 diff pieces of API data by 'subject'
- option to get data for specific subject(s)
- return data object with filter_by_resort() method
- live_data class?  Subclass for each subject?