### Import Libraries

The first step is to import the libraries that we will need.

In [1]:
import json
import os
import datetime

import wallaroo
from wallaroo.object import EntityNotFoundError

# used to display dataframe information without truncating
from IPython.display import display
import pandas as pd
import numpy as np

from resources import simdb
from resources import util

pd.set_option('display.max_colwidth', None)


### Initialize connection

Start a connect to the Wallaroo instance and save the connection into the variable `wl`.

In [2]:
# Login through local Wallaroo instance

wl = wallaroo.Client()

wallarooPrefix = "doc-test"
wallarooSuffix = "wallaroocommunity.ninja"

wl = wallaroo.Client(api_endpoint=f"https://{wallarooPrefix}.api.{wallarooSuffix}", 
                    auth_endpoint=f"https://{wallarooPrefix}.keycloak.{wallarooSuffix}", 
                    auth_type="sso")

Please log into the following URL in a web browser:

	https://doc-test.keycloak.wallaroocommunity.ninja/auth/realms/master/device?user_code=OAOE-LNZA

Login successful!


### Set Configurations

The following will set the workspace, model name, and pipeline that will be used for this example.  If the workspace or pipeline already exist, then they will assigned for use in this example.  If they do not exist, they will be created based on the names listed below.

In [3]:
workspace_name = 'moto2'
pipeline_name = 'bikedaypipe'
model_name = 'bikedaymodel'

## Set the Workspace and Pipeline


In [4]:
def get_workspace(name):
    workspace = None
    for ws in wl.list_workspaces():
        if ws.name() == name:
            workspace= ws
    if(workspace == None):
        workspace = wl.create_workspace(name)
    return workspace

def get_pipeline(name):
    try:
        pipeline = wl.pipelines_by_name(name)[0]
    except EntityNotFoundError:
        pipeline = wl.build_pipeline(name)
    return pipeline

workspace = get_workspace(workspace_name)

wl.set_current_workspace(workspace)

pipeline = get_pipeline(pipeline_name)



In [40]:
model_file_name = 'forecast.py'

bike_day_model = wl.upload_model(model_name, model_file_name).configure(runtime="python")

In [41]:
pipeline.add_model_step(bike_day_model)

0,1
name,bikedaypipe
created,2023-05-22 17:16:59.400429+00:00
last_updated,2023-05-22 17:38:08.964945+00:00
deployed,True
tags,
versions,"6aede63f-cf29-4cb1-9cd4-2c5548a11357, a061b5a3-6ed3-4e90-b950-90d7bb9e57d1, 441faa7f-1957-4bf0-b4fe-eaa79a671fbe, 52e355a8-43a5-4a23-9aec-320e0d7b6f36, 05a5bad1-878b-43d1-b81b-59874111cad8"
steps,bikedaymodel


In [42]:
# Set the deployment to allow for additional engines to run
deploy_config = (wallaroo.DeploymentConfigBuilder()
                        .replica_count(4)
                        .build()
                    )

pipeline.deploy(deployment_config = deploy_config)

Exception: Failed to insert model configs

### Run Inference
For this example, we will forecast bike rentals for the following seven days, based on the previous month's rentals (inclusive of "today").

In [9]:
# retrieve forecast schedule
first_day, analysis_days = util.get_forecast_days()

print(f'Running analysis on {first_day}')

Running analysis on 2011-02-22


In [29]:
# connect to SQL data base 
conn = simdb.get_db_connection()
print(f'Bike rentals table: {simdb.tablename}')

# create the query and retrieve data
query = util.mk_dt_range_query(tablename=simdb.tablename, forecast_day=first_day)
print(query)
data = pd.read_sql_query(query, conn)
data.head()

Bike rentals table: bikerentals
select cnt from bikerentals where date > DATE(DATE('2011-02-22'), '-1 month') AND date <= DATE('2011-02-22')


Unnamed: 0,cnt
0,986
1,1416
2,1985
3,506
4,431


In [31]:
pd.read_sql_query("select date, cnt from bikerentals where date > DATE(DATE('2011-02-22'), '-1 month') AND date <= DATE('2011-02-22') LIMIT 5", conn)

Unnamed: 0,date,cnt
0,2011-01-23,986
1,2011-01-24,1416
2,2011-01-25,1985
3,2011-01-26,506
4,2011-01-27,431


In [32]:
# send data to model for forecast

results = pipeline.infer(data.to_dict(orient='list'))[0]
results


RuntimeError: Inference did not return within 15s, adjust if necessary

In [13]:
# annotate with the appropriate dates (the next seven days)
resultframe = pd.DataFrame({
    'date' : util.get_forecast_dates(first_day),
    'forecast' : results['forecast']
})

# write the new data to the db table "bikeforecast"
resultframe.to_sql('bikeforecast', conn, index=False, if_exists='append')

# display the db table
query = "select date, forecast from bikeforecast"
pd.read_sql_query(query, conn)

Unnamed: 0,date,forecast
0,2011-02-23,1462
1,2011-02-24,1483
2,2011-02-25,1497
3,2011-02-26,1507
4,2011-02-27,1513
5,2011-02-28,1518
6,2011-03-01,1521


In [15]:
pipeline.url()

'https://doc-test.api.wallaroocommunity.ninja/v1/api/pipelines/infer/bikedaypipe-12/bikedaypipe'

Normally here we would close the database connection and undeploy the pipeline until the next forecast run.

# Four weeks of inferences
We'll do it all in a loop, to get inferences for the entire month of March.

In [27]:
import asyncio
import httpx
import collections
import json

async def parallel_infer(pipe, dataset, content_type, timeout_secs, num_parallel):
    """ Runs inference on a list of data in parallel and returns a list of results.
        pipe:         The pipeline to use for inference
        dataset:      A list of data objects to call pipe.infer() with. Can be dataframes, Python dictionary, or Arrow table
        timeout:      How long to wait for an inference result before timing out. Should be 2-3x the longest inference time to allow for queuing.
        num_parallel: The number of jobs to submit in parallel, should be equal to the number of replicas, or up to 2x
    """
    # Limit the concurrency so we don't run into issues with connections timing out due to
    # submitting 1000's of jobs per replica
    semaphore = asyncio.Semaphore(num_parallel)
    url = pipe.url()
    print(f"Parallel inference: {len(dataset)} jobs to {url}")
    headers = wl.auth.auth_header()
    # set Content-Type type
    headers['Content-Type']=content_type
    print(headers)
    async def infer(client, idx, url, df):
        resp = None
        async with semaphore:
            print(f"Submitted {idx}")
            now = datetime.datetime.now()
            # set the data type depending on the content type
            if content_type == "application/json":
                js = json.dumps(df)
            if content_type == "application/json; format=pandas-records":
                js = df.to_json()
            print(js)
            nowwow = datetime.datetime.now()
            resp = await client.post(url, headers=headers, data=js)
            if resp.status_code != 200:
                print(f"Retrying #{idx}: status code={resp.status_code} {resp.text}")
                resp = await client.post(url, headers=headers, data=df)
                if resp.status_code != 200:
                    print(f"Job #{idx}: failed after two tries")
                    return None

            bow = datetime.datetime.now()
            js = resp.json()
            print(js)
            # dct = js["outputs"][0]["Json"]["data"][0]
            bowwow = datetime.datetime.now()
        
            print(f"Job #{idx}: complete {(nowwow - now).total_seconds()} {(bowwow - bow).total_seconds()} {len(resp.text)} {(datetime.datetime.now() - now).total_seconds()} {js['elapsed']/1000000000}")
            
            if content_type == "application/json":
                return pd.DataFrame.from_dict(js)
            if content_type == "application/json; format=pandas-records":
                return pd.DataFrame.from_records(js)

    results = []
    async with httpx.AsyncClient(timeout=timeout_secs) as client:
        results = await asyncio.gather(*[infer(client, idx, url, df) for idx, df in enumerate(dataset)])

    return pd.concat((r for r in results if not None))

In [23]:
datasource = data.to_dict(orient='list')

display(datasource)

{'cnt': [986,
  1416,
  1985,
  506,
  431,
  1167,
  1098,
  1096,
  1501,
  1360,
  1526,
  1550,
  1708,
  1005,
  1623,
  1712,
  1530,
  1605,
  1538,
  1746,
  1472,
  1589,
  1913,
  1815,
  2115,
  2475,
  2927,
  1635,
  1812,
  1107,
  1450]}

In [28]:
await parallel_infer(pipeline, datasource, "application/json", "120", 8)

Parallel inference: 1 jobs to https://doc-test.api.wallaroocommunity.ninja/v1/api/pipelines/infer/bikedaypipe-12/bikedaypipe
{'Authorization': 'Bearer eyJhbGciOiJSUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJnS1IzWHRTQnlhampFc3hTMlNyVkNlbEZ1Zm1Jdi1mZlNyWV9rNWdLLVBrIn0.eyJleHAiOjE2ODQ3NzY4MDIsImlhdCI6MTY4NDc3Njc0MiwiYXV0aF90aW1lIjoxNjg0Nzc1ODEyLCJqdGkiOiJiMzI2NWRkZC1kZjRlLTQ3NjgtYThlZi0wODk2YWMwZDE3MGIiLCJpc3MiOiJodHRwczovL2RvYy10ZXN0LmtleWNsb2FrLndhbGxhcm9vY29tbXVuaXR5Lm5pbmphL2F1dGgvcmVhbG1zL21hc3RlciIsImF1ZCI6WyJtYXN0ZXItcmVhbG0iLCJhY2NvdW50Il0sInN1YiI6ImJlYjYzOWVmLTEwZTktNGEwNi1iYjIwLWFlMzM1NzZjMjMzNCIsInR5cCI6IkJlYXJlciIsImF6cCI6InNkay1jbGllbnQiLCJzZXNzaW9uX3N0YXRlIjoiZjg5M2YwYzQtYmVkYS00NjAyLWE4MjktMjdjYzc0ZTE2NWY5IiwiYWNyIjoiMSIsInJlYWxtX2FjY2VzcyI6eyJyb2xlcyI6WyJkZWZhdWx0LXJvbGVzLW1hc3RlciIsIm9mZmxpbmVfYWNjZXNzIiwidW1hX2F1dGhvcml6YXRpb24iXX0sInJlc291cmNlX2FjY2VzcyI6eyJtYXN0ZXItcmVhbG0iOnsicm9sZXMiOlsibWFuYWdlLXVzZXJzIiwidmlldy11c2VycyIsInF1ZXJ5LWdyb3VwcyIsInF1ZXJ5LXVzZXJzIl19LCJhY2Nvd

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [38]:
# get our list of items to run through

# inference_data = []

content_type = "application/json"

for day in analysis_days:
    print(f"Current date: {day}")
    query = util.mk_dt_range_query(tablename=simdb.tablename, forecast_day=day)
    print(query)
    data = pd.read_sql_query(query, conn)

    # send data to model for forecast
    results = pipeline.infer(data.to_dict(orient='list'))[0]
    
    # annotate with the appropriate dates (the next seven days)
    resultframe = pd.DataFrame({
        'date' : util.get_forecast_dates(day),
        'forecast' : results['forecast']
    })
    
    # write the new data to the db table "bikeforecast"
    resultframe.to_sql('bikeforecast', conn, index=False, if_exists='append')

Current date: 2011-03-01
select cnt from bikerentals where date > DATE(DATE('2011-03-01'), '-1 month') AND date <= DATE('2011-03-01')
Current date: 2011-03-08
select cnt from bikerentals where date > DATE(DATE('2011-03-08'), '-1 month') AND date <= DATE('2011-03-08')
Current date: 2011-03-15
select cnt from bikerentals where date > DATE(DATE('2011-03-15'), '-1 month') AND date <= DATE('2011-03-15')
Current date: 2011-03-22
select cnt from bikerentals where date > DATE(DATE('2011-03-22'), '-1 month') AND date <= DATE('2011-03-22')
Current date: 2011-03-29
select cnt from bikerentals where date > DATE(DATE('2011-03-29'), '-1 month') AND date <= DATE('2011-03-29')


On April 1st, we can compare March forecasts to actuals

In [39]:
query = f'''SELECT bikeforecast.date AS date, forecast, cnt AS actual
            FROM bikeforecast LEFT JOIN bikerentals
            ON bikeforecast.date = bikerentals.date
            WHERE bikeforecast.date >= DATE('2011-03-01')
            AND bikeforecast.date <  DATE('2011-04-01')
            ORDER BY 1'''

print(query)


comparison = pd.read_sql_query(query, conn)
comparison

SELECT bikeforecast.date AS date, forecast, cnt AS actual
            FROM bikeforecast LEFT JOIN bikerentals
            ON bikeforecast.date = bikerentals.date
            WHERE bikeforecast.date >= DATE('2011-03-01')
            AND bikeforecast.date <  DATE('2011-04-01')
            ORDER BY 1


Unnamed: 0,date,forecast,actual
0,2011-03-01,1462,1851
1,2011-03-01,1521,1851
2,2011-03-01,1521,1851
3,2011-03-01,1521,1851
4,2011-03-01,1521,1851
5,2011-03-02,1764,2134
6,2011-03-03,1749,1685
7,2011-03-04,1743,1944
8,2011-03-05,1741,2077
9,2011-03-06,1740,605


### Undeploy the Pipeline

Undeploy the pipeline and return the resources back to the Wallaroo instance.

In [40]:
conn.close()
pipeline.undeploy()

0,1
name,bikedaypipe
created,2023-05-18 18:55:59.011989+00:00
last_updated,2023-05-18 20:55:35.618000+00:00
deployed,False
tags,
versions,"1e7499cb-ea61-4836-99bc-11d4693611cb, 2b9c771d-6c7b-4618-ac1b-c18ac4d01f9e, d1be9e11-4c80-405b-9fde-c8d3a6737736, cc55d2e4-bee0-4865-8bcb-2c54bd21a820, 826a7a0a-449e-475e-ab01-816fcb7ff488, af2cf9d4-1861-4771-a3f0-5bbc5b139c64, 8bd2604a-ea82-4890-80d9-051c9f4e07bd, f1b8b382-5286-4c6f-b452-42c808f65801"
steps,bikedaymodel
