### Import Libraries

The first step is to import the libraries that we will need.

In [None]:
import json
import os
import datetime

import wallaroo
from wallaroo.object import EntityNotFoundError

# used to display dataframe information without truncating
from IPython.display import display
import pandas as pd
import numpy as np

from resources import simdb
from resources import util

pd.set_option('display.max_colwidth', None)


### Initialize connection

Start a connection to the Wallaroo instance and save the connection into the variable `wl`.

In [None]:
# Connect to the Wallaroo instance through its external API and Keycloak
# endpoints using SSO. The client is built exactly once so that only a single
# login is performed.
#
# To log in through a local Wallaroo instance instead, use:
#   wl = wallaroo.Client()

wallarooPrefix = "doc-test"
wallarooSuffix = "wallaroocommunity.ninja"

wl = wallaroo.Client(api_endpoint=f"https://{wallarooPrefix}.api.{wallarooSuffix}", 
                    auth_endpoint=f"https://{wallarooPrefix}.keycloak.{wallarooSuffix}", 
                    auth_type="sso")

### Set Configurations

The following will set the workspace, model name, and pipeline that will be used for this example.  If the workspace or pipeline already exists, it will be assigned for use in this example.  If it does not exist, it will be created based on the names listed below.

In [None]:
# Names of the Wallaroo workspace, pipeline, and model used in this example.
workspace_name = 'moto2'
pipeline_name = 'bikedaypipe'
# NOTE(review): model_name is not referenced by any later cell visible here.
model_name = 'bikedaymodel'

## Set the Workspace and Pipeline


In [17]:
def get_workspace(name):
    """Return the workspace called `name`, creating it if it does not exist.

    Looks through `wl.list_workspaces()` for a workspace whose `name()` equals
    `name`. When none matches, a new workspace is created with
    `wl.create_workspace(name)` and returned.
    """
    for ws in wl.list_workspaces():
        if ws.name() == name:
            # Return the match directly: no need to scan the remaining
            # workspaces or to compare the result against None afterwards.
            return ws
    return wl.create_workspace(name)

def get_pipeline(name):
    """Return the first pipeline called `name`, building a new one if the lookup fails.

    `wl.pipelines_by_name(name)` is asked for the matching pipelines and the
    first one is returned. If an `EntityNotFoundError` is raised instead, a
    new pipeline is created with `wl.build_pipeline(name)`.
    """
    try:
        matches = wl.pipelines_by_name(name)
        return matches[0]
    except EntityNotFoundError:
        return wl.build_pipeline(name)

# Find (or create) the workspace and make it the current workspace of `wl`.
workspace = get_workspace(workspace_name)

wl.set_current_workspace(workspace)

# Find (or create) the pipeline; this runs after the current workspace is set.
pipeline = get_pipeline(pipeline_name)



WaitForDeployError: Deployment did not finish within 45s.
Status: None

In [21]:
# Deployment configuration that asks for four replicas, so that additional
# engines are available to run inferences
deploy_config = wallaroo.DeploymentConfigBuilder().replica_count(4).build()

# Deploy the pipeline with that configuration
pipeline.deploy(deployment_config=deploy_config)

0,1
name,bikedaypipe
created,2023-05-18 18:55:59.011989+00:00
last_updated,2023-05-18 20:15:57.744801+00:00
deployed,True
tags,
versions,"2b9c771d-6c7b-4618-ac1b-c18ac4d01f9e, d1be9e11-4c80-405b-9fde-c8d3a6737736, cc55d2e4-bee0-4865-8bcb-2c54bd21a820, 826a7a0a-449e-475e-ab01-816fcb7ff488, af2cf9d4-1861-4771-a3f0-5bbc5b139c64, 8bd2604a-ea82-4890-80d9-051c9f4e07bd, f1b8b382-5286-4c6f-b452-42c808f65801"
steps,bikedaymodel


### Run Inference
For this example, we will forecast bike rentals for the following seven days, based on the previous month's rentals (inclusive of "today").

In [22]:
# Retrieve the forecast schedule: `first_day` is used for the single forecast
# below, `analysis_days` is iterated over later to build the per-day inputs.
first_day, analysis_days = util.get_forecast_days()

print(f'Running analysis on {first_day}')

Running analysis on 2011-02-22


In [23]:
# connect to the SQL database
conn = simdb.get_db_connection()
print(f'Bike rentals table: {simdb.tablename}')

# create the query for the forecast day, print it for reference, and load
# the result into a DataFrame
query = util.mk_dt_range_query(tablename=simdb.tablename, forecast_day=first_day)
print(query)
data = pd.read_sql_query(query, conn)
# preview the first rows of the query result
data.head()

Bike rentals table: bikerentals
select cnt from bikerentals where date > DATE(DATE('2011-02-22'), '-1 month') AND date <= DATE('2011-02-22')


Unnamed: 0,cnt
0,986
1,1416
2,1985
3,506
4,431


In [24]:
# Same date range as the printed query above, with the date column included and
# limited to 5 rows, to show which day each count belongs to.
# NOTE(review): the table name and date are hard-coded here instead of using
# simdb.tablename / first_day; keep them in sync by hand.
pd.read_sql_query("select date, cnt from bikerentals where date > DATE(DATE('2011-02-22'), '-1 month') AND date <= DATE('2011-02-22') LIMIT 5", conn)

Unnamed: 0,date,cnt
0,2011-01-23,986
1,2011-01-24,1416
2,2011-01-25,1985
3,2011-01-26,506
4,2011-01-27,431


In [29]:
# send data to model for forecast: the frame is passed as a dict of column
# lists, and the first element of what infer() returns is kept

results = pipeline.infer(data.to_dict(orient='list'))[0]
results


{'forecast': [1462, 1483, 1497, 1507, 1513, 1518, 1521]}

In [32]:
# annotate with the appropriate dates (the next seven days)
resultframe = pd.DataFrame({
    'date' : util.get_forecast_dates(first_day),
    'forecast' : results['forecast']
})

# write the new data to the db table "bikeforecast"
# NOTE(review): if_exists='append' adds these rows again every time the cell is
# re-run, so the table can end up holding several forecasts for the same date.
resultframe.to_sql('bikeforecast', conn, index=False, if_exists='append')

# display the db table
query = "select date, forecast from bikeforecast"
pd.read_sql_query(query, conn)

Unnamed: 0,date,forecast
0,2011-02-23,1462
1,2011-02-24,1462
2,2011-02-25,1462
3,2011-02-26,1462
4,2011-02-27,1462
5,2011-02-28,1462
6,2011-03-01,1462
7,2011-02-23,1462
8,2011-02-24,1483
9,2011-02-25,1497


Normally here we would close the database connection and undeploy the pipeline until the next forecast run.

# Four weeks of inferences
We'll do it all in a loop, to get inferences for the entire month of March.

In [None]:
import asyncio
import httpx
import collections
import json

async def parallel_infer(pipe, dataset, timeout_secs, num_parallel):
    """ Runs inference on a list of data in parallel and returns the combined results.

        pipe:         The pipeline to use for inference
        dataset:      A list of data objects to call pipe.infer() with, e.g. dataframes
        timeout_secs: How long (in seconds) to wait for a single inference result before
                      giving up on it. Should be 2-3x the longest inference time to allow for queuing.
        num_parallel: The number of jobs to submit in parallel, should be equal to the number of replicas, or up to 2x

        Returns the results in the order of `dataset`, joined with pd.concat().
        Jobs that time out or return None are left out; an empty DataFrame is
        returned when no result is left to combine.
    """
    # Limit the concurrency so we don't run into issues with connections timing out due to
    # submitting 1000's of jobs per replica
    semaphore = asyncio.Semaphore(num_parallel)
    loop = asyncio.get_running_loop()

    print(f"Parallel inference: {len(dataset)} jobs to {pipe.name()}")

    async def infer(idx, df):
        async with semaphore:
            print(f"Submitting {idx}")
            # pipe.infer() is a regular blocking call, not a coroutine, so it is
            # handed to a worker thread; the event loop stays free to start the
            # other jobs while this one waits for its result.
            job = loop.run_in_executor(None, pipe.infer, df)
            try:
                return await asyncio.wait_for(job, timeout=timeout_secs)
            except asyncio.TimeoutError:
                # The worker thread cannot be interrupted; its late result is
                # simply discarded.
                print(f"Job #{idx}: no result within {timeout_secs}s")
                return None

    # gather() keeps the results in submission order.
    results = await asyncio.gather(*(infer(idx, df) for idx, df in enumerate(dataset)))

    completed = [r for r in results if r is not None]
    if not completed:
        # pd.concat() raises on an empty sequence, so return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(completed)

In [None]:
# get our list of items to run through: one query result frame per day in
# analysis_days

inference_data = []

for day in analysis_days:
    print(f"Current date: {day}")
    query = util.mk_dt_range_query(tablename=simdb.tablename, forecast_day=day)
    print(query)
    data = pd.read_sql_query(query, conn)
    inference_data.append(data)

    # NOTE(review): the per-day steps below are disabled, so this cell only
    # collects the input frames; it runs no inference and writes nothing to
    # the "bikeforecast" table.
    # # send data to model for forecast
    # results = pipeline.infer(data.to_dict(orient='list'))[0]
    
    # # annotate with the appropriate dates (the next seven days)
    # resultframe = pd.DataFrame({
    #     'date' : util.get_forecast_dates(day),
    #     'forecast' : results['forecast']
    # })
    
    # # write the new data to the db table "bikeforecast"
    # resultframe.to_sql('bikeforecast', conn, index=False, if_exists='append')
# show every collected input frame
display(inference_data)

On April 1st, we can compare the March forecasts to the actual rental counts.

In [None]:
# Pair every forecast dated in March 2011 with the rental count recorded for
# the same date. The LEFT JOIN keeps forecast rows that have no matching row
# in bikerentals.
query = f'''SELECT bikeforecast.date AS date, forecast, cnt AS actual
            FROM bikeforecast LEFT JOIN bikerentals
            ON bikeforecast.date = bikerentals.date
            WHERE bikeforecast.date >= DATE('2011-03-01')
            AND bikeforecast.date <  DATE('2011-04-01')
            ORDER BY 1'''

print(query)


# run the comparison query and display the result
comparison = pd.read_sql_query(query, conn)
comparison

### Undeploy the Pipeline

Close the database connection, then undeploy the pipeline to return its resources to the Wallaroo instance.

In [None]:
# Close the database connection, then undeploy the pipeline.
conn.close()
pipeline.undeploy()