# Creating and scheduling the forecasting task

In [1]:
import json
import os
import datetime

import wallaroo
from wallaroo.object import EntityNotFoundError

# used to display dataframe information without truncating
from IPython.display import display
import pandas as pd
import numpy as np

import time

# for Big Query connections
from google.cloud import bigquery
from google.oauth2 import service_account
import db_dtypes

# utility functions for creating demo queries
from resources import util

# show the full contents of each column when a DataFrame is displayed,
# instead of truncating long values
pd.set_option('display.max_colwidth', None)

#
# convenience functions
#

def get_workspace(name):
    """Return the workspace called ``name``, creating it if none exists.

    Uses the notebook-level Wallaroo client ``wl``, which must be assigned
    before this function is called.

    Args:
        name: workspace name to look up (compared against ``ws.name()``).

    Returns:
        The matching workspace. If several workspaces share the name, the
        last one returned by ``wl.list_workspaces()`` is used. If there is
        no match, a new workspace is created with ``wl.create_workspace``.
    """
    matches = [ws for ws in wl.list_workspaces() if ws.name() == name]
    if matches:
        # Decide on "found" by list emptiness rather than `== None`, which
        # would dispatch to whatever __eq__ the workspace object defines.
        return matches[-1]
    return wl.create_workspace(name)

# get a pipeline by name in the workspace
def get_pipeline(pname, create_if_absent=False):
    """Look up a pipeline by name in the current workspace.

    Uses the notebook-level Wallaroo client ``wl``.

    Args:
        pname: pipeline name to match against ``p.name()``.
        create_if_absent: when True, build a new pipeline called ``pname``
            if the workspace has none by that name.

    Returns:
        The first pipeline whose name equals ``pname``, or the result of
        ``wl.build_pipeline(pname)`` when nothing matched and
        ``create_if_absent`` is True.

    Raises:
        KeyError: nothing matched and ``create_if_absent`` is False.
    """
    workspace_pipelines = wl.get_current_workspace().pipelines()
    matches = [p for p in workspace_pipelines if p.name() == pname]

    if matches:
        return matches[0]
    if not create_if_absent:
        raise KeyError(f"pipeline {pname} not found in this workspace")
    return wl.build_pipeline(pname)

# FOR THE PURPOSES OF THE DEMO, recreate a blank staging table
def create_staging_table(bqclient, tablename):
    """Create the forecast staging table through ``bqclient``.

    The table is named ``<bqclient.project>.<tablename>`` and has three
    REQUIRED columns: ``dteday`` (DATE), ``site_id`` (STRING) and
    ``forecast`` (INTEGER). This function does not drop an existing table;
    callers in this notebook delete the table first.

    Args:
        bqclient: BigQuery client; supplies ``project`` and ``create_table``.
        tablename: table name without the project prefix.

    Returns:
        None.
    """
    required_columns = (
        ("dteday", "DATE"),
        ("site_id", "STRING"),
        ("forecast", "INTEGER"),
    )
    schema = [
        bigquery.SchemaField(column, column_type, mode="REQUIRED")
        for column, column_type in required_columns
    ]
    qualified_name = f'{bqclient.project}.{tablename}'
    bqclient.create_table(bigquery.Table(qualified_name, schema=schema))

## Preliminaries

* create and test the execution script `main.py`
* package up the script and necessary artifacts for creating the orchestration

In [2]:
# zip up the script and artifacts

import zipfile

# each entry is a path relative to the notebook's working directory;
# it is stored in the archive under that same relative path
files_to_include = [
    'main.py', # execution script
    'requirements.txt', # required if you have additional package dependencies beyond what's included in the Wallaroo environment
    'resources/util.py' # a utility package I've been using for this project ("from resources import util")
]

# archive file that is later passed to wl.upload_orchestration
zipfile_name = 'bike_orch.zip'

# mode='w' creates the archive, overwriting any existing file of the same name
with zipfile.ZipFile(zipfile_name, mode='w') as archive:
    for filename in files_to_include:
        archive.write(filename)

# verify the contents
with zipfile.ZipFile(zipfile_name, mode='r') as archive:
    archive.printdir()



File Name                                             Modified             Size
main.py                                        2023-06-20 16:52:00         5567
requirements.txt                               2023-06-14 11:57:12           66
resources/util.py                              2023-06-13 17:26:26         1767


## Create the orchestration

In [3]:
# Client used by the rest of the notebook, including the helper functions
# get_workspace and get_pipeline, which read the global `wl`.
wl = wallaroo.Client()

# name is optional, but useful
orchestration = wl.upload_orchestration(name='bike-forecast', path=f'./{zipfile_name}')

# Wait for the orchestration to finish packaging. Poll every 5 seconds, but
# stop after a deadline so a failed upload cannot hang the notebook forever.
upload_timeout = 600  # seconds
upload_deadline = time.monotonic() + upload_timeout
upload_status = orchestration.status()
while upload_status != 'ready':
    print(upload_status)
    # NOTE(review): assumes a failed packaging job reports 'error' - confirm against the SDK
    if upload_status == 'error':
        raise RuntimeError("orchestration 'bike-forecast' failed to package")
    if time.monotonic() > upload_deadline:
        raise TimeoutError(
            f"orchestration 'bike-forecast' not ready after {upload_timeout} seconds "
            f"(last status: {upload_status!r})"
        )
    time.sleep(5)
    upload_status = orchestration.status()

pending_packaging
packaging
packaging
packaging
packaging
packaging
packaging
packaging
packaging
packaging
packaging
packaging
packaging


In [4]:
# display the orchestration's details (ID, name, file name, SHA, status, timestamps)
orchestration

Field,Value
ID,29d9d3d2-21b8-4697-ae24-e6d02615f1db
Name,bike-forecast
File Name,bike_orch.zip
SHA,2a4af9526a7985d45037875574490f8f3e20b4b33362670f9b363a361c8e79c4
Status,ready
Created At,2023-20-Jun 16:57:44
Updated At,2023-20-Jun 16:58:54


In [5]:
# show the orchestration's name and id
# NOTE(review): reads the private attributes _name and _id - prefer public accessors if the SDK provides them
f'{orchestration._name}: id {orchestration._id}'

'bike-forecast: id 29d9d3d2-21b8-4697-ae24-e6d02615f1db'

In [6]:
# list the orchestrations visible to this client; the one uploaded earlier should show status 'ready'
wl.list_orchestrations()

id,name,status,filename,sha,created at,updated at
29d9d3d2-21b8-4697-ae24-e6d02615f1db,bike-forecast,ready,bike_orch.zip,2a4af9...8e79c4,2023-20-Jun 16:57:44,2023-20-Jun 16:58:54


# Use the orchestration to execute a one-off task

In [7]:
# DEMO PRELIMINARY - drop and re-create the staging table

# look up the stored connection; its details() supply both the
# service-account info and the project id used below
connection = wl.get_connection('bq-wl-dev')

bigquery_credentials = service_account.Credentials.from_service_account_info(connection.details())
bigqueryclient = bigquery.Client(
    credentials=bigquery_credentials, 
    project=connection.details()['project_id']
)

# dataset-qualified table name; create_staging_table prepends the client's project
out_tablename = 'bikerental_forecast_demo.bikeforecasts'

# drop the table if it already exists
# (not_found_ok=True: a missing table is not treated as an error)
bigqueryclient.delete_table(out_tablename, not_found_ok=True)
create_staging_table(bigqueryclient, out_tablename)

In [8]:
# arguments to pass to the task
# NOTE(review): 'output_table' is 'bikeforecast', while the staging table this
# notebook creates and queries is 'bikeforecasts' - confirm which name main.py writes to
args = {
    'conn_name': 'bq-wl-dev',
    'workspace_name' : 'bikerental-nbz',
    'pipeline_name' : 'bikeforecast-pipe',
    'dataset': 'bikerental_forecast_demo',
    'input_table' : 'bikerentals',
    'output_table': 'bikeforecast'
}


# create task
# since I have assigned defaults in the execution script, json_args is optional
task = orchestration.run_once(name="bike forecast ad-hoc", json_args=args)
# display the new task's details (ID, name, last run status, type, ...)
task

Field,Value
ID,bfdfd64e-f8a3-42ea-b46e-04a0ec82c353
Name,bike forecast ad-hoc
Last Run Status,unknown
Type,Temporary Run
Active,True
Schedule,-
Created At,2023-20-Jun 17:03:52
Updated At,2023-20-Jun 17:03:52


In [9]:
# Wait for the task to start. Poll every 5 seconds, but stop after a deadline
# so a task that never reaches "started" cannot hang the notebook forever.
task_start_timeout = 300  # seconds
task_deadline = time.monotonic() + task_start_timeout
task_status = task.status()
while task_status != "started":
    display(task_status)
    if time.monotonic() > task_deadline:
        raise TimeoutError(
            f'task did not reach "started" within {task_start_timeout} seconds '
            f'(last status: {task_status!r})'
        )
    time.sleep(5)
    task_status = task.status()

'pending'

In [10]:
# list tasks to see the ad-hoc task's last run status
wl.list_tasks()

id,name,last run status,type,active,schedule,created at,updated at
bfdfd64e-f8a3-42ea-b46e-04a0ec82c353,bike forecast ad-hoc,running,Temporary Run,True,-,2023-20-Jun 17:03:52,2023-20-Jun 17:03:58


In [13]:
# show the task's name and id
# NOTE(review): reads the private attributes _name and _id - prefer public accessors if the SDK provides them
f'{task._name}: id {task._id}'

'bike forecast ad-hoc: id bfdfd64e-f8a3-42ea-b46e-04a0ec82c353'

In [11]:
# wait 90 seconds, then look at the task run
# (a fixed pause, not a poll: the run is presumably finished by then, but this is not checked)
time.sleep(90)
task.last_runs()

task id,pod id,status,created at,updated at
bfdfd64e-f8a3-42ea-b46e-04a0ec82c353,ab3590de-0599-44a7-9bb9-91d78b3a4ec6,success,2023-20-Jun 17:03:54,2023-20-Jun 17:03:54


In [16]:
# look at the logs of the most recent run
# NOTE(review): assumes last_runs() is ordered newest-first, so index 0 is the latest run - confirm
task.last_runs()[0].logs()

ReadTimeout: timed out

In [17]:
# verify the results of the run
# out_tablename is the notebook's own constant (not user input), so it is
# interpolated directly into the SQL text
query = f'''SELECT * from {out_tablename} ORDER BY site_id, dteday'''
print(query)

# run the query and display the staging table's rows as a DataFrame
bigqueryclient.query(query).to_dataframe()

SELECT * from bikerental_forecast_demo.bikeforecasts ORDER BY site_id, dteday


Unnamed: 0,dteday,site_id,forecast
0,2011-03-02,site0001,2269
1,2011-03-03,site0001,1712
2,2011-03-04,site0001,1795
3,2011-03-05,site0001,1371
4,2011-03-06,site0001,1819
...,...,...,...
65,2011-03-04,site0010,1717
66,2011-03-05,site0010,1400
67,2011-03-06,site0010,1997
68,2011-03-07,site0010,2117


In [18]:
# list tasks again now that the run has completed
wl.list_tasks()

ReadTimeout: timed out

In [None]:
# clean the task off the list, just for neatness' sake
task.kill()
# brief pause before listing; presumably gives the kill time to show up in the task list
time.sleep(5)
wl.list_tasks()

# Scheduling a task

Here, we'll use the orchestration to schedule this task to run every minute.
Scheduling is via `cron` format. See [crontab guru](https://crontab.guru/) for help creating `cron` expressions.

In [None]:
# this time we'll do it without passing parameters, just to prove we can
# (the execution script then falls back to its own defaults)

# schedule is a cron expression; "*/1 * * * *" fires every minute
# NOTE(review): timeout=120 is presumably a per-run limit in seconds - confirm against the SDK
scheduled_task = orchestration.run_scheduled(name="bike forecast 1 min", 
                                             schedule="*/1 * * * *", 
                                             timeout=120,
                                             # json_args = args
                                            )

In [None]:
# Wait for the scheduled task to start. Poll every 5 seconds, but stop after
# a deadline so a task that never reaches "started" cannot hang the notebook.
scheduled_start_timeout = 300  # seconds
scheduled_deadline = time.monotonic() + scheduled_start_timeout
scheduled_status = scheduled_task.status()
while scheduled_status != "started":
    display(scheduled_status)
    if time.monotonic() > scheduled_deadline:
        raise TimeoutError(
            f'scheduled task did not reach "started" within {scheduled_start_timeout} seconds '
            f'(last status: {scheduled_status!r})'
        )
    time.sleep(5)
    scheduled_status = scheduled_task.status()

In [None]:
# list tasks; the scheduled task should now appear
wl.list_tasks()

In [None]:
# give it 90 seconds, then check
# (with a one-minute schedule, at least one run should have been triggered by then)
time.sleep(90)
scheduled_task.last_runs()

In [None]:
# check again; re-run this cell to watch further scheduled runs appear
scheduled_task.last_runs()

In [None]:
# stop the scheduled task so it does not keep firing every minute
scheduled_task.kill()

In [None]:
# list tasks to check the state of the scheduled task after the kill
wl.list_tasks()

### Clean up

In [19]:
# fetch the pipeline and make sure it's undeployed
# get_workspace creates the workspace if no workspace has this name
wl.set_current_workspace(get_workspace(args['workspace_name']))

# get_pipeline raises KeyError if the pipeline is not in the current workspace
pipeline = get_pipeline(args['pipeline_name'])
pipeline.undeploy()


ReadTimeout: timed out

In [20]:
# drop staging table and re-create empty
# (not_found_ok=True: a missing table is not treated as an error)
bigqueryclient.delete_table(out_tablename, not_found_ok=True)
create_staging_table(bigqueryclient, out_tablename)