# Create connection

Create a BigQuery (BQ) connection and add it to Wallaroo.

In [1]:
import wallaroo
from wallaroo.object import EntityNotFoundError, RequiredAttributeMissing

# to display dataframe tables
from IPython.display import display
# used to display dataframe information without truncating
import pandas as pd
pd.set_option('display.max_colwidth', None)
import pyarrow as pa

import time
import json

# for Big Query connections
from google.cloud import bigquery
from google.oauth2 import service_account
import db_dtypes


In [2]:
# display the version string of the imported wallaroo package
wallaroo.__version__

'2023.2.0+dfca0605e'

### Connect to the Wallaroo Instance


In [3]:
# Log in through the local Wallaroo instance; `wl` is the client used by the cells below

wl = wallaroo.Client()

Please log into the following URL in a web browser:

	https://product-uat-ee.keycloak.wallaroocommunity.ninja/auth/realms/master/device?user_code=YDJS-TKLA

Login successful!


## Variable Declaration

The following variables will be used for our big query testing.  

We'll create and use the connection `bq_connection`

Note that for the connection arguments, we'll retrieve the information from the file `bigquery_service_account.json`, which includes the [service account key file (SAK)](https://cloud.google.com/bigquery/docs/authentication/service-account-file) information.

| Field | Included in SAK | 
|---|---|
| type | √ | 
| project_id | √ |
| private_key_id | √ |
| private_key | √ |
| client_email | √ |
| auth_uri | √ |
| token_uri | √ |
| auth_provider_x509_cert_url | √ |
| client_x509_cert_url | √ |
| database | 🚫 |
| table | 🚫 |


### Helper Method

The following helper method will create a new connection, or return it if it already exists

In [4]:
# helper methods
def get_connection(name, connection_type, connection_arguments):
    """Return the connection called `name`, creating it when the lookup fails.

    Args:
        name: name passed to `wl.get_connection` and, on fallback, to
            `wl.create_connection`.
        connection_type: type passed to `wl.create_connection` on fallback.
        connection_arguments: details passed to `wl.create_connection` on
            fallback.

    Returns:
        The result of `wl.get_connection(name)`, or of
        `wl.create_connection(...)` if the lookup raised
        `RequiredAttributeMissing`.
    """
    try:
        return wl.get_connection(name)
    except RequiredAttributeMissing:
        # the lookup raised, so fall back to creating the connection
        return wl.create_connection(name, connection_type, connection_arguments)


## Create Connections

We will create the data source connection via the Wallaroo client command `create_connection`.

Connections are created with the Wallaroo client command [`create_connection`](https://staging.docs.wallaroo.ai/wallaroo-developer-guides/wallaroo-sdk-guides/wallaroo-sdk-essentials-guide/wallaroo-sdk-essentials-ml-workload-orchestration/#create-orchestration) with the following parameters.

| Parameter | Type | Description |
| --- | --- | ---|
| **name** | string (Required) | The name of the connection. This must be unique - **if submitting the name of an existing** connection it will return an error. |
| **type** | string (Required) | The user defined type of connection. |
| **details** | Dict (Required) | User defined configuration details for the data connection.  These can be `{'username':'dataperson', 'password':'datapassword', 'port': 3339}`, or `{'token':'abcde123==', 'host':'example.com', 'port': 1234}`, or other user defined combinations.  |

* **IMPORTANT NOTE**:  Data connections names **must** be unique.  Attempting to create a data connection with the same `name` as an existing data connection will result in an error.

See the `statsmodel_forecast_inputs` and `statsmodel_forecast_outputs` details listed above for the table schema used for our example.


In [5]:
bigquery_connection_name = 'bq-wl-dev'  # name of the connection in wallaroo. must be unique
bigquery_connection_type = "BIGQUERY"   # connection type (info only)

# creds and other important info, read from the service account key file;
# the `with` block closes the file handle as soon as the JSON is parsed
with open('./resources/bigquery_service_account.json') as sak_file:
    bigquery_connection_details = json.load(sak_file)

# the convenience command to get or create the connection
connection = get_connection(bigquery_connection_name, bigquery_connection_type, bigquery_connection_details)

display(connection)

Field,Value
Name,bq-wl-dev
Connection Type,BIGQUERY
Details,*****
Created At,2023-06-20T16:44:02.933475+00:00
Linked Workspaces,[]


In [6]:
# show which fields the connection details contain (keys only, no values)
connection.details().keys()

dict_keys(['type', 'auth_uri', 'client_id', 'token_uri', 'project_id', 'private_key', 'client_email', 'private_key_id', 'universe_domain', 'client_x509_cert_url', 'auth_provider_x509_cert_url'])

In [7]:
# look at the project id stored in the connection details
connection.details()['project_id']

'wallaroo-dev-253816'

In [8]:
# list the connections reported by the client; a single connection can be
# fetched by name with wl.get_connection(bigquery_connection_name)
wl.list_connections()

name,connection type,details,created at,linked workspaces
bigqueryhouseinputs,BIGQUERY,*****,2023-05-23T14:35:26.896064+00:00,['bigqueryworkspace']
bigqueryhouseoutputs,BIGQUERY,*****,2023-05-23T14:35:26.932685+00:00,['bigqueryworkspace']
bigqueryhouseinputs-jcw,BIGQUERY,*****,2023-05-23T14:37:22.103147+00:00,['bigqueryworkspace-jcw']
bigqueryhouseoutputs-jcw,BIGQUERY,*****,2023-05-23T14:37:22.141179+00:00,['bigqueryworkspace-jcw']
bigqueryhouseinputs{suffix},BIGQUERY,*****,2023-05-23T15:05:29.850628+00:00,"['bigqueryworkspacekbcy', 'bigqueryworkspacechbp']"
bigqueryhouseoutputs{suffix},BIGQUERY,*****,2023-05-23T15:05:30.298941+00:00,"['bigqueryworkspacekbcy', 'bigqueryworkspacechbp']"
bigqueryforecastinputsrklr,BIGQUERY,*****,2023-05-23T15:05:58.206726+00:00,['bigquerystatsmodelworkspacerklr']
bigqueryforecastoutputsrklr,BIGQUERY,*****,2023-05-23T15:05:58.673528+00:00,['bigquerystatsmodelworkspacerklr']
bigqueryforecastinputsgztp,BIGQUERY,*****,2023-05-23T15:19:58.159691+00:00,['bigquerystatsmodelworkspacegztp']
bigqueryforecastoutputsgztp,BIGQUERY,*****,2023-05-23T15:19:58.195628+00:00,['bigquerystatsmodelworkspacegztp']


## Test connection

In [9]:
# For this test, this dataset and table are assumed to exist already

dataset = "bikerental_forecast_demo" # the schema
table = "bikerentals" # the table queried in the cells below

In [10]:
# we've already gotten the connection above

# fetch the connection details once so the credentials and the project id
# are both read from the same dictionary
connection_details = connection.details()

# set the credentials
bigquery_credentials = service_account.Credentials.from_service_account_info(connection_details)

# start the client
bigqueryclient = bigquery.Client(
    credentials=bigquery_credentials,
    project=connection_details['project_id']
)

In [11]:
# dataset-qualified table name; reused by the later distinct-site query
tablename = f'{dataset}.{table}'
# preview query, capped at five rows
sql = f'select * from {tablename} limit 5'
print(sql)

# run the query and convert the result with to_dataframe() for display
df = bigqueryclient.query(sql).to_dataframe()
df

select * from bikerental_forecast_demo.bikerentals limit 5


Unnamed: 0,dteday,season,holiday,weekday,workingday,site_id,cnt
0,2011-01-02,1,0,0,0,site0001,819
1,2011-01-09,1,0,0,0,site0001,1002
2,2011-01-16,1,0,0,0,site0001,677
3,2011-01-23,1,0,0,0,site0001,778
4,2011-01-30,1,0,0,0,site0001,667


In [12]:
# information for later: the distinct site_id values present in the table
sites = bigqueryclient.query(f"select distinct site_id from {tablename}").to_dataframe()
sites

Unnamed: 0,site_id
0,site0001
1,site0002
2,site0003
3,site0004
4,site0005
5,site0006
6,site0007
7,site0008
8,site0009
9,site0010


## Attach the connection to a workspace


In [13]:
def get_workspace(name):
    """Return the workspace called `name`, creating it when none is listed.

    Args:
        name: workspace name compared against `ws.name()` for every
            workspace returned by `wl.list_workspaces()`.

    Returns:
        The last listed workspace whose name equals `name`, or the result of
        `wl.create_workspace(name)` when no listed workspace matches.
    """
    matches = [ws for ws in wl.list_workspaces() if ws.name() == name]
    if matches:
        # several matches: keep the last one, as the original loop did
        return matches[-1]
    # nothing matched; testing the list avoids comparing an object to None with `==`
    return wl.create_workspace(name)

workspace = get_workspace('bikerental-nbz')

# attach the connection by the name variable set when it was created, rather
# than repeating the literal, so the two cannot drift apart
workspace.add_connection(bigquery_connection_name)
workspace.list_connections()

name,connection type,details,created at,linked workspaces
bq-wl-dev,BIGQUERY,*****,2023-06-20T16:44:02.933475+00:00,['bikerental-nbz']


In [15]:
# close the BigQuery client now that the test queries are done
bigqueryclient.close()