# Install basic requirements

In [1]:
pip install -U whylogs pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import whylogs
import pandas as pd

# Load example data batches

The example data is loaded from our public S3 bucket. You can use your own data instead, as long as you have multiple batches of it.

In [3]:
# Read the seven demo batches (numbered 1 through 7) into a list of DataFrames,
# announcing each URL before it is fetched.
pdfs = []
for batch_number in range(1, 8):
    batch_url = f"https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_{batch_number}.csv"
    print(f"Loading data from {batch_url}")
    pdfs.append(pd.read_csv(batch_url))

Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_1.csv
Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_2.csv
Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_3.csv
Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_4.csv
Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_5.csv
Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_6.csv
Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_7.csv


In [4]:
# Summary statistics for the first loaded batch; as the cell's last expression, the result is rendered as its output.
pdfs[0].describe()

Unnamed: 0.1,Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,desc,...,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
count,407.0,407.0,0.0,407.0,407.0,407.0,407.0,407.0,407.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,12548.717445,115863100.0,,14203.746929,14203.746929,14202.948403,13.514054,418.020344,78818.956069,,...,,,,,,,,,,
std,125.354772,1207642.0,,9351.142374,9351.142374,9350.997874,5.446881,271.096531,55864.939403,,...,,,,,,,,,,
min,12325.0,112153800.0,,1000.0,1000.0,1000.0,5.32,34.22,0.0,,...,,,,,,,,,,
25%,12442.5,115076900.0,,7000.0,7000.0,7000.0,9.93,235.58,43325.0,,...,,,,,,,,,,
50%,12550.0,115700400.0,,12000.0,12000.0,12000.0,12.62,357.25,63300.0,,...,,,,,,,,,,
75%,12653.5,116824500.0,,20000.0,20000.0,20000.0,16.02,553.515,95000.0,,...,,,,,,,,,,
max,12862.0,118159200.0,,40000.0,40000.0,40000.0,30.99,1417.71,495000.0,,...,,,,,,,,,,


# Configure whylogs

`whylogs`, by default, does not send statistics to WhyLabs.

There are a few small setup steps you need to complete first. If you don't have an access key yet, please onboard with WhyLabs here: <https://hub.whylabsapp.com>.

**WhyLabs only requires the whylogs API - your raw data never leaves your premises.**

In [5]:
from whylogs.app import Session
from whylogs.app.writers import WhyLabsWriter
import os
import datetime

In [None]:
import getpass

# Collect the WhyLabs credentials interactively and expose them through the environment
# variables read later in this notebook. Values are stripped because pasted credentials
# often carry stray whitespace, and empty values are rejected right away instead of
# surfacing later as an unclear failure.

# set your org-id here
print("Enter your WhyLabs Org ID")
whylabs_org_id = input().strip()
if not whylabs_org_id:
    raise ValueError("WhyLabs Org ID must not be empty")
os.environ["WHYLABS_DEFAULT_ORG_ID"] = whylabs_org_id

# set your API key here (getpass keeps the key out of the notebook output)
print("Enter your WhyLabs API key")
whylabs_api_key = getpass.getpass().strip()
if not whylabs_api_key:
    raise ValueError("WhyLabs API key must not be empty")
os.environ["WHYLABS_API_KEY"] = whylabs_api_key

# Echo only the leading characters (labelled as the key ID), never the full key.
print("Using API Key ID: ", whylabs_api_key[0:10])

## Creating session

Once the environment variables are set, let's create a whylogs session with a WhyLabs writer.

Note that you can also add a local writer or an S3 writer here if you want. Check out the API docs for more information.

In [7]:
# Create a whylogs session whose only writer is a WhyLabs writer.
# NOTE(review): WhyLabsWriter() takes no arguments here; presumably it picks up the
# WHYLABS_* environment variables set in the previous cell - confirm against the whylogs docs.
writer = WhyLabsWriter()
session = Session(writers=[writer])

## Logging to WhyLabs

Ensure you have a **model ID** (also called **dataset ID**) before you start!

### Dataset Timestamp
* To avoid confusion, it's recommended that you use UTC
* If you don't set the `dataset_timestamp` parameter, it defaults to the current `UTC` time
* WhyLabs supports real-time visualization when the timestamp is **within the last 7 days**. Anything older than that will be picked up when we run our batch processing
* **If you log two profiles for the same day with different timestamps (12:00 vs 12:01), they are merged to the same batch**

### Logging Different Batches of Data
* We'll give the profiles different **dates**
* Create a new logger for each date. Note that the logger needs to be `closed` to flush out the data

In [None]:
print("Enter your model ID from WhyLabs:")
# Strip stray whitespace from pasted input and fail early on an empty ID.
model_id = input().strip()
if not model_id:
    raise ValueError("A WhyLabs model ID is required to log the batches")

# Take "now" exactly once. Re-reading the clock on every iteration could, if the run
# crosses midnight UTC, place two consecutive batches on the same day, and profiles
# logged for the same day are merged into a single batch.
now_utc = datetime.datetime.now(tz=datetime.timezone.utc)

reference_profile = None
for i, df in enumerate(pdfs):
    # Walk backwards one day per dataset: each dataset has to map to its own date to
    # show up as a different batch in WhyLabs.
    dt = now_utc - datetime.timedelta(days=i)

    # Create a new logger for this date; leaving the `with` block closes the logger.
    with session.logger(tags={"datasetId": model_id}, dataset_timestamp=dt) as ylog:
        print("Log data frame for ", dt)
        ylog.log_dataframe(df)
        # Keep a reference to the first profile for use as a baseline for monitoring.
        if i == 0:
            reference_profile = ylog.profile

In [9]:
# Close the session now that every batch has been logged, to ensure everything is flushed.
session.close()

We still have a reference to the first profile. For this demo, we will use this dataframe's profile and upload it as a reference profile for monitoring on WhyLabs.

In [None]:
# Alias under which the reference profile is uploaded later in this notebook. You can rename it;
# it will show up when choosing a baseline on the monitoring settings page of WhyLabs.
reference_profile_alias = "demo-reference-profile"
# Show a summary of the reference profile as the cell output.
reference_profile.to_summary()

The reference profile can be uploaded using a whylabs_client directly. First, we need to reference the profile as a file on disk, so write it out.

In [None]:
import tempfile

# Write the reference profile captured earlier to a file inside a fresh temporary
# directory, because the upload step needs it as a file on disk.
profile_path = os.path.join(tempfile.mkdtemp(), "reference-profile.bin")
reference_profile.write_protobuf(profile_path)
print(
    f"Reference profile written to temporary file in preparation to upload to Whylabs as a reference profile: {profile_path}"
)

The whylabs_client will construct a request to upload this as a reference profile, using the org-id, model-id and api-key entered above.

In [None]:
import requests
import whylabs_client
from whylabs_client.api.log_api import LogApi
from whylabs_client.model.log_reference_request import LogReferenceRequest

# Upload the reference profile in two steps: ask the WhyLabs API for an upload URL,
# then PUT the serialized profile to that URL. Any failure is reported with a printed
# message rather than a traceback.

# Seconds to wait for the upload connection/response before giving up, so a stalled
# transfer cannot hang the notebook indefinitely.
upload_timeout_seconds = 60

# Now setup some of the inputs required to make the request to upload to Whylabs using the whylabs_client
whylabs_api_endpoint = "https://api.whylabsapp.com"
api_key = os.environ["WHYLABS_API_KEY"]
print(f"Using API key ID: {api_key[:10]} and endpoint {whylabs_api_endpoint}")
config = whylabs_client.Configuration(host=whylabs_api_endpoint, api_key={"ApiKeyAuth": api_key}, discard_unknown_keys=True)
api_log_client = whylabs_client.ApiClient(config)
log_api = LogApi(api_log_client)

# Prefer the IDs recorded in the profile's tags; fall back to the environment.
org_id = reference_profile.tags.get("orgId", os.environ.get("WHYLABS_DEFAULT_ORG_ID"))
dataset_id = reference_profile.tags.get("datasetId", os.environ.get("WHYLABS_DEFAULT_DATASET_ID"))
# Convert the profile's dataset timestamp to integer epoch milliseconds for the request.
dataset_timestamp = int(reference_profile.dataset_timestamp.timestamp() * 1000)
alias = reference_profile_alias

try:
    # Catch missing identifiers before any network call is made.
    if not org_id or not dataset_id:
        raise ValueError(f"missing org ID [{org_id}] or dataset ID [{dataset_id}]")

    # Read the profile up front so a missing file is reported before an upload URL is
    # requested, and so the file handle is not held open during the network calls.
    with open(profile_path, "rb") as f:
        profile_bytes = f.read()

    request = LogReferenceRequest(dataset_timestamp=dataset_timestamp, alias=alias)
    print(f"Making initial call to log_reference to get upload url for {alias} and in [{org_id}] for [{dataset_id}] using request: {request}")
    async_result = log_api.log_reference(org_id=org_id, model_id=dataset_id, log_reference_request=request, async_req=True)
    result = async_result.get()
    upload_url = result["upload_url"]
    # Only a prefix of the upload URL is printed.
    print(f"got async_result from log_reference, upload url is: {upload_url[:140]}")
    print("About to upload reference profile...")
    http_response = requests.put(upload_url, data=profile_bytes, timeout=upload_timeout_seconds)
    if http_response.ok:
        print(f"Done uploading reference profile with alias: {alias} to: {upload_url[:140]} with API token ID: {api_key[:10]}")
    else:
        print(
            f"Failed to upload reference profile with alias: {alias} to: {upload_url[:140]} with API token ID: {api_key[:10]} to "
            + f"{whylabs_api_endpoint}: unexpected HTTP status {http_response.status_code} {http_response.reason}"
        )
except Exception as e:
    # Deliberate best-effort for the demo: report the failure and let the notebook continue.
    print(f"Failed to upload reference profile: {e}.")

## Voila

* Now check the application to see if your **statistics** are in!!
* Check the monitoring settings page: if you switch the Baseline toggle from trailing window to "Compare to reference profile", you can select the reference profile we just uploaded, which should show up with the text from our 'alias'


In [None]:
from IPython.display import Markdown, display

# Render the monitor-settings URL for the dataset used above as Markdown.
monitor_settings_url = f"https://hub.whylabsapp.com/models/{dataset_id}/monitor-settings"
display(Markdown(f"url here: {monitor_settings_url}"))