# Tabular data example

In [1]:
from velour.client import Client, Dataset, Model, ClientException
from velour.enums import TaskType
from velour import schemas

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# `Client` is already imported in the first cell, so it is not imported again here.
# Connect to the velour service at the given URL.
client = Client("http://localhost:8000")

Succesfully connected to http://localhost:8000/.


In [3]:
# Fixed seed so the train/test split (and every metric derived from it) is
# reproducible across "Restart Kernel -> Run All".
# NOTE: the outputs recorded in this notebook came from an unseeded run, so
# re-running will produce slightly different numbers than the ones shown.
SEED = 42

# Load the breast-cancer dataset and pull out features, targets and class names.
dset = load_breast_cancer()
X, y, target_names = dset["data"], dset["target"], dset["target_names"]

# Split into train and test partitions (default split proportions).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)

# Pipeline that standardizes the features and then applies logistic regression.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [4]:
# Quick look at the training split: feature-matrix shape, first four targets, class names.
X_train.shape, y_train[:4], target_names

((426, 30), array([1, 1, 1, 0]), array(['malignant', 'benign'], dtype='<U9'))

In [5]:
# Fit the whole pipeline on the training split.
pipe.fit(X_train, y_train)

In [6]:
# Class-probability estimates for the train split, then the test split.
y_train_probs, y_test_probs = (
    pipe.predict_proba(features) for features in (X_train, X_test)
)

In [7]:
# Peek at the first four rows of predicted probabilities (one column per class).
y_train_probs[:4]

array([[1.55129176e-03, 9.98448708e-01],
       [4.11697541e-01, 5.88302459e-01],
       [8.91709791e-04, 9.99108290e-01],
       [9.99923921e-01, 7.60793088e-05]])

## Velour Dataset ingestion

We now ingest the groundtruth labels into velour. For each sample, velour expects a list of `Label` objects. Each `Label` has a key and a value. Allowing key/value labels, and allowing a single row to be annotated with multiple labels, supports multi-label classification.

In this example there is just a single label per element, and we set the label key to "class". The `add_groundtruth` method returns the ids of the newly created groundtruth.

Create datasets

In [8]:
TRAIN_DATASET_NAME = "breast-cancer-train"
TEST_DATASET_NAME = "breast-cancer-test"


def _create_or_get_dataset(name):
    """Create the velour dataset `name`; if creation raises ClientException, fetch it instead."""
    try:
        return Dataset.create(client, name)
    except ClientException:
        return Dataset.get(client, name)


# reset (only needed if restarting each run)
for dataset_name in (TRAIN_DATASET_NAME, TEST_DATASET_NAME):
    client.delete_dataset(dataset_name)

# create or get the train dataset, then the test dataset
velour_train_dataset = _create_or_get_dataset(TRAIN_DATASET_NAME)
velour_test_dataset = _create_or_get_dataset(TEST_DATASET_NAME)

Format data

In [9]:
def _as_groundtruth(uid, class_index):
    """Build a single-label classification `GroundTruth` for the datum `uid`.

    `class_index` is looked up in `target_names` to obtain the label value;
    the label key is always "class".
    """
    datum = schemas.Datum(uid=uid)
    annotation = schemas.Annotation(
        task_type=TaskType.CLASSIFICATION,
        labels=[schemas.Label(key="class", value=target_names[class_index])],
    )
    return schemas.GroundTruth(datum=datum, annotations=[annotation])


# format training groundtruths (uids: "train0", "train1", ...)
training_groundtruths = [
    _as_groundtruth(f"train{row}", target) for row, target in enumerate(y_train)
]

# format testing groundtruths (uids: "test0", "test1", ...)
testing_groundtruths = [
    _as_groundtruth(f"test{row}", target) for row, target in enumerate(y_test)
]

Ingest data

In [10]:
# Add the training groundtruths first, then the testing groundtruths,
# each to its own velour dataset.
for target_dataset, groundtruths in (
    (velour_train_dataset, training_groundtruths),
    (velour_test_dataset, testing_groundtruths),
):
    for groundtruth in groundtruths:
        target_dataset.add_groundtruth(groundtruth)

Finalize the datasets (necessary for evaluation)

In [11]:
# Finalize both datasets; the value returned by the last call is what the cell displays.
velour_train_dataset.finalize()
velour_test_dataset.finalize()

<Response [200]>

## Model inference ingestion

Now we create a velour model and post the predictions on the two datasets. Each prediction should be a list of `ScoredLabel` objects, each of which consists of a label and a confidence score. The confidence scores over all of the classes for a given key must sum to (approximately) 1.

Create model

In [12]:
MODEL_NAME = "breast-cancer-linear-model"

# reset (only necessary if restarting)
client.delete_model(MODEL_NAME)

# create the model; if creation raises ClientException, fetch the existing one
try:
    velour_model = Model.create(client, MODEL_NAME)
except ClientException:
    velour_model = Model.get(client, MODEL_NAME)

Format predictions

In [13]:
def _as_prediction(dataset_name, uid, class_probs):
    """Build a classification `Prediction` for the datum `uid` of `dataset_name`.

    Every entry of `class_probs` becomes one `ScoredLabel`: its position is
    looked up in `target_names` for the label value (key "class") and the
    entry itself is used as the score.
    """
    datum = schemas.Datum(dataset=dataset_name, uid=uid)
    scored_labels = [
        schemas.ScoredLabel(
            label=schemas.Label(key="class", value=target_names[class_index]),
            score=class_prob,
        )
        for class_index, class_prob in enumerate(class_probs)
    ]
    annotation = schemas.ScoredAnnotation(
        task_type=TaskType.CLASSIFICATION,
        scored_labels=scored_labels,
    )
    return schemas.Prediction(datum=datum, annotations=[annotation])


# Predictions on the training dataset (uids match the "train{i}" groundtruth uids).
training_predictions = [
    _as_prediction(velour_train_dataset.name, f"train{row}", class_probs)
    for row, class_probs in enumerate(y_train_probs)
]

# Predictions on the testing dataset (uids match the "test{i}" groundtruth uids).
testing_predictions = [
    _as_prediction(velour_test_dataset.name, f"test{row}", class_probs)
    for row, class_probs in enumerate(y_test_probs)
]

In [14]:
# Upload every prediction to the velour model.
# The loop variable is named `prediction` rather than `pd`: loop variables leak
# into the notebook namespace, and `pd` would shadow the conventional pandas alias.

# add the train predictions
for prediction in training_predictions:
    velour_model.add_prediction(prediction)

# add the test predictions
for prediction in testing_predictions:
    velour_model.add_prediction(prediction)

Finalize the model's inferences on each dataset (necessary for evaluation)

In [15]:
# Finalize the model's inferences against the train dataset, then the test dataset.
velour_model.finalize_inferences(velour_train_dataset)
velour_model.finalize_inferences(velour_test_dataset)

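Evaluate the model on each dataset. Note: the `ROCAUC` values in the recorded outputs below (about 0.57 on train and 0.06 on test) are inconsistent with the near-perfect accuracy, precision, and recall, so treat them as suspect until the ROCAUC computation has been verified.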

In [16]:
# Start a classification evaluation on the training dataset, then wait for the job to complete.
train_eval_job = velour_model.evaluate_classification(velour_train_dataset)
train_eval_job.wait_for_completion()

In [17]:
# Check the status of the training evaluation job.
train_eval_job.status

<JobStatus.DONE: 'done'>

In [18]:
# Metrics computed by the training evaluation job.
train_eval_job.metrics

[{'type': 'Accuracy',
  'parameters': {'label_key': 'class'},
  'value': 0.9882629107981221},
 {'type': 'ROCAUC',
  'parameters': {'label_key': 'class'},
  'value': 0.5735161989324032},
 {'type': 'Precision',
  'value': 0.9940119760479041,
  'label': {'key': 'class', 'value': 'malignant'}},
 {'type': 'Recall',
  'value': 0.9764705882352941,
  'label': {'key': 'class', 'value': 'malignant'}},
 {'type': 'F1',
  'value': 0.9851632047477744,
  'label': {'key': 'class', 'value': 'malignant'}},
 {'type': 'Precision',
  'value': 0.9845559845559846,
  'label': {'key': 'class', 'value': 'benign'}},
 {'type': 'Recall',
  'value': 0.99609375,
  'label': {'key': 'class', 'value': 'benign'}},
 {'type': 'F1',
  'value': 0.9902912621359223,
  'label': {'key': 'class', 'value': 'benign'}}]

In [19]:
# Confusion-matrix entries for the training evaluation.
train_eval_job.confusion_matrices

[{'label_key': 'class',
  'entries': [{'prediction': 'benign', 'groundtruth': 'benign', 'count': 255},
   {'prediction': 'benign', 'groundtruth': 'malignant', 'count': 4},
   {'prediction': 'malignant', 'groundtruth': 'benign', 'count': 1},
   {'prediction': 'malignant', 'groundtruth': 'malignant', 'count': 166}]}]

In [20]:
# Start a classification evaluation on the testing dataset, then wait for the job to complete.
test_eval_job = velour_model.evaluate_classification(velour_test_dataset)
test_eval_job.wait_for_completion()

In [21]:
# Metrics computed by the testing evaluation job.
test_eval_job.metrics

[{'type': 'Accuracy',
  'parameters': {'label_key': 'class'},
  'value': 0.986013986013986},
 {'type': 'ROCAUC',
  'parameters': {'label_key': 'class'},
  'value': 0.055797790814439044},
 {'type': 'Precision',
  'value': 1.0,
  'label': {'key': 'class', 'value': 'malignant'}},
 {'type': 'Recall',
  'value': 0.9523809523809523,
  'label': {'key': 'class', 'value': 'malignant'}},
 {'type': 'F1',
  'value': 0.975609756097561,
  'label': {'key': 'class', 'value': 'malignant'}},
 {'type': 'Precision',
  'value': 0.9805825242718447,
  'label': {'key': 'class', 'value': 'benign'}},
 {'type': 'Recall',
  'value': 1.0,
  'label': {'key': 'class', 'value': 'benign'}},
 {'type': 'F1',
  'value': 0.9901960784313726,
  'label': {'key': 'class', 'value': 'benign'}}]

In [22]:
# Confusion-matrix entries for the testing evaluation.
test_eval_job.confusion_matrices

[{'label_key': 'class',
  'entries': [{'prediction': 'benign', 'groundtruth': 'benign', 'count': 101},
   {'prediction': 'benign', 'groundtruth': 'malignant', 'count': 2},
   {'prediction': 'malignant', 'groundtruth': 'malignant', 'count': 40}]}]

Evaluation metrics in a pandas DataFrame

In [23]:
# Fetch the model's metrics; the result is indexed below as a sequence of
# entries with "settings" and "df" keys.
settings_and_dfs = velour_model.get_metric_dataframes()

In [24]:
# Unpack the first entry: its evaluation settings and its metrics dataframe.
first_result = settings_and_dfs[0]
settings, df = first_result["settings"], first_result["df"]

In [25]:
# Show the evaluation settings associated with the first metrics dataframe.
print(settings)

{'model': 'breast-cancer-linear-model', 'dataset': 'breast-cancer-train', 'gt_type': 'none', 'pd_type': 'none', 'task_type': 'classification', 'id': 1}


In [26]:
# Display the metrics dataframe for the first evaluation.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
Unnamed: 0_level_1,Unnamed: 1_level_1,dataset,breast-cancer-train
type,parameters,label,Unnamed: 3_level_2
Accuracy,"{""label_key"": ""class""}",,0.988263
F1,"""n/a""",class: benign,0.990291
F1,"""n/a""",class: malignant,0.985163
Precision,"""n/a""",class: benign,0.984556
Precision,"""n/a""",class: malignant,0.994012
ROCAUC,"{""label_key"": ""class""}",,0.573516
Recall,"""n/a""",class: benign,0.996094
Recall,"""n/a""",class: malignant,0.976471


## Sanity check against scikit-learn's classification report

In [27]:
from sklearn.metrics import classification_report

In [28]:
# Hard class predictions on the training split, for comparison with the velour metrics.
y_train_preds = pipe.predict(X_train)

In [29]:
# scikit-learn's per-class precision/recall/F1 on the training split, printed
# with six digits so it can be compared against the velour metrics above.
train_report = classification_report(
    y_train,
    y_train_preds,
    digits=6,
    target_names=target_names,
)
print(train_report)

              precision    recall  f1-score   support

   malignant   0.994012  0.976471  0.985163       170
      benign   0.984556  0.996094  0.990291       256

    accuracy                       0.988263       426
   macro avg   0.989284  0.986282  0.987727       426
weighted avg   0.988330  0.988263  0.988245       426

