# Squirro machine learning service walkthrough

## Settings

In [107]:
CLUSTER='http://localhost'
TOKEN=''
PROJECT_ID='fam6AkhDQUKyvmHq57IDnQ'

## Get Squirro client

In [108]:
from squirro_client import SquirroClient
client = SquirroClient(client_id=None, client_secret=None, cluster=CLUSTER)
client.authenticate(refresh_token=TOKEN)

## Create machine learning workflow

The workflow defines:
    - what data you'd like to train and infer on
    - the keywords you'll use as features and labels
    - what normalization/filtering/tokenization steps are required to manipulate the text
    - what models you want to use and their associated hyperparameters

In [109]:
ml_workflow = {
  "dataset": {
    "train": {"query_string": "dataset:train (label:sci.space OR label:soc.religion.christian OR label:alt.atheism)"},
    "process": {"query_string": "dataset:test (label:sci.space OR label:soc.religion.christian OR label:alt.atheism)"}
  },
  "analyzer": {
    "type": "classification",
    "tag_field": "keywords.pred_label",
    "label_field": "keywords.label"
  },
  "pipeline": [{
    "step": "loader",
    "type": "squirro_query",
    "fields": ["body", "title", "keywords.label"]
  },{
    "step": "filter",
    "type": "empty",
    "fields": ["body", "title", "keywords.label"]
  },{
    "step": "filter",
    "type": "join",
    "input_field": "keywords.label",
    "output_field": "keywords.label"
  },{
    "step": "filter",
    "type": "merge",
    "input_fields": ["body", "title"],
    "output_field": "text"
  },{
    "step": "normalizers",
    "types": ["html", "punctuation", "lowercase", "character"],
    "fields": ["text"]
  },{
    "step": "tokenizer",
    "type": "spaces",
    "fields": ["text"]
  },{
    "step": "embedder",
    "type": "dictionary",
    "batch_size": 1024,
    "input_field": "text",
    "output_field": "indexed_text"
  },{
    "step": "checkpoint",
    "type": "disk",
    "do_randomize": True,
    "batch_size": 1
  },{
    "step": "classifier",
    "type": "cnn_seq2one",
    "batch_size": 1024,
    "dict_name": "dictionary",
    "dropout_fraction": 0.5,
    "embedding_dim": 50,
    "explanation_field": "explanantion",
    "input_field": "indexed_text",
    "label_field": "keywords.label",
    "labels": ["soc.religion.christian", "alt.atheism", "sci.space"],
    "max_sequence_length": 1000,
    "mini_batch_size": 64,
    "n_epochs": 10,
    "output_field": "keywords.pred_label"
  },{
    "step": "saver",
    "type": "squirro_item",
    "batch_size": 1000,
    "fields": ["keywords.pred_label"]
  }]
}

## Upload workflow

This hands the workflow configuration (and any local pre-trained models) to the Squirro ML service.

In [110]:
print len(client.get_machinelearning_workflows(PROJECT_ID).get('machinelearning_workflows'))

0


In [111]:
ml_workflow_id = client.new_machinelearning_workflow(
    PROJECT_ID, name='e2e_cnn', config=ml_workflow).get('id')

## Create training job

We now tell Squirro that we want a training job for the ML workflow we just uploaded. This will train the models we defined. If nothing is in the queue, it should start immediately.

In [112]:
print len(client.get_machinelearning_jobs(
    PROJECT_ID, ml_workflow_id=ml_workflow_id).get('machinelearning_jobs'))

0


In [113]:
training_job_id = client.new_machinelearning_job(
    PROJECT_ID, ml_workflow_id=ml_workflow_id, type='training').get('id')

In [114]:
import time
def wait_for_ml_job(project_id, ml_workflow_id, ml_job_id):
    """Wait for ML job to finish"""
    while True:
        job = client.get_machinelearning_job(
            project_id, ml_workflow_id, ml_job_id).get('machinelearning_job')
        if job.get('last_error_at') is not None or job.get('last_success_at') is not None:
            print job
            break
        time.sleep(1)
    return job

In [115]:
print wait_for_ml_job(PROJECT_ID, ml_workflow_id, training_job_id)

{u'total_runs': 1, u'next_run_time_at': u'2018-05-25T13:50:36', u'healthy': True, u'last_error': None, u'created_at': u'2018-05-24T08:37:05', u'modified_at': u'2018-05-24T08:46:13', u'last_error_at': None, u'ml_workflow_id': u'PF4pK76tSIKEW1Tj1-M6Fg', u'last_success_at': u'2018-05-24T08:46:13', u'type': u'training', u'id': u'FMViW1UeSQ-b6-NO6poFXQ', u'error_count': 0}
{u'total_runs': 1, u'next_run_time_at': u'2018-05-25T13:50:36', u'healthy': True, u'last_error': None, u'created_at': u'2018-05-24T08:37:05', u'modified_at': u'2018-05-24T08:46:13', u'last_error_at': None, u'ml_workflow_id': u'PF4pK76tSIKEW1Tj1-M6Fg', u'last_success_at': u'2018-05-24T08:46:13', u'type': u'training', u'id': u'FMViW1UeSQ-b6-NO6poFXQ', u'error_count': 0}


## Test on unlabeled data

Just to make sure the model training is succesful, we try a synchronous inference on a few test items. This `run_machinelearning_workflow` is the same command used in the ML service pipelet: https://github.com/squirro/delivery/tree/master/templates/pipelets/machinelearning.

In [116]:
items = client.query(
    PROJECT_ID,
    query="dataset:test (label:sci.space OR label:soc.religion.christian OR label:alt.atheism)",
    fields=["body", "title", "keywords"],
    count=3
).get('items')
print(len(items))

3


In [117]:
items = client.run_machinelearning_workflow(
    PROJECT_ID, ml_workflow_id=ml_workflow_id, data={'items': items}).get('items')
for item in items:
    print item.get('keywords').get('label'), item.get('keywords').get('pred_label')

[u'soc.religion.christian'] [{u'soc.religion.christian': 0.9998042583}, {u'sci.space': 0.0}, {u'alt.atheism': 0.0001957256}]
[u'soc.religion.christian'] [{u'soc.religion.christian': 1.0}, {u'sci.space': 0.0}, {u'alt.atheism': 8.9e-09}]
[u'soc.religion.christian'] [{u'soc.religion.christian': 0.0295527279}, {u'sci.space': 0.0005727312}, {u'alt.atheism': 0.9698745608}]


## Create inference job

For regularly changing datasets, it is advantageous to set up scheduled inference jobs. These will run asynchronously to free up ingestion. Again, if nothing is in the queue, these should run immediately.

In [118]:
print len(client.get_machinelearning_jobs(PROJECT_ID, ml_workflow_id=ml_workflow_id))

1


In [119]:
inference_job_id = client.new_machinelearning_job(
    PROJECT_ID, ml_workflow_id=ml_workflow_id, type='inference').get('id')

In [120]:
print wait_for_ml_job(PROJECT_ID, ml_workflow_id, inference_job_id)

{u'total_runs': 1, u'next_run_time_at': u'2018-05-25T08:37:35', u'healthy': True, u'last_error': None, u'created_at': u'2018-05-24T08:46:21', u'modified_at': u'2018-05-24T08:46:28', u'last_error_at': None, u'ml_workflow_id': u'PF4pK76tSIKEW1Tj1-M6Fg', u'last_success_at': u'2018-05-24T08:46:28', u'type': u'inference', u'id': u',hSQyjQhQPeq5Bw6o2_PxQ', u'error_count': 0}
{u'total_runs': 1, u'next_run_time_at': u'2018-05-25T08:37:35', u'healthy': True, u'last_error': None, u'created_at': u'2018-05-24T08:46:21', u'modified_at': u'2018-05-24T08:46:28', u'last_error_at': None, u'ml_workflow_id': u'PF4pK76tSIKEW1Tj1-M6Fg', u'last_success_at': u'2018-05-24T08:46:28', u'type': u'inference', u'id': u',hSQyjQhQPeq5Bw6o2_PxQ', u'error_count': 0}


## Analyze training

When training any model, it is important to iteratively check how well your models are doing. In the future, this functionality will be largely in the GUI. For now, however, we can check via libNLP (see libNLP walkthrough).

# Gotchyas

- multi-node training jobs not currently working
- underfitting
- overfitting
- hyperparameter tuning
- make sure documents make it through the pipeline
- know your baselines
- custom steps

## reset

In [121]:
for ml_workflow in client.get_machinelearning_workflows(PROJECT_ID).get('machinelearning_workflows'):
    client.delete_machinelearning_workflow(PROJECT_ID, ml_workflow_id=ml_workflow.get('id'))