In [1]:
%matplotlib inline

# Demo of Embeddings workflow functionality 

This notebook will briefly cover how to run `Embedding` workflows. 

For more information please [read the docs](https://docs.openprotein.ai/).

In [2]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import time
import json
import pandas as pd
import seaborn as sns 
sns.set() 

from AWSTools.Batchtools.batch_utils import fakeseq # Used for creating fake protein sequences for testing


## Setup

Connect to the OpenProtein backend with your credentials:

In [3]:
import openprotein

# Load the JSON credentials file; the relative path is resolved against the
# notebook's working directory.
with open('../../../secrets.config', 'r') as secrets_file:
    config = json.load(secrets_file)

# Open an authenticated session; it is reused by the cells that follow.
session = openprotein.connect(
    username=config['username'],
    password=config['password'],
)

## Model metadata 

You can list the available models and fetch their metadata for more information (including publications and DOIs where available):

In [4]:
# List the embedding models exposed through the session.
session.embedding.list_models()

[esm1b_t33_650M_UR50S,
 esm1v_t33_650M_UR90S_1,
 esm1v_t33_650M_UR90S_2,
 esm1v_t33_650M_UR90S_3,
 esm1v_t33_650M_UR90S_4,
 esm1v_t33_650M_UR90S_5,
 esm2_t12_35M_UR50D,
 esm2_t30_150M_UR50D,
 esm2_t33_650M_UR50D,
 esm2_t36_3B_UR50D,
 esm2_t6_8M_UR50D,
 prot-seq,
 rotaprot-large-uniref50w,
 rotaprot-large-uniref90-ft,
 test-model]

You can view more information on each model:

In [5]:
# Take the first model returned by the listing and keep it for the cells below.
esm_model = session.embedding.list_models()[0]
# Show only the 'description' entry of the model's metadata dictionary.
esm_model.metadata.dict()['description']

{'citation_title': 'Biological Structure and Function Emerge from Scaling Unsupervised Learning to 250 Million Protein Sequences',
 'doi': '10.1101/622803',
 'summary': 'ESM1b model with 650M parameters'}

There's data available on supported tokens and outputs too:

In [6]:
# Display the model's complete metadata dictionary.
esm_model.metadata.dict()

{'model_id': 'esm1b_t33_650M_UR50S',
 'description': {'citation_title': 'Biological Structure and Function Emerge from Scaling Unsupervised Learning to 250 Million Protein Sequences',
  'doi': '10.1101/622803',
  'summary': 'ESM1b model with 650M parameters'},
 'max_sequence_length': 1022,
 'dimension': 1280,
 'output_types': ['attn', 'embed', 'logits'],
 'input_tokens': ['A',
  'R',
  'N',
  'D',
  'C',
  'Q',
  'E',
  'G',
  'H',
  'I',
  'L',
  'K',
  'M',
  'F',
  'P',
  'S',
  'T',
  'W',
  'Y',
  'V',
  'X',
  'O',
  'U',
  'B',
  'Z'],
 'output_tokens': ['<cls>',
  '<pad>',
  '<eos>',
  '<unk>',
  'L',
  'A',
  'G',
  'V',
  'S',
  'E',
  'R',
  'T',
  'I',
  'D',
  'P',
  'K',
  'Q',
  'N',
  'F',
  'Y',
  'M',
  'H',
  'W',
  'C',
  '<null_0>',
  'B',
  'U',
  'Z',
  'O',
  '.',
  '-',
  '<null_1>',
  'X'],
 'token_descriptions': [[{'id': 0,
    'token': '<cls>',
    'primary': True,
    'description': 'Start token'}],
  [{'id': 1,
    'token': '<pad>',
    'primary': True,
  

## Making requests

We can make embedding requests from the model directly or from the API:

In [7]:
# Dummy input: a single short sequence, supplied as bytes.
sequences = [b"AAAAPLHLALA"]

In [8]:

# Submit the embedding request through the model object itself.
esm_job = esm_model.embed(sequences=sequences)
# Inspect the job record of the submitted request.
esm_job.job

Job(status=<JobStatus.PENDING: 'PENDING'>, job_id='f304e749-6d36-4a8b-85df-4206acfaf50b', job_type='/embeddings/embed_reduced', created_date=datetime.datetime(2023, 7, 28, 2, 57, 16, 347282, tzinfo=datetime.timezone.utc), start_date=None, end_date=None, prerequisite_job_id=None, progress_message=None, progress_counter=0, num_records=1)

In [9]:
# Issue the request through the session instead, naming the model by its identifier.
embedjob = session.embedding.embed(
    model="esm1b_t33_650M_UR50S",
    sequences=sequences,
)
embedjob.job

Job(status=<JobStatus.PENDING: 'PENDING'>, job_id='84ed7e57-20ab-469e-bbca-0560bd294c30', job_type='/embeddings/embed_reduced', created_date=datetime.datetime(2023, 7, 28, 2, 57, 16, 361915, tzinfo=datetime.timezone.utc), start_date=None, end_date=None, prerequisite_job_id=None, progress_message=None, progress_counter=0, num_records=1)

## Getting results

You can get the results with `wait()`, which waits for the job to complete and then returns them:

In [10]:
# Wait on the session-submitted job and keep whatever wait() returns.
results = embedjob.wait(verbose=True) # wait for results

Waiting: 100%|██████████| 100/100 [00:00<00:00, 7605.54it/s, status=SUCCESS]
Retrieving: 100%|██████████| 1/1 [00:00<00:00, 13.12it/s]


In [11]:
# First result entry: the sequence and the shape of its embedding array.
results[0][0],results[0][1].shape

(b'AAAAPLHLALA', (1280,))

In [12]:
# First three values of the first result's embedding.
results[0][1][0:3]

array([ 0.15882437, -0.03162469,  0.11416737], dtype=float32)

In [13]:
# Ask the model-submitted job whether it reports itself as finished.
esm_job.done()

False

In [14]:
# Wait on the job that was submitted through the model object.
results2 = esm_job.wait(verbose=True) # wait for results

Waiting: 100%|██████████| 100/100 [00:00<00:00, 4872.51it/s, status=SUCCESS]
Retrieving: 100%|██████████| 1/1 [00:00<00:00, 36.59it/s]


In [15]:
# First entry of the second job's results: the sequence and its embedding shape.
results2[0][0],results2[0][1].shape

(b'AAAAPLHLALA', (1280,))

In [16]:
# First three values of the second job's embedding.
results2[0][1][0:3]

array([ 0.15882437, -0.03162469,  0.11416737], dtype=float32)

You can also fetch results by sequence (useful when there are many sequence embeddings!):

In [17]:
# Look up a single embedding by passing the sequence itself as the key. The key is
# taken from the submitted input rather than retyped, so it cannot drift from it.
esm_job.get_item(sequences[0])[0:3]

array([ 0.15882437, -0.03162469,  0.11416737], dtype=float32)

You can also use the `get()` method, as with other workflows:

In [18]:
# Fetch the job's results as a list of (sequence, embedding) pairs.
esm_job.get()

[(b'AAAAPLHLALA',
  array([ 0.15882437, -0.03162469,  0.11416737, ..., -0.17913206,
          0.19573624,  0.13490376], dtype=float32))]

## Resume workflows 

Lastly, it's possible to resume from where you left off using the job ID:

In [19]:
# Keep the job id of the model-submitted job; it is what load_job() is given next.
esm_job_id = esm_job.job.job_id

In [20]:
# Rebuild a job handle from the stored job id.
reloaded_job = session.embedding.load_job(esm_job_id)
# Inspect the job record of the reloaded handle.
reloaded_job.job

Job(status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='f304e749-6d36-4a8b-85df-4206acfaf50b', job_type='/embeddings/embed_reduced', created_date=datetime.datetime(2023, 7, 28, 2, 57, 16, 347282), start_date=datetime.datetime(2023, 7, 28, 2, 57, 16, 431629), end_date=datetime.datetime(2023, 7, 28, 2, 57, 16, 431629), prerequisite_job_id=None, progress_message=None, progress_counter=100, num_records=None)

In [21]:
# Sequences associated with the reloaded job.
reloaded_job.sequences

[b'AAAAPLHLALA']

In [22]:
# Fetch an embedding using only what the reloaded job carries. The key comes from
# reloaded_job.sequences rather than a hand-typed literal, so this works for any
# resumed job, not just the dummy sequence used earlier.
reloaded_job.get_item(reloaded_job.sequences[0])

array([ 0.15882437, -0.03162469,  0.11416737, ..., -0.17913206,
        0.19573624,  0.13490376], dtype=float32)