In [1]:
%matplotlib inline

# Getting started with Embeddings

This notebook will briefly cover how to run `Embedding` workflows. 

For more information please [read the docs](https://docs.openprotein.ai/).

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import json

## Setup

Connect to the OpenProtein backend with your credentials:

In [3]:
import openprotein

# Credentials are read from a local JSON file (keys: 'username', 'password')
# so that they never appear in the notebook itself.
with open('secrets.config', 'r') as f:
    config = json.load(f)

session = openprotein.connect(
    username=config['username'],
    password=config['password'],
)

## Model metadata 

You can list the available models and fetch their metadata for more information (including publications and DOIs where available):

In [4]:
# Display every embedding model the session exposes.
session.embedding.list_models()

[esm1b_t33_650M_UR50S,
 esm1v_t33_650M_UR90S_1,
 esm1v_t33_650M_UR90S_2,
 esm1v_t33_650M_UR90S_3,
 esm1v_t33_650M_UR90S_4,
 esm1v_t33_650M_UR90S_5,
 esm2_t12_35M_UR50D,
 esm2_t30_150M_UR50D,
 esm2_t33_650M_UR50D,
 esm2_t36_3B_UR50D,
 esm2_t6_8M_UR50D,
 poet,
 prot-seq,
 rotaprot-large-uniref50w,
 rotaprot-large-uniref90-ft]

You can view more information on each model:

In [5]:
# Index 0 is `esm1b_t33_650M_UR50S` in the listing above; note that this
# selection depends on the order in which list_models() returns the models.
esm_model = session.embedding.list_models()[0]
# Show only the descriptive part of the metadata (citation title, DOI, summary).
esm_model.metadata.dict()['description']

{'citation_title': 'Biological Structure and Function Emerge from Scaling Unsupervised Learning to 250 Million Protein Sequences',
 'doi': '10.1101/622803',
 'summary': 'ESM1b model with 650M parameters'}

Alternatively, use IPython's `?` help on a model attribute:

In [6]:
# IPython-only syntax: a trailing `?` displays the object's type and docstring.
session.embedding.prot_seq?

[0;31mType:[0m            ProtembedModel
[0;31mString form:[0m     prot-seq
[0;31mFile:[0m            ~/work/openprotein-python/openprotein/api/embedding.py
[0;31mDocstring:[0m      
Masked protein language model (~300M parameters) trained on UniRef50 with contact and secondary structure prediction as secondary objectives. Uses random Fourier position embeddings and FlashAttention to enable fast inference.
max_sequence_length = 1024
supported outputs = ['attn', 'embed', 'logits']
supported tokens = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'X', 'O', 'U', 'B', 'Z'] 
[0;31mClass docstring:[0m Class providing inference endpoints for protein embedding models served by OpenProtein.

There's data available on supported tokens and outputs too:

In [7]:
# Full metadata record (model id, max sequence length, dimension, output types,
# input/output tokens, token descriptions). The output is long.
esm_model.metadata.dict()

{'model_id': 'esm1b_t33_650M_UR50S',
 'description': {'citation_title': 'Biological Structure and Function Emerge from Scaling Unsupervised Learning to 250 Million Protein Sequences',
  'doi': '10.1101/622803',
  'summary': 'ESM1b model with 650M parameters'},
 'max_sequence_length': 1022,
 'dimension': 1280,
 'output_types': ['attn', 'embed', 'logits'],
 'input_tokens': ['A',
  'R',
  'N',
  'D',
  'C',
  'Q',
  'E',
  'G',
  'H',
  'I',
  'L',
  'K',
  'M',
  'F',
  'P',
  'S',
  'T',
  'W',
  'Y',
  'V',
  'X',
  'O',
  'U',
  'B',
  'Z'],
 'output_tokens': ['<cls>',
  '<pad>',
  '<eos>',
  '<unk>',
  'L',
  'A',
  'G',
  'V',
  'S',
  'E',
  'R',
  'T',
  'I',
  'D',
  'P',
  'K',
  'Q',
  'N',
  'F',
  'Y',
  'M',
  'H',
  'W',
  'C',
  '<null_0>',
  'B',
  'U',
  'Z',
  'O',
  '.',
  '-',
  '<null_1>',
  'X'],
 'token_descriptions': [[{'id': 0,
    'token': '<cls>',
    'primary': True,
    'description': 'Start token'}],
  [{'id': 1,
    'token': '<pad>',
    'primary': True,
  

## Making requests

We can make embedding requests from the model directly or from the API:

In [8]:
# Toy input: a single short sequence, given as bytes.
sequences = [b"AAAAPLHLALA"]

In [9]:

# Submit the embedding request through the model object selected earlier.
esm_job = esm_model.embed(sequences=sequences)
# `.job` is the job's status record (status, job_id, timestamps, ...).
esm_job.job

Job(status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='3f55dfcc-1af1-462a-8aec-843ba6aabbda', job_type='/embeddings/embed', created_date=datetime.datetime(2024, 4, 3, 9, 29, 53, 253030, tzinfo=datetime.timezone.utc), start_date=datetime.datetime(2024, 4, 3, 9, 29, 53, 272956, tzinfo=datetime.timezone.utc), end_date=datetime.datetime(2024, 4, 3, 9, 29, 53, 272956, tzinfo=datetime.timezone.utc), prerequisite_job_id=None, progress_message=None, progress_counter=100, num_records=1, sequence_length=None)

In [11]:
# Same kind of request, made via the session's `prot_seq` model attribute.
# Note: this is the prot-seq model, not the ESM model used in the previous cell.
embedjob = session.embedding.prot_seq.embed(sequences= sequences )
embedjob.job

Job(status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='90cbbccb-6c25-4c64-96ab-3e8b8dc16545', job_type='/embeddings/embed', created_date=datetime.datetime(2024, 4, 3, 9, 30, 11, 175222, tzinfo=datetime.timezone.utc), start_date=datetime.datetime(2024, 4, 3, 9, 30, 11, 199390, tzinfo=datetime.timezone.utc), end_date=datetime.datetime(2024, 4, 3, 9, 30, 11, 199391, tzinfo=datetime.timezone.utc), prerequisite_job_id=None, progress_message=None, progress_counter=100, num_records=1, sequence_length=None)

## Getting results

You can retrieve the results with `wait()`, which waits for the job to complete before returning them:

In [12]:
# Wait for the job to complete and collect its results; verbose=True prints
# the "Waiting" / "Retrieving" progress bars shown below.
results = embedjob.wait(verbose=True) # wait for results

Waiting: 100%|██████████| 100/100 [00:00<00:00, 9108.15it/s, status=SUCCESS]
Retrieving: 100%|██████████| 1/1 [00:00<00:00, 26.32it/s]


In [13]:
# Each result is a (sequence, embedding) pair; show the first sequence and
# the shape of its embedding.
(results[0][0], results[0][1].shape)

(b'AAAAPLHLALA', (1024,))

In [14]:
# First three components of the first embedding vector.
results[0][1][0:3]

array([ 0.35324928,  0.20608798, -2.9375222 ], dtype=float32)

In [15]:
# Evaluates to True here: the job has already finished with status SUCCESS.
embedjob.done()

True

You can also fetch results by sequence (useful when you have many sequence embeddings!):

In [16]:
# Look up a single embedding by its sequence (bytes key); the values match
# the first three components of results[0][1].
embedjob.get_item(b"AAAAPLHLALA")[0:3]

array([ 0.35324928,  0.20608798, -2.9375222 ], dtype=float32)

You can also use the `get()` method, as with other workflows:

In [17]:
# Returns the full list of (sequence, embedding) pairs for the job.
embedjob.get()

[(b'AAAAPLHLALA',
  array([ 0.35324928,  0.20608798, -2.9375222 , ..., -0.5398171 ,
          0.44970703, -2.0318716 ], dtype=float32))]

## Resume workflows 

Lastly, it's possible to resume from where you left off with the job id:

In [18]:
# NOTE: despite the variable name, this is the job id of `esm_job`
# (the ESM request), not of `embedjob` (the prot-seq request).
embedjob_job_id = esm_job.job.job_id

In [19]:
# Rebuild a job handle from nothing but its id; the record displayed below
# carries the same job_id as the ESM embedding job submitted earlier.
reloaded_job = session.load_job(embedjob_job_id)
reloaded_job.job

Job(status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='3f55dfcc-1af1-462a-8aec-843ba6aabbda', job_type='/embeddings/embed', created_date=datetime.datetime(2024, 4, 3, 9, 29, 53, 253030), start_date=datetime.datetime(2024, 4, 3, 9, 29, 53, 272956), end_date=datetime.datetime(2024, 4, 3, 9, 29, 53, 272956), prerequisite_job_id=None, progress_message=None, progress_counter=100, num_records=None, sequence_length=None)

In [20]:
# The reloaded job also exposes the sequences that were originally submitted.
reloaded_job.sequences

[b'AAAAPLHLALA']