In [1]:
# IPython magic: display matplotlib figures inline in the notebook output.
%matplotlib inline

# Getting started with PoET

This notebook briefly covers how to run the `align` and `PoET` workflows.

For more information please [read the docs](https://docs.openprotein.ai/).

In [2]:
import matplotlib.pyplot as plt
import json
import pandas as pd

## Setup

Connect to the OpenProtein backend with your credentials:

In [3]:
import openprotein

# Read the login details from a local JSON file rather than hardcoding them.
with open('secrets.config', 'r') as config_file:
    config = json.load(config_file)

# Open an authenticated session with the OpenProtein backend.
session = openprotein.connect(
    username=config['username'],
    password=config['password'],
)

We will use a small sample of the AMIE PSEAE dataset as a demo; the full dataset is available on our [website](https://docs.openprotein.ai/walkthroughs/demo-datasets-page.html):

In [4]:
# Load only the `sequence` column of the demo dataset; `usecols` avoids
# parsing columns that would be discarded immediately afterwards.
dataset = pd.read_csv("./data/AMIE_PSEAE.csv", usecols=['sequence'])
dataset.head(2)

Unnamed: 0,sequence
0,WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMK...
1,WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMK...


## Create an MSA 

We can create an MSA either from a seed, or by uploading a ready-made file. Here we will explore the seed workflow:

In [5]:
# Pick the first sequence in the dataset as the seed for the MSA.
# `.iloc[0]` selects by position, so it does not depend on the index labels.
seed = dataset.sequence.iloc[0]

We'll create an MSA from a seed sequence using the `Align` module:

In [6]:
# Submit the seed, encoded to bytes, to the align service and print the
# resulting job description.
seed_bytes = seed.encode()
msa = session.align.create_msa(seed_bytes)
print(msa)



status=<JobStatus.SUCCESS: 'SUCCESS'> job_id='20015eae-5872-4ad7-aa4a-71844c354c7d' job_type=<JobType.align_align: '/align/align'> created_date=datetime.datetime(2024, 4, 3, 9, 30, 39, 12506) start_date=None end_date=datetime.datetime(2024, 4, 3, 9, 30, 39, 12838) prerequisite_job_id=None progress_message=None progress_counter=None num_records=None sequence_length=None msa_id='20015eae-5872-4ad7-aa4a-71844c354c7d'


In [7]:
# Block until the alignment job completes, then preview the first three rows.
msa_rows = msa.wait()
list(msa_rows)[:3]

[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI0004660BEB',
  '-RHGDISSSNDTVGVAVVNYKMPRLHTVAEVLDNARKIADMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARSNDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI000730B3B9',
  '-RHGDISSSNDTVGVAVVNYKMPRLHSREEVLANAQKIADMVVGMKQGLPGMDLVIFPEYSLQGIMYDPAEMMETAVAIPGDETELLARACRKANVWGVFSLTGERHEEHPNKAPYNTLVLIDNKGEVVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISMIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKEQQVLMAKAMAWANNTYVAVANAAGF

We can examine our inputs:

In [8]:
# Show the raw input the MSA job was created from (per the output below,
# just the seed sequence).
list(msa.get_input("RAW"))

[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA']]

and the resulting MSA (limited here to 4 sequences for brevity):

In [9]:
# Show the first four rows of the generated alignment; list() materialises
# the rows so they can be sliced.
list(msa.get_input("GENERATED"))[0:4]

[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI0004660BEB',
  '-RHGDISSSNDTVGVAVVNYKMPRLHTVAEVLDNARKIADMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARSNDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI000730B3B9',
  '-RHGDISSSNDTVGVAVVNYKMPRLHSREEVLANAQKIADMVVGMKQGLPGMDLVIFPEYSLQGIMYDPAEMMETAVAIPGDETELLARACRKANVWGVFSLTGERHEEHPNKAPYNTLVLIDNKGEVVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISMIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKEQQVLMAKAMAWANNTYVAVANAAGF


## Prompt 

We can use this MSA to create a prompt with a sampling regime (see the docs for details):

In [10]:
# Sample an ensemble of 3 prompts from the MSA. The fixed random_seed
# presumably makes the sampling reproducible — confirm in the docs.
prompt = msa.sample_prompt(num_ensemble_prompts=3, random_seed=42)


In [11]:
# Display the identifier of the prompt job.
prompt.id # or prompt.job.job_id

'efe5a167-ff2a-49ac-8d04-0134a995a3a1'

In [12]:
# Optionally block until the prompt job has finished; the returned reader
# (see the cell output) is not used here.
prompt.wait()

<_csv.reader at 0x7f0a4d710580>

Because we requested 3 ensemble prompts above, we now have 3 different prompts, all drawn from the same MSA:

In [13]:
# Preview the first three rows of ensemble prompt 1; per the output, each
# row is a [name, sequence] pair.
list(prompt.get_prompt(1))[0:3]

[['UniRef100_A0A194RN05',
  'FNTHIIIDNKGDIVQTYRKLHLFDESDFTSPGSHVVTPVDTPVGRIGLEICYDMRFPELSTTLGSMRADILTFPSAFTYTGMAHWHLLLRARAIENQCYVLAAAQTGHNAKRRSYGHALCVDPWGEVLADCEEEGPCYKIAEISLEKLADVRRNMPVFQHR'],
 ['UniRef100_A0A7W0G9W8',
  'GGSAILGPDGAYLAGPLYDEEGILYAELDPTRLAEERQRDPAGHYHRPDV'],
 ['UniRef100_A0A6F9EEE2',
  'RHGDISSSPDTVGVAVVNYKMPRLHTREQVLDNARKIADMIVGMKQGLPGMDLVVFPEYSTMGIMYDPDEMFETACTVPGEETEIFGRACREANTWGVFSLTGERHEEHPRKSPYNTLVLINNRGEIVQKYRKILPWAPIEGWYPGDKTYVSDGPKGLKVSLIICDDGNYPEIWRDCAMKGAELIVRPQGYMYPAKEQQIMMAKTMAWANNVYVAVANATGFDGVYSYFGHSAIIGFDGRTLGECGEEEYGIQYAELSISAIRDARQNWQSQNQLFKLLHRGYTGIYNSGDGDKGLAECPFDFYRTWVLDAKKAQENVEKITRTELTTACCPVGGLPYNGAEREA']]

In [14]:
# Preview the first three rows of ensemble prompt 2; per the output, each
# row is a [name, sequence] pair.
list(prompt.get_prompt(2))[0:3]

[['UniRef100_A0A7I7RR00', 'FEFYKLLVTDAQKAQEVVESVTRDTVGVADCRVGNLPVE'],
 ['UniRef100_UPI001C6965F0',
  'GLNPSPGALVLGLVQARVPVISEPADLTATAERLAAQLRKAKKAMPSLDLLVFPEYSLNGLDPGTWLDDRLLCDLDGPEITQQAKACAEAGVWGCFSLMERNPGGAPWNSGIIVDASGEIKLYYRKMHPWVPAEPWQPGDGVPVCDGPAGSRLALIICHDGMLPEMAAREA'],
 ['UniRef100_A0A920N8X4',
  'LFNTTCLVGADGVLSKYRKVNPWIPWELHASPDDPFPVVDTELGKLGAAICYDWLFPETIRQLAFNGAEVLIRVSAYMTPPMDWWTLFNRARAAENTAYVVACNQGAAFENYPWPGGSMVVDFDGRVLAQADAGGEKVVVAPIDLAALRAERQRRDMRSHLRSEVH']]

In [15]:
# Preview the first three rows of ensemble prompt 3; per the output, each
# row is a [name, sequence] pair.
list(prompt.get_prompt(3))[0:3]

[['UniRef100_A0A381V2N6',
  'HGDIGSSKDTVGTAVVNYKMPRLHSKAEVLENTRKIAEMLEGMKVGLPGLDLVIFPEYSTHGIMYDEKEMYETASSCPGEETEILGQACRNAKVWGVFSLTGERHEDHPNKAPYNTLILMNDQGEIVQKYRKIMPWVPIEGWYPGNSTYVSDGPKGLKVSLIICDDGNYPEIWRDCAMRGAELIVRCQGYMYPAKEQQVTMAKAMAWANNSYVAVSNATGFDGVYSYFGYSSIIGFDGRTLGACAEEEMGVQYAQLGISAIRDARKNSQSNNHLFKLLHRGYTGMINSGDGDKGVADCPFEFYSKWVNDPESTREMVEAMTRDTIGTEECPIEGIP'],
 ['UniRef100_A0A3A9FM73',
  'ECLSVVGAQYAPIGAVSMTDVDRNLNTLLNFMDRASGAFPGTDLIVAPEACLQGFPQSGWENALLTLEAQQVKALCEKCAELEMWGVFAFLLKSELPGAFTNTAVLVDDKGKIRHTYDKMNPWIPFETSLPGESCTVCDGPKGAKIGLIICADGDYPEIWREAAVNGANVIIRPTHYMDPWQNAWEITNKAGAYFNQVYVVAVNASGENENYSCFGRSMILGPDGNIICEGGNGVPSMIKANLYPGIIDAMRKQAVHSAPMYSYDHRGASSRKYAGTGRGTEDYRSYA'],
 ['UniRef100_UPI0008305C87',
  'RHGDISSSPDTVGVAVVNYKMPRLHTRAEVVDNAKAIAEMVVGMKSGLPGMDLVVFPEYSTQGIMYDEQEMFDTAATIPGQETAIFSEACRKARVWGVFSITGERHEDHPDKPPYNTLVLIDDNGDIVQKYRKILPWCPIEGWYPGDTTHVTVGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKEQQVLMAKAMAWANNCYVAVANAAGFDGVYSYFGNSALIGFDGRTLGETGEEEYGIQYAQLSVSAIRDARAHDQSQNHLFKL

In [16]:
# Collect the sequence field (second item of every row) of each ensemble prompt.
prompt1_seqs, prompt2_seqs, prompt3_seqs = (
    [row[1] for row in list(prompt.get_prompt(prompt_index))]
    for prompt_index in (1, 2, 3)
)

# Sequences that appear in all three prompts.
common_seqs = set(prompt1_seqs).intersection(prompt2_seqs, prompt3_seqs)

print(f"N seqs in prompt1: {len(prompt1_seqs)}, prompt2: {len(prompt2_seqs)} prompt3: {len(prompt3_seqs)}")
print(f"Seqs found in all 3 prompts: {len(common_seqs)} ")

N seqs in prompt1: 45, prompt2: 47 prompt3: 42
Seqs found in all 3 prompts: 1 


In [17]:
# Display the MSA id and the prompt id together; both are used with
# session.load_job further down to resume work.
msa.msa_id, prompt.prompt_id

('20015eae-5872-4ad7-aa4a-71844c354c7d',
 'efe5a167-ff2a-49ac-8d04-0134a995a3a1')

## Scoring with PoET

We can then use our prompt with the PoET model, obtained from the `embedding` API, to access the various PoET functions:

In [18]:
# Encode every dataset sequence to bytes before sending it for scoring.
seqs = [sequence.encode() for sequence in dataset["sequence"]]

In [19]:
# Fetch the PoET model handle from the session's embedding API.
poet = session.embedding.get_model('poet')

In [20]:
# Submit a scoring job: score our dataset sequences against the prompt.
scorejob = poet.score(
    prompt=prompt.prompt_id,
    sequences=seqs,
)


In [21]:
# Wait for the scoring job, then show the first result: per the output, a
# (name, sequence, scores) tuple with three scores — presumably one per
# ensemble prompt; confirm in the docs.
score_results = scorejob.wait()
score_results[0]

('sequence-01',
 b'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA',
 array([-161.7098999 , -179.46661377, -111.26499939]))

## Single site analysis with PoET

A similar flow yields a single site mutation analysis of a sequence:

In [22]:
# Submit a single-site mutation analysis for a short example sequence.
# NOTE(review): this call passes the prompt object itself, whereas the score
# and generate cells pass prompt.prompt_id — presumably both forms are
# accepted; confirm against the API docs.
sspjob = poet.single_site(prompt=prompt, sequence="AAPLAA".encode())


In [25]:
# Wait for the analysis and look up one variant. Keys are bytes; b'A1R'
# presumably denotes residue A at position 1 mutated to R — confirm in the docs.
ssp_results = sspjob.wait()
ssp_results[b'A1R']

array([-29.60253906, -30.09863281, -30.48925781])

## Generate *de novo* sequences

Lastly, we can use the generation workflow:

In [26]:
# Request 10 new sequences generated from our prompt.
genjob = poet.generate(
    prompt=prompt.prompt_id,
    num_samples=10,
)


In [27]:

# Wait for generation to finish and show the first generated record: per the
# output, a name, the sequence as bytes, and an array of three values.
gen_results = genjob.wait()
gen_results[0]

('generated-sequence-1',
 b'RHGDISSSPDTVGVAVVNYKMPRLHTREEVLDNCRKIADMIEGMKQGLPGLDLVVFPEYSTQGIMYDRAEMLRTASTVPGVETDILGRACRKHKVWGVFSLTGERHEDHPAKAPYNTLVLINDRGEIVQKYRKILPWTPIEGWYPGDRTYVSKGPKGMKVSLVICDDGNYPEIWRDCAMKGAELIVRPQGYMYPAKDQQVMMARAMAWANNVYVAVVNASGYDGVYSYFGHSAIVGFDGRVLGECGSEEYGAQYAELFVSAIREARANWQAQNHLYKLLHRGYTGTLISGEDPKGDAECQFEFFRDWVLDAERTRANAERITRSSIGTEHCPINNLFED',
 array([-241.18112183, -265.58514404, -268.01791382]))

## Resuming work

You can reload a prompt, MSA or PoET job to resume where you left off:

In [28]:
# Reload the MSA job by its id and display the job metadata.
old_msa = session.load_job(msa.msa_id)
old_msa.job

MSAJob(status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='20015eae-5872-4ad7-aa4a-71844c354c7d', job_type=<JobType.align_align: '/align/align'>, created_date=datetime.datetime(2024, 4, 3, 9, 30, 39, 12506), start_date=None, end_date=datetime.datetime(2024, 4, 3, 9, 30, 39, 12838), prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None, sequence_length=None, msa_id='20015eae-5872-4ad7-aa4a-71844c354c7d')

The reloaded MSA offers the same functionality as the original; for example, we can sample a new prompt from it:

In [29]:
# Sample a new prompt from the reloaded MSA and display its job metadata.
# NOTE(review): the positional 1 binds to sample_prompt's first parameter;
# passing it by keyword, as the earlier sample_prompt call does, would make
# the intent explicit — confirm which parameter is meant.
new_prompt = old_msa.sample_prompt(1)
new_prompt.job

PromptJob(status=<JobStatus.PENDING: 'PENDING'>, job_id='c8e3f9cc-5c39-45a6-8a59-d3e4261bd33c', job_type=<JobType.align_prompt: '/align/prompt'>, created_date=datetime.datetime(2024, 4, 3, 9, 42, 0, 871445), start_date=None, end_date=None, prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None, sequence_length=None, msa_id='c8e3f9cc-5c39-45a6-8a59-d3e4261bd33c', prompt_id='c8e3f9cc-5c39-45a6-8a59-d3e4261bd33c')

In [30]:
# Reload the earlier prompt job by its prompt id and display its metadata.
oldprompt = session.load_job(prompt.prompt_id)
oldprompt.job

PromptJob(status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='efe5a167-ff2a-49ac-8d04-0134a995a3a1', job_type=<JobType.align_prompt: '/align/prompt'>, created_date=datetime.datetime(2024, 4, 3, 9, 30, 42, 842814), start_date=datetime.datetime(2024, 4, 3, 9, 30, 43, 686853), end_date=datetime.datetime(2024, 4, 3, 9, 30, 52, 26321), prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None, sequence_length=None, msa_id='efe5a167-ff2a-49ac-8d04-0134a995a3a1', prompt_id='efe5a167-ff2a-49ac-8d04-0134a995a3a1')

In [32]:
# Reload the single-site job by its job id, fetch its results, and look up
# the scores for one variant (keys are bytes).
old_job = session.load_job(sspjob.job.job_id)
old_job.get()[b'A1N']

array([-30.65429688, -30.94726562, -31.69726562])