In [1]:
%matplotlib inline

# Demo of PoET functionality 

This notebook will briefly cover how to run `align` and `PoET` workflows. 

For more information please [read the docs](https://docs.openprotein.ai/).

In [2]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import time
import json
import pandas as pd
import seaborn as sns 
sns.set() 

from AWSTools.Batchtools.batch_utils import fakeseq # Used for creating fake protein sequences for testing


## Setup

Connect to the OpenProtein backend with your credentials:

In [3]:
import openprotein

# Credentials are read from a separate JSON file, so no username or
# password is hardcoded in the notebook itself.
with open('../../../secrets.config', 'r') as secrets_file:
    config = json.load(secrets_file)

# Authenticated session; later cells reach the backend through `session.poet`.
session = openprotein.connect(
    username=config['username'],
    password=config['password'],
)

In [4]:
# Read the core dataset, then keep only the `sequence` column (still a DataFrame).
core_table = pd.read_csv("./data/core.csv")
dataset = core_table.loc[:, ['sequence']]
dataset.head(2)

Unnamed: 0,sequence
0,WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMK...
1,WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMK...


## Create an MSA 

We can create an MSA either from a seed, or by uploading a ready-made file. Here we will explore the seed workflow:

In [5]:
# The first sequence of the dataset serves as the seed for the MSA
seed = dataset['sequence'].iloc[0]

Start a ColabFold job to create an MSA:

In [6]:
# `create_msa` is handed the seed as bytes, hence the explicit encode step.
seed_bytes = seed.encode("utf-8")
msa = session.poet.create_msa(seed_bytes)
print(msa)



status=<JobStatus.SUCCESS: 'SUCCESS'> job_id='c8708fa8-e44f-4137-9b12-fec2db2bcb11' job_type='/align/align' created_date=datetime.datetime(2023, 7, 27, 8, 17, 27, 340881) start_date=None end_date=datetime.datetime(2023, 7, 27, 8, 17, 27, 350439) prerequisite_job_id=None progress_message=None progress_counter=None num_records=None msa_id='c8708fa8-e44f-4137-9b12-fec2db2bcb11'


In [7]:
# Wait on the MSA job, then preview the first three [name, sequence] rows it returns.
msa_rows = msa.wait()
list(msa_rows)[:3]

[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI0004660BEB',
  '-RHGDISSSNDTVGVAVVNYKMPRLHTVAEVLDNARKIADMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARSNDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI000730B3B9',
  '-RHGDISSSNDTVGVAVVNYKMPRLHSREEVLANAQKIADMVVGMKQGLPGMDLVIFPEYSLQGIMYDPAEMMETAVAIPGDETELLARACRKANVWGVFSLTGERHEEHPNKAPYNTLVLIDNKGEVVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISMIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKEQQVLMAKAMAWANNTYVAVANAAGF

We can examine our inputs:

In [8]:
# The "RAW" input is what we submitted: the seed sequence.
raw_input_rows = msa.get_input("RAW")
list(raw_input_rows)

[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA']]

and the resulting MSA (limited here to 4 sequences for brevity):

In [9]:
# Materialize the "GENERATED" rows, then show only the first four.
generated_rows = list(msa.get_input("GENERATED"))
generated_rows[:4]

[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI0004660BEB',
  '-RHGDISSSNDTVGVAVVNYKMPRLHTVAEVLDNARKIADMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARSNDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI000730B3B9',
  '-RHGDISSSNDTVGVAVVNYKMPRLHSREEVLANAQKIADMVVGMKQGLPGMDLVIFPEYSLQGIMYDPAEMMETAVAIPGDETELLARACRKANVWGVFSLTGERHEEHPNKAPYNTLVLIDNKGEVVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISMIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKEQQVLMAKAMAWANNTYVAVANAAGF


## Prompt 

We can use this MSA to create a prompt with a sampling regime (see the docs for details):

In [10]:
# Request an ensemble of 3 prompts sampled from the MSA.
# NOTE(review): the fixed random_seed is presumably there for reproducible sampling — confirm in the docs.
prompt = msa.sample_prompt(
    num_ensemble_prompts=3,
    random_seed=42,
)


In [11]:
prompt.id  # identifier of the prompt; per the original note, prompt.job.job_id is an alternative

'83c816d8-a65d-4c6d-b939-d58db669793a'

In [12]:
# Optionally wait for the prompt job here. The original note marks this step
# as not strictly necessary; the recorded output shows the call returning a
# csv reader object.
prompt.wait()

<_csv.reader at 0x7f16876be2e0>

Because we requested 3 ensemble prompts above, we now have 3 different prompts, all drawn from the same MSA:

In [13]:
# Preview the first three [name, sequence] rows of ensemble prompt 1.
first_prompt_rows = list(prompt.get_prompt(1))
first_prompt_rows[:3]

[['UniRef100_A0A959K4C9',
  'GLMICYDTRFPEMARSLALAGAEIIIVPTAWPFPRVEHWQLLSRARAIENQCYVVTANRVGKDGQAIFCGNSRVIDPHGVVVSSASEDQEEIIYAEIKRDKLDFIRTRMPVFEHRRPDVY'],
 ['UniRef100_UPI00041A74DE',
  'GSVSAWDEALLIAAIQYPVPVIKRPEDIQVQVQQICKTIDSTKAGYPDLDLIVFPEYSAQGLNTKIWTYDEMLLSLESPEVDSFRQACIRNNIWGVFSLMERNEDPSQPPYNTAIIINNSGEIVLHYRKLQPWVPIEPWMPGNGMPVCGGPKGAKLAVCICHDGMFPELAREAAYKGCNVFIRISGYSTQVNDQWIWTNRTNAWQNLMYTVSVNLAGYDE'],
 ['UniRef100_A0A7W9FMQ2',
  'GGLNKSENGVVIGLVQLQLPVTVTRDDLARQTKRIVELVGKARRNNAGMDLVVFPEYALHGLSMDTNPAIMCDLDGPEVAAFKAACAEHRIWGCFSIMERNPGGNPYNSGIVIDDQGALKLYYRKLHPWVPVEPWEPGDGIPVIDGPKGAKLALIICHDGMFPEMARECAYKGAEIMIRTAGYTAPIRESWRFTNQANAFQNLMVTANVCMCGSDGTFDSMGEGMIVNFDGTVIAHGVTGRPEIITAEVRPDLVREARAGWGVENNIYQLWHRGYVAVKGGAMDCPYTFMQDMVAG']]

In [14]:
# Preview the first three [name, sequence] rows of ensemble prompt 2.
second_prompt_rows = list(prompt.get_prompt(2))
second_prompt_rows[:3]

[['UniRef100_A0A194RN05',
  'FNTHIIIDNKGDIVQTYRKLHLFDESDFTSPGSHVVTPVDTPVGRIGLEICYDMRFPELSTTLGSMRADILTFPSAFTYTGMAHWHLLLRARAIENQCYVLAAAQTGHNAKRRSYGHALCVDPWGEVLADCEEEGPCYKIAEISLEKLADVRRNMPVFQHR'],
 ['UniRef100_A0A7W0G9W8',
  'GGSAILGPDGAYLAGPLYDEEGILYAELDPTRLAEERQRDPAGHYHRPDV'],
 ['UniRef100_A0A6F9EEE2',
  'RHGDISSSPDTVGVAVVNYKMPRLHTREQVLDNARKIADMIVGMKQGLPGMDLVVFPEYSTMGIMYDPDEMFETACTVPGEETEIFGRACREANTWGVFSLTGERHEEHPRKSPYNTLVLINNRGEIVQKYRKILPWAPIEGWYPGDKTYVSDGPKGLKVSLIICDDGNYPEIWRDCAMKGAELIVRPQGYMYPAKEQQIMMAKTMAWANNVYVAVANATGFDGVYSYFGHSAIIGFDGRTLGECGEEEYGIQYAELSISAIRDARQNWQSQNQLFKLLHRGYTGIYNSGDGDKGLAECPFDFYRTWVLDAKKAQENVEKITRTELTTACCPVGGLPYNGAEREA']]

In [15]:
# Preview the first three [name, sequence] rows of ensemble prompt 3.
third_prompt_rows = list(prompt.get_prompt(3))
third_prompt_rows[:3]

[['UniRef100_UPI0009488FB3',
  'RHGDISSSPDTVGVAVVNYKMPRLHTKSDVLANAEQIADMIIGIKQGLPGMDLIVFPEYSTMGIMYDKDEMMATATTIPGEETAIFSAACKKANTWGVFSLTGEQHEEHPHKSPYNTLVLINNEGEIVQKYRKCIPWCPIEGWYPGDRTYVTTGPKGMKISLIICDDGNYPEIWRDCAMRGAELIVRCQGYMYPAKEQQVMMAKTMAWANNCYVAVANAAGFDGVYSYFGHSAIVGFDGRTLGECGEEDMGIQYAQLSVSQIRDARANDQSQNHLFKLLHRGYTGVHNSGDGDKGIADCPFEFYRTWVMDAEKAQSDVEAMTRDTIGVVDCPVGNLPAGASEKE'],
 ['UniRef100_UPI001BD4A459',
  'GSVSAWDEALLIAAIQYPVPVIKVPEDIQVQVRQICKTIDSTKAGYPDLDLIVFPEYSAQGLNTKIWTYDEMLLSLDSPEVDCFRQACIRNDIWGVFSVMERNEDSSQPPYNAAIIINNNGEIALHYRKLQPWVPIEPWMPGNGMPVCEGPKGAKLAVCICHDGMFPELAREAAYKGCNVFIRISGYSTQVNDQWIWTNRTNAWQNLMYTVSVNLAGYDEVFYYFGEGTICNYDGNVIQQGQRNPWEIVTAELFPRLADKARENWALENSIFNLGCRGYVGKPGGERANYLTWVRDLANGEYK'],
 ['UniRef100_UPI000248378F',
  'HGDISSSYDSVGVAVVNYKMPRLHTQDEVLANCNNIAEVIDGMKQGLPGLDLVIFPEYSTHGIMYDSQEMMDTASSIPGPETDIFSEACIRNKVWGVFSLTGERHEQHPDKVPYNTLILMNDQGDIVQKYRKIMPWTPIEGWYPGNCTYVTDGPKGLKISLIICDDGNYPEIWRDCVMKGAELVIRCQGYMYPAKEQQIIVSKAMAWMNNTYVAVANAAGFDGVYSYFGHSAIVGFDGRTLGECGEEEN

In [16]:
# Collect just the sequence strings (element 1 of each [name, sequence] row)
# for each of the three ensemble prompts.
prompt1_seqs, prompt2_seqs, prompt3_seqs = (
    [row[1] for row in prompt.get_prompt(prompt_index)]
    for prompt_index in (1, 2, 3)
)

# Sequences that occur in every one of the three prompts.
shared_seqs = set(prompt1_seqs).intersection(prompt2_seqs, prompt3_seqs)

print(f"N seqs in prompt1: {len(prompt1_seqs)}, prompt2: {len(prompt2_seqs)} prompt3: {len(prompt3_seqs)}") 
print(f"Seqs found in all 3 prompts: {len(shared_seqs)} ")

N seqs in prompt1: 44, prompt2: 44 prompt3: 46
Seqs found in all 3 prompts: 0 


In [17]:
# Show the MSA and prompt ids; the "Resuming work" cells pass these same ids
# to load_msa_job / load_prompt_job.
msa.msa_id, prompt.prompt_id

('c8708fa8-e44f-4137-9b12-fec2db2bcb11',
 '83c816d8-a65d-4c6d-b939-d58db669793a')

## Scoring with PoET

In [18]:
# Encode every sequence in the dataset to bytes, ready to be used as scoring queries
seqs = [sequence.encode("utf-8") for sequence in dataset["sequence"]]

In [19]:
# Submit a scoring job: the dataset sequences are scored against our prompt.
scorejob = session.poet.score(
    prompt.prompt_id,
    queries=seqs,
)

In [20]:
# Wait for the scoring job, then show the first result.
# The recorded output carries a list of 3 scores per sequence — presumably one
# per ensemble prompt (confirm in the docs).
score_results = scorejob.wait()
score_results[0]

PoetScoreResult(sequence=b'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA', score=[-67.385009765625, -161.78848266601562, -173.0670166015625], name='sequence-01')

## Single site analysis with PoET

A similar flow yields a single-site mutation analysis of a sequence:

In [21]:
# Short example sequence, passed as bytes, for the single-site analysis.
query_sequence = "AAPLAA".encode("utf-8")
sspjob = session.poet.single_site(prompt, sequence=query_sequence)

In [22]:
# Wait for the single-site job and preview its first three entries
# (in the recorded output: the input sequence, then the variants A1R and A1N).
ssp_results = sspjob.wait()
ssp_results[:3]

[PoetSSPResult(sequence=b'input', score=[-28.7412109375, -28.05859375, -28.6044921875], name=None),
 PoetSSPResult(sequence=b'A1R', score=[-30.0703125, -29.6171875, -29.744140625], name=None),
 PoetSSPResult(sequence=b'A1N', score=[-30.44921875, -30.638671875, -31.3056640625], name=None)]

## Generate *de novo* sequences

Lastly, we can use the generation workflow:

In [23]:
# Ask for 10 sequences generated from our prompt.
genjob = session.poet.generate(
    prompt.prompt_id,
    num_samples=10,
)


In [24]:

# Wait for the generation job, then show the first generated result.
gen_results = genjob.wait()
gen_results[0]

PoetScoreResult(sequence=b'NWKMPRFHHSEEIVANCRKVADYVAGLKKGIPGLDLIIFPEYSTEGILYDINEMLSLNTSIPGQETEIFSRACIENKVWGVFSITGERHEDHPNKVPYNTLILINNQGEIVQKYRKMIPWTPIEGWYPGDKTYVSEGPKGLKISLIICDDGNYPEIWRDCAARGAELIVRCQGYMYPACDEQIKIVPVMAWCNNIYAAVANASGNDGVYSYFGHSSVVDFDGRVLGICGTEENSYQYAELSISAIRDARGNWQSQNHLYKLLHRGYTGTINSHEERQGIPECQFEFYKSWVTDPAGTQAKVEELTREAPGVKYAPIAGIPHE', score=[-348.2950134277344, -158.5674591064453, -277.7309875488281], name='generated-sequence-1')

## Resuming work

You can reload a prompt, MSA or PoET job to resume where you left off:

In [25]:
# Reload the MSA job from its id and display the associated job record.
old_msa = session.poet.load_msa_job(msa.msa_id)
old_msa.job

Job(status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='c8708fa8-e44f-4137-9b12-fec2db2bcb11', job_type='/align/align', created_date=datetime.datetime(2023, 7, 27, 8, 17, 27, 340881), start_date=None, end_date=datetime.datetime(2023, 7, 27, 8, 17, 27, 350439), prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None)

A reloaded MSA job offers the same functionality as the original; for example, we can sample a new prompt from it:

In [26]:
# Sample a fresh prompt from the reloaded MSA and display its job record.
# NOTE(review): 10 is passed positionally; which parameter it binds to is not
# visible in this notebook — confirm against the sample_prompt docs.
new_prompt = old_msa.sample_prompt(10)
new_prompt.job

PromptJob(status=<JobStatus.PENDING: 'PENDING'>, job_id='c98c0115-1a15-4055-9a29-9ad89001146a', job_type='/align/prompt', created_date=datetime.datetime(2023, 7, 27, 8, 29, 55, 455136), start_date=None, end_date=None, prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None, msa_id=None, prompt_id='c98c0115-1a15-4055-9a29-9ad89001146a')

In [27]:
# Reload the earlier prompt job from its id and display the associated job record.
oldprompt = session.poet.load_prompt_job(prompt.prompt_id)
oldprompt.job

Job(status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='83c816d8-a65d-4c6d-b939-d58db669793a', job_type='/align/prompt', created_date=datetime.datetime(2023, 7, 27, 8, 17, 30, 13621), start_date=datetime.datetime(2023, 7, 27, 8, 19, 44, 182024), end_date=datetime.datetime(2023, 7, 27, 8, 19, 48, 685526), prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None)

In [28]:
# Reload the single-site job by its job id and fetch its first three results;
# the recorded output matches what `sspjob.wait()` returned earlier.
ssp_job_id = sspjob.job.job_id
old_job = session.poet.load_poet_job(ssp_job_id)
old_job.get()[:3]

[PoetSSPResult(sequence=b'input', score=[-28.7412109375, -28.05859375, -28.6044921875], name=None),
 PoetSSPResult(sequence=b'A1R', score=[-30.0703125, -29.6171875, -29.744140625], name=None),
 PoetSSPResult(sequence=b'A1N', score=[-30.44921875, -30.638671875, -31.3056640625], name=None)]