In [21]:
%matplotlib inline

# Demo of PoET functionality 

This notebook briefly covers how to run the `align` and `PoET` workflows. 

For more information please [read the docs](https://docs.openprotein.ai/).

In [22]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import time
import json
import pandas as pd
import seaborn as sns 
sns.set() 

from AWSTools.Batchtools.batch_utils import fakeseq # Used for creating fake protein sequences for testing


## Setup

Connect to the OpenProtein backend with your credentials:

In [23]:
import openprotein_python as openprotein

# Read the account credentials from a JSON config file rather than
# hard-coding them in the notebook.
with open('../../../../secrets.config', 'r') as f:
    config = json.loads(f.read())

# Authenticate against the OpenProtein backend; `session` is used below
# to reach the PoET API (`session.poet`).
session = openprotein.connect(
    username=config['username'],
    password=config['password'],
)

In [24]:
# Load the demo dataset, keeping only the `sequence` column, and preview it.
dataset = pd.read_csv("./data/core.csv").loc[:, ['sequence']]
dataset.head(n=2)

Unnamed: 0,sequence
0,WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMK...
1,WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMK...


## Create an MSA 

We can create an MSA either from a seed, or by uploading a ready-made file. Here we will explore the seed workflow:

In [25]:
# Use the first sequence of the dataset as the seed for the MSA.
seed = dataset["sequence"].loc[0]

Start a ColabFold job to create an MSA:

In [26]:
# The seed is submitted as UTF-8 bytes; printing the returned object
# shows the alignment job's status and identifiers.
msa = session.poet.create_msa(seed.encode("utf-8"))
print(msa)



status=<JobStatus.SUCCESS: 'SUCCESS'> job_id='6e67fcfd-016a-4b4e-8a0f-654608294dc9' job_type='/align/align' created_date=datetime.datetime(2023, 8, 4, 4, 10, 0, 147106) start_date=None end_date=datetime.datetime(2023, 8, 4, 4, 10, 0, 159244) prerequisite_job_id=None progress_message=None progress_counter=None num_records=None msa_id='6e67fcfd-016a-4b4e-8a0f-654608294dc9'


In [27]:
# Wait on the alignment job; the result iterates as [name, sequence]
# records. Preview the first three.
r = msa.wait()
list(r)[:3]

[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI0004660BEB',
  '-RHGDISSSNDTVGVAVVNYKMPRLHTVAEVLDNARKIADMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARSNDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI000730B3B9',
  '-RHGDISSSNDTVGVAVVNYKMPRLHSREEVLANAQKIADMVVGMKQGLPGMDLVIFPEYSLQGIMYDPAEMMETAVAIPGDETELLARACRKANVWGVFSLTGERHEEHPNKAPYNTLVLIDNKGEVVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISMIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKEQQVLMAKAMAWANNTYVAVANAAGF

We can examine our inputs:

In [28]:
# "RAW" is what we submitted: just the seed sequence.
[*msa.get_input("RAW")]

[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA']]

and the resulting MSA (limited here to 4 sequences for brevity):

In [29]:
# "GENERATED" is the alignment built from the seed; show the first four records.
[*msa.get_input("GENERATED")][:4]

[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI0004660BEB',
  '-RHGDISSSNDTVGVAVVNYKMPRLHTVAEVLDNARKIADMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARSNDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI000730B3B9',
  '-RHGDISSSNDTVGVAVVNYKMPRLHSREEVLANAQKIADMVVGMKQGLPGMDLVIFPEYSLQGIMYDPAEMMETAVAIPGDETELLARACRKANVWGVFSLTGERHEEHPNKAPYNTLVLIDNKGEVVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISMIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKEQQVLMAKAMAWANNTYVAVANAAGF


## Prompt 

We can use this MSA to create a prompt with a sampling regime (see the docs for details):

In [30]:
# Sample an ensemble of three prompts from the MSA; the fixed seed
# makes the sampling request repeatable.
prompt = msa.sample_prompt(
    num_ensemble_prompts=3,
    random_seed=42,
)


In [31]:
# Show the prompt's identifier (prompt.job.job_id is an alternative spelling).
prompt.id

'2ee83e11-6027-4425-a352-1e0a1de64097'

In [32]:
# Optionally wait for the prompt job to finish. Waiting is not required;
# in this run wait() returned a csv reader (see the output below).
prompt.wait()

<_csv.reader at 0x7f4e8c0785f0>

Because we requested 3 ensemble prompts above, we get 3 different prompts, all drawn from the same MSA:

In [33]:
# Peek at the first three records of ensemble prompt 1.
[*prompt.get_prompt(1)][:3]

[['UniRef100_A0A959K4C9',
  'GLMICYDTRFPEMARSLALAGAEIIIVPTAWPFPRVEHWQLLSRARAIENQCYVVTANRVGKDGQAIFCGNSRVIDPHGVVVSSASEDQEEIIYAEIKRDKLDFIRTRMPVFEHRRPDVY'],
 ['UniRef100_UPI00041A74DE',
  'GSVSAWDEALLIAAIQYPVPVIKRPEDIQVQVQQICKTIDSTKAGYPDLDLIVFPEYSAQGLNTKIWTYDEMLLSLESPEVDSFRQACIRNNIWGVFSLMERNEDPSQPPYNTAIIINNSGEIVLHYRKLQPWVPIEPWMPGNGMPVCGGPKGAKLAVCICHDGMFPELAREAAYKGCNVFIRISGYSTQVNDQWIWTNRTNAWQNLMYTVSVNLAGYDE'],
 ['UniRef100_A0A7W9FMQ2',
  'GGLNKSENGVVIGLVQLQLPVTVTRDDLARQTKRIVELVGKARRNNAGMDLVVFPEYALHGLSMDTNPAIMCDLDGPEVAAFKAACAEHRIWGCFSIMERNPGGNPYNSGIVIDDQGALKLYYRKLHPWVPVEPWEPGDGIPVIDGPKGAKLALIICHDGMFPEMARECAYKGAEIMIRTAGYTAPIRESWRFTNQANAFQNLMVTANVCMCGSDGTFDSMGEGMIVNFDGTVIAHGVTGRPEIITAEVRPDLVREARAGWGVENNIYQLWHRGYVAVKGGAMDCPYTFMQDMVAG']]

In [34]:
# Peek at the first three records of ensemble prompt 2.
[*prompt.get_prompt(2)][:3]

[['UniRef100_A0A194RN05',
  'FNTHIIIDNKGDIVQTYRKLHLFDESDFTSPGSHVVTPVDTPVGRIGLEICYDMRFPELSTTLGSMRADILTFPSAFTYTGMAHWHLLLRARAIENQCYVLAAAQTGHNAKRRSYGHALCVDPWGEVLADCEEEGPCYKIAEISLEKLADVRRNMPVFQHR'],
 ['UniRef100_A0A7W0G9W8',
  'GGSAILGPDGAYLAGPLYDEEGILYAELDPTRLAEERQRDPAGHYHRPDV'],
 ['UniRef100_A0A6F9EEE2',
  'RHGDISSSPDTVGVAVVNYKMPRLHTREQVLDNARKIADMIVGMKQGLPGMDLVVFPEYSTMGIMYDPDEMFETACTVPGEETEIFGRACREANTWGVFSLTGERHEEHPRKSPYNTLVLINNRGEIVQKYRKILPWAPIEGWYPGDKTYVSDGPKGLKVSLIICDDGNYPEIWRDCAMKGAELIVRPQGYMYPAKEQQIMMAKTMAWANNVYVAVANATGFDGVYSYFGHSAIIGFDGRTLGECGEEEYGIQYAELSISAIRDARQNWQSQNQLFKLLHRGYTGIYNSGDGDKGLAECPFDFYRTWVLDAKKAQENVEKITRTELTTACCPVGGLPYNGAEREA']]

In [35]:
# Peek at the first three records of ensemble prompt 3.
[*prompt.get_prompt(3)][:3]

[['UniRef100_UPI0009488FB3',
  'RHGDISSSPDTVGVAVVNYKMPRLHTKSDVLANAEQIADMIIGIKQGLPGMDLIVFPEYSTMGIMYDKDEMMATATTIPGEETAIFSAACKKANTWGVFSLTGEQHEEHPHKSPYNTLVLINNEGEIVQKYRKCIPWCPIEGWYPGDRTYVTTGPKGMKISLIICDDGNYPEIWRDCAMRGAELIVRCQGYMYPAKEQQVMMAKTMAWANNCYVAVANAAGFDGVYSYFGHSAIVGFDGRTLGECGEEDMGIQYAQLSVSQIRDARANDQSQNHLFKLLHRGYTGVHNSGDGDKGIADCPFEFYRTWVMDAEKAQSDVEAMTRDTIGVVDCPVGNLPAGASEKE'],
 ['UniRef100_UPI001BD4A459',
  'GSVSAWDEALLIAAIQYPVPVIKVPEDIQVQVRQICKTIDSTKAGYPDLDLIVFPEYSAQGLNTKIWTYDEMLLSLDSPEVDCFRQACIRNDIWGVFSVMERNEDSSQPPYNAAIIINNNGEIALHYRKLQPWVPIEPWMPGNGMPVCEGPKGAKLAVCICHDGMFPELAREAAYKGCNVFIRISGYSTQVNDQWIWTNRTNAWQNLMYTVSVNLAGYDEVFYYFGEGTICNYDGNVIQQGQRNPWEIVTAELFPRLADKARENWALENSIFNLGCRGYVGKPGGERANYLTWVRDLANGEYK'],
 ['UniRef100_UPI000248378F',
  'HGDISSSYDSVGVAVVNYKMPRLHTQDEVLANCNNIAEVIDGMKQGLPGLDLVIFPEYSTHGIMYDSQEMMDTASSIPGPETDIFSEACIRNKVWGVFSLTGERHEQHPDKVPYNTLILMNDQGDIVQKYRKIMPWTPIEGWYPGNCTYVTDGPKGLKISLIICDDGNYPEIWRDCVMKGAELVIRCQGYMYPAKEQQIIVSKAMAWMNNTYVAVANAAGFDGVYSYFGHSAIVGFDGRTLGECGEEEN

In [36]:
# Extract only the sequence (second field) from every record of each of
# the three ensemble prompts, fetched in order 1, 2, 3.
prompt1_seqs, prompt2_seqs, prompt3_seqs = (
    [record[1] for record in prompt.get_prompt(k)] for k in (1, 2, 3)
)

# Report the size of each prompt and how many sequences all three share.
print(f"N seqs in prompt1: {len(prompt1_seqs)}, prompt2: {len(prompt2_seqs)} prompt3: {len(prompt3_seqs)}")
print(f"Seqs found in all 3 prompts: {len(set(prompt1_seqs) & set(prompt2_seqs)  & set(prompt3_seqs))} ")

N seqs in prompt1: 44, prompt2: 44 prompt3: 46
Seqs found in all 3 prompts: 0 


In [37]:
# IDs worth noting: the "Resuming work" section uses them to reload these jobs.
(msa.msa_id, prompt.prompt_id)

('6e67fcfd-016a-4b4e-8a0f-654608294dc9',
 '2ee83e11-6027-4425-a352-1e0a1de64097')

## Scoring with PoET

In [38]:
# Encode every dataset sequence to bytes so it can be passed as a scoring query.
seqs = [sequence.encode() for sequence in dataset["sequence"]]

In [39]:
# Submit a scoring job: our dataset sequences are scored against the prompt.
scorejob = session.poet.score(
    prompt.prompt_id,
    queries=seqs,
)

In [40]:
# Collect the scoring results and show the first one. In the output below
# each result carries three scores — presumably one per ensemble prompt
# (TODO confirm against the API docs).
score_results = scorejob.wait()
score_results[0]

PoetScoreResult(sequence=b'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA', score=[-67.385009765625, -161.78848266601562, -173.0670166015625], name='sequence-01')

## Single site analysis with PoET

A similar flow yields a single-site mutation analysis of a sequence:

In [41]:
# Submit a single-site mutation analysis for a short example sequence.
# NOTE(review): the prompt object itself is passed here, whereas score()
# and generate() are given prompt.prompt_id — confirm both forms are accepted.
sspjob = session.poet.single_site(
    prompt,
    sequence="AAPLAA".encode(),
)

In [42]:
# Collect the single-site results; per the output, the first entry is the
# unmodified input and the following entries are individual substitutions.
ssp_results = sspjob.wait()
ssp_results[:3]

[PoetSSPResult(sequence=b'input', score=[-28.7412109375, -28.05859375, -28.6044921875], name=None),
 PoetSSPResult(sequence=b'A1R', score=[-30.0703125, -29.6171875, -29.744140625], name=None),
 PoetSSPResult(sequence=b'A1N', score=[-30.44921875, -30.638671875, -31.3056640625], name=None)]

## Generate *de novo* sequences

Lastly, we can use the generation workflow:

In [43]:
# Request 10 new sequences generated from our prompt.
genjob = session.poet.generate(
    prompt.prompt_id,
    num_samples=10,
)


In [44]:

# Collect the generation results and show the first one; per the output
# below, each result holds the generated sequence, its scores and a name.
gen_results = genjob.wait()
gen_results[0]

PoetScoreResult(sequence=b'AGLNGPGIDLVVFPELHLFGGNNPSAMLQASAEGIDGPRVKALQALAKDLNIWLVPGSVCEHGPNGQLFNTQLVLSPDGELAGYYRKIFPWRPFEPYDPGDRFTTVDLPGVGRVGLNICYDAWYPEVSRQLAWMGAEVILNVVKTTTPDRKQELILAKANAIVNQVFMVSVNCAGPTGQGKSIIVDPEGNTLVEAPDDQPQLLTAELDLAAVDQVRTH', score=[-106.92665100097656, -107.2594223022461, -98.85879516601562], name='generated-sequence-1')

## Resuming work

You can reload a prompt, MSA or PoET job to resume where you left off:

In [45]:
# Reload the MSA job from its ID and inspect the job metadata.
old_msa = session.poet.load_msa_job(msa.msa_id)
old_msa.job

Job(status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='6e67fcfd-016a-4b4e-8a0f-654608294dc9', job_type='/align/align', created_date=datetime.datetime(2023, 8, 4, 4, 10, 0, 147106), start_date=None, end_date=datetime.datetime(2023, 8, 4, 4, 10, 0, 159244), prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None)

The reloaded MSA job offers the same functionality as the original, for example sampling a new prompt:

In [46]:
# Sample a new prompt from the reloaded MSA and show its job record.
# NOTE(review): 10 is passed positionally, unlike the keyword arguments
# used earlier; which parameter it binds to is not visible here — confirm.
new_prompt = old_msa.sample_prompt(10)
new_prompt.job

PromptJob(status=<JobStatus.PENDING: 'PENDING'>, job_id='024fd73a-6a8a-4c98-8ea0-7c9aeb53659a', job_type='/align/prompt', created_date=datetime.datetime(2023, 8, 4, 4, 15, 0, 105720), start_date=None, end_date=None, prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None, msa_id=None, prompt_id='024fd73a-6a8a-4c98-8ea0-7c9aeb53659a')

In [47]:
# Reload the earlier prompt job from its prompt ID and inspect the job metadata.
oldprompt = session.poet.load_prompt_job(prompt.prompt_id)
oldprompt.job

Job(status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='2ee83e11-6027-4425-a352-1e0a1de64097', job_type='/align/prompt', created_date=datetime.datetime(2023, 8, 4, 4, 10, 2, 968626), start_date=datetime.datetime(2023, 8, 4, 4, 10, 12, 777392), end_date=datetime.datetime(2023, 8, 4, 4, 10, 17, 327525), prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None)

In [48]:
# Reload the single-site job by its job ID and fetch the first three results.
old_job = session.poet.load_poet_job(sspjob.job.job_id)
old_job.get()[:3]

[PoetSSPResult(sequence=b'input', score=[-28.7412109375, -28.05859375, -28.6044921875], name=None),
 PoetSSPResult(sequence=b'A1R', score=[-30.0703125, -29.6171875, -29.744140625], name=None),
 PoetSSPResult(sequence=b'A1N', score=[-30.44921875, -30.638671875, -31.3056640625], name=None)]