In [1]:
import json
from pyeed.core import ProteinRecord

# `pyeed` basics - getting sequence data

## Get a single sequence based on accession id

In [2]:
# Look up a single protein record by its accession id.
mat_accession = "MBP1912539.1"
matTS = ProteinRecord.get_id(mat_accession)

## Get many sequences based on accession ids

In [4]:
# Read the saved accession ids from ids.json in the working directory.
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open("ids.json", "r", encoding="utf-8") as ids_file:
    ids = json.load(ids_file)

In [5]:
# Resolve all accession ids to protein records with one get_ids call.
proteins = ProteinRecord.get_ids(ids)

# Report how many accession ids were passed in.
id_count = len(ids)
print(f"Loaded {id_count} unique UniProt and NCBI protein accession ids.")

Output()

Loaded 1047 unique UniProt and NCBI protein accession ids.


In [7]:
from pyeed.align.pairwise import PairwiseAligner

# Aligner configured for global alignment mode.
aligner = PairwiseAligner(mode="global")

# Map each protein id to its sequence; this dict is the aligner's input.
sequences = {protein.id: protein.sequence for protein in proteins}

# Keep the result in a variable so later cells can reuse it without
# recomputing; previously it was only reachable through the cell output.
alignments = aligner.align_multipairwise(sequences)

# Display only a short preview. The result is a list with one dict per
# aligned pair, so echoing all of it floods the notebook with output.
alignments[:3]

Output()

[{'score': 85.0,
  'identity': 0.3096446700507614,
  'gaps': 375,
  'mismatches': 33,
  'sequences': [{'id': 'Q9YBK2',
    'sequence': 'MARRIVVESYPYPRVEDLQVELVERKGLGHPDTICDAAAEAVSRELSKYYLERFGKILHHNVDKVLLVGGQAAPRLGGGEVLQPIYILVSGRVTTEVRTGGGVESVPVGPIILRAVKNYIRENFRFLDPEEHVIVDYRVGRGSVDLVGIFEAEDKVPLANDTSIGSGHAPLSTLERLVLETERILNSRETKERLPAVGEDVKVMGVRDGKSITLTVAMAVVSSQVGSVSDYLAVKEEAESLILDLASRIAPDYDVRVNINTGDIPEKKILYLTVTGTSAEHGDDGATGRGNRVNGLITPMRPMSMEAAAGKNPVNHVGKIYNVVANEMAALIHREVKGVEEVYVKLVSQIGKPIDRPRIVDVKVRMEGGREVTADAKREIEAIANSVLDGITGYTEKLVRGDITVY'},
   {'id': 'A5UMW7',
    'sequence': 'MRNIIVKELNQTYIEDIDIEIVERKGIGHPDSISDGIGETVSEALCKMYMDELGGVLHHNTDEVQITAGESNPVFGGGKILKPIDILLTGRGVSEYDGIKFPLDRVAIEAAKNFLDDTIINLDVELDTVVECKIGHGSGDLVDVFKREGAPSSNDTSFGVGYAPFSETETLVKATEELLNSKPFKAKHPAVGEDIKVMGLREGEKITLTIGCAMVSKFVANREEYIAVREELKDIVSDLATKYTNREVEVFVNTADNDDATDESGYYLTVTGTSAEMGDDGSVGRGNRANGLITPCRPMSMEASSGKNPINHVGKIYNILSNEIAKDVVENVEGIKQMNVMILSQIGKPIDQPKAASTQVILEDGVKLEDVDKKVEQIVDRWLEDISIITENVVQGKTRTF'}],
  'aligned_

## Run a BLAST search on the NCBI server

In [36]:
# Search settings for the BLAST query, gathered in one place.
blast_params = {
    "n_hits": 100,
    "e_value": 0.05,
    "db": "swissprot",
    "matrix": "BLOSUM62",
    "identity": 0.5,
}

# Run the search using the single record loaded earlier as the query.
blast_results = matTS.ncbi_blast(**blast_params)

Output()

In [37]:
# Bare expression as the cell's last line so the notebook displays the BLAST hits.
blast_results

[ProteinRecord(id='WP_048165429.1', uri=None, accession_id=None, name='methionine adenosyltransferase', organism=Organism(id='719a7f92-54db-47f1-88ae-eb25207826aa', taxonomy_id=971279, name='Palaeococcus pacificus', domain='Archaea', kingdom=None, phylum='Euryarchaeota', tax_class='Thermococci', order='Thermococcales', family='Thermococcaceae', genus='Palaeococcus', species=None, json_ld_type=['Organism'], json_ld_context={'Organism': 'https://github.com/PyEED/pyeed/Organism', 'taxonomy_id': 'http://edamontology.org/data_1179', 'name': 'http://edamontology.org/data_2909', 'kingdom': 'http://edamontology.org/data_1044', 'family': 'http://edamontology.org/data_2732', 'genus': 'http://edamontology.org/data_1870', 'species': 'http://edamontology.org/data_1045'}), sequence='MAEKRRNIVVEEIHKTPVEMQKVELVERKGIGHPDSIADGIAEAVSRALSREYIKRYGIILHHNTDQVEVVGGRAYPRFGGGEVIKPIYILLSGRAVELIDRELFPVHEVAIKAAKEYLRKAVRHLNIDEHVVIDSRIGQGSVDLVGVFNKAKENPIPLSNDTSFGVGFAPFSEVEKIVYETEKMLNSDEFKKKFPAVGEDIKVMGLRKGDEIDLTIAAA

## Check out the data

As an example, inspect the first protein in `proteins`.

In [10]:
# Print the first entry of `proteins`.
first_protein = proteins[0]
print(first_protein)

[4mProteinRecord[0m
├── [94mid[0m = Q9YBK2
├── [94maccession_id[0m = Q9YBK2
├── [94mname[0m = S-adenosylmethionine synthase
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = dc3bc973-3e02-44e7-8605-e412fd547834
│       ├── [94mtaxonomy_id[0m = 272557
│       ├── [94mname[0m = Aeropyrum pernix K1
│       ├── [94mdomain[0m = Archaea
│       ├── [94mphylum[0m = Thermoproteota
│       ├── [94mtax_class[0m = Thermoprotei
│       ├── [94morder[0m = Desulfurococcales
│       ├── [94mfamily[0m = Desulfurococcaceae
│       └── [94mgenus[0m = Aeropyrum
├── [94msequence[0m = MARRIVVESYPYPRVEDLQVELVERKGLGHPDTICDAAAEAVSRELSKYYLERFGKILHHNVDKVLLVGGQAAPRLGGGEVLQPIYILVSGRVTTEVRTGGGVESVPVGPIILRAVKNYIRENFRFLDPEEHVIVDYRVGRGSVDLVGIFEAEDKVPLANDTSIGSGHAPLSTLERLVLETERILNSRETKERLPAVGEDVKVMGVRDGKSITLTVAMAVVSSQVGSVSDYLAVKEEAESLILDLASRIAPDYDVRVNINTGDIPEKKILYLTVTGTSAEHGDDGATGRGNRVNGLITPMRPMSMEAAAGKNPVNHVGKIYNVVANEMAALIHREVKGVEEVYVKLVSQIGKPIDRPRIVDVKVRMEGGREVTADAKREI

In [1]:
import httpx

In [17]:
from pathlib import Path

# Multi-FASTA input, resolved from the current user's home directory rather
# than hardcoding one user's absolute path. Kept as a str, as before.
path = str(Path.home() / "Downloads" / "multi.fasta")

# Upload the file as multipart form data to the local alignment endpoint.
# The with-block closes the file handle once the request has finished;
# previously the handle was opened and never closed.
with open(path, "rb") as fasta_file:
    files = {"file": fasta_file}
    r = httpx.post("http://localhost:5001/clustalo/align", files=files)

# Raise on 4xx/5xx so an error body is not printed as if it were an alignment.
r.raise_for_status()
print(r.text)

["CLUSTAL O(1.2.4) multiple sequence alignment\n\n\nAEB39622.1:34-394      MTTYFNYPSKELQDELREIAQKIVAPGKGILAADESGPTMGKRLQDIGVENTEDNRRAYR\nXP_039231177.1         MTTYFNYPSKELQDELRDIAQRIVAPGKGILAADESGPTMGKRLQDIGVENTEDNRRAYR\n                       *****************:***:**************************************\n\nAEB39622.1:34-394      QL-FSTDPKLAENISGVILFHETLYQKADDGTPFAEILKKKGIILGIKVDKGVVPLFGSE\nXP_039231177.1         QLLFSTDPKLAENISGVILFHETLYQKADDGTPFADILKKKGIILGIKVDKGVVPLFGSE\n                       ** ********************************:************************\n\nAEB39622.1:34-394      DEVTTQGLDDLAARCAQYKKDG-DFAKWRCVLKIGKNTPSYQSILENANVLARYASICQS\nXP_039231177.1         DEVTTQGLDDLAARCAQYKKDGCDFAKWRCVLKIGKNTPSYQSILENANVLARYASICQS\n                       ********************** *************************************\n\nAEB39622.1:34-394      QRIVPIVEPEVLPDGDHDLDRAQKVTETVLAAVYKALSDHHVY-EGTLLKPNMVTAGQSA\nXP_039231177.1         QRIVPIVEPEVLPDGDHDLDRAQKVTETVLAAVYKALSDHHVYLEGTLLKPNMVTAGQSA\n       