In [1]:
import json
from pyeed.core import ProteinRecord

# `pyeed` basics - getting sequence data

## Get single sequence based on accession id

Single sequences can be retrieved using the `get_id` function. The function takes an accession id as input and returns the sequence as a `ProteinRecord` object.  
The `ProteinRecord` object contains the sequence as a string and additional information such as information on the `Organism`, `Region` or `Site` annotations of the sequence.


In [2]:
matTS = ProteinRecord.get_id("MBP1912539.1")

## Get many sequences based on accession ids

To load multiple sequences at once, the `get_ids` function can be used. The function takes a list of accession ids as input and returns a list of `ProteinRecord` objects.

In [3]:
# Load the saved ids from json
with open("ids.json", "r") as f:
    ids = json.load(f)

# Get the protein info for each id
proteins = ProteinRecord.get_ids(ids)

Output()

## Make BLAST search on NCBI server

The `ncbi_blast` method can be used to perform a BLAST search on the NCBI server. The method can be applied to a `ProteinRecord` object and returns a list of `ProteinRecord` objects that represent the hits of the BLAST search.
By specifying the `n_hits`, `e_value`, `db`, `matrix`, and `identity`, the search can be customized.

In [4]:
blast_results = matTS.ncbi_blast(
    n_hits=100,
    e_value=0.05,
    db="swissprot",
    matrix="BLOSUM62",
    identity=0.5,
)

Output()

In [None]:
print(f"Recieved {blast_results} blast results")

[ProteinRecord(id='WP_048165429.1', uri=None, accession_id=None, name='methionine adenosyltransferase', organism=Organism(id='719a7f92-54db-47f1-88ae-eb25207826aa', taxonomy_id=971279, name='Palaeococcus pacificus', domain='Archaea', kingdom=None, phylum='Euryarchaeota', tax_class='Thermococci', order='Thermococcales', family='Thermococcaceae', genus='Palaeococcus', species=None, json_ld_type=['Organism'], json_ld_context={'Organism': 'https://github.com/PyEED/pyeed/Organism', 'taxonomy_id': 'http://edamontology.org/data_1179', 'name': 'http://edamontology.org/data_2909', 'kingdom': 'http://edamontology.org/data_1044', 'family': 'http://edamontology.org/data_2732', 'genus': 'http://edamontology.org/data_1870', 'species': 'http://edamontology.org/data_1045'}), sequence='MAEKRRNIVVEEIHKTPVEMQKVELVERKGIGHPDSIADGIAEAVSRALSREYIKRYGIILHHNTDQVEVVGGRAYPRFGGGEVIKPIYILLSGRAVELIDRELFPVHEVAIKAAKEYLRKAVRHLNIDEHVVIDSRIGQGSVDLVGVFNKAKENPIPLSNDTSFGVGFAPFSEVEKIVYETEKMLNSDEFKKKFPAVGEDIKVMGLRKGDEIDLTIAAA

In [None]:
print(proteins[-1])

[4mProteinRecord[0m
├── [94mid[0m = Q9YBK2
├── [94maccession_id[0m = Q9YBK2
├── [94mname[0m = S-adenosylmethionine synthase
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = dc3bc973-3e02-44e7-8605-e412fd547834
│       ├── [94mtaxonomy_id[0m = 272557
│       ├── [94mname[0m = Aeropyrum pernix K1
│       ├── [94mdomain[0m = Archaea
│       ├── [94mphylum[0m = Thermoproteota
│       ├── [94mtax_class[0m = Thermoprotei
│       ├── [94morder[0m = Desulfurococcales
│       ├── [94mfamily[0m = Desulfurococcaceae
│       └── [94mgenus[0m = Aeropyrum
├── [94msequence[0m = MARRIVVESYPYPRVEDLQVELVERKGLGHPDTICDAAAEAVSRELSKYYLERFGKILHHNVDKVLLVGGQAAPRLGGGEVLQPIYILVSGRVTTEVRTGGGVESVPVGPIILRAVKNYIRENFRFLDPEEHVIVDYRVGRGSVDLVGIFEAEDKVPLANDTSIGSGHAPLSTLERLVLETERILNSRETKERLPAVGEDVKVMGVRDGKSITLTVAMAVVSSQVGSVSDYLAVKEEAESLILDLASRIAPDYDVRVNINTGDIPEKKILYLTVTGTSAEHGDDGATGRGNRVNGLITPMRPMSMEAAAGKNPVNHVGKIYNVVANEMAALIHREVKGVEEVYVKLVSQIGKPIDRPRIVDVKVRMEGGREVTADAKREI