In [6]:
import json
from pyeed.core import ProteinRecord

# `pyeed` basics - getting sequence data

## Get single sequence based on accession id

Single sequences can be retrieved using the `get_id` function. The function takes an accession id as input and returns the sequence as a `ProteinRecord` object.  
The `ProteinRecord` object contains the sequence as a string and additional information such as information on the `Organism`, `Region` or `Site` annotations of the sequence.


In [7]:
matTS = ProteinRecord.get_id("MBP1912539.1")

## Get many sequences based on accession ids

To load multiple sequences at once, the `get_ids` function can be used. The function takes a list of accession ids as input and returns a list of `ProteinRecord` objects.

In [8]:
# Load the saved ids from json
with open("ids.json", "r") as f:
    ids = json.load(f)

# Get the protein info for each id
proteins = ProteinRecord.get_ids(ids)

Output()

## Make BLAST search on NCBI server

The `ncbi_blast` method can be used to perform a BLAST search on the NCBI server. The method can be applied to a `ProteinRecord` object and returns a list of `ProteinRecord` objects that represent the hits of the BLAST search.
By specifying the `n_hits`, `e_value`, `db`, `matrix`, and `identity`, the search can be customized.

In [9]:
blast_results = matTS.ncbi_blast(
    n_hits=100,
    e_value=0.05,
    db="swissprot",
    matrix="BLOSUM62",
    identity=0.5,
)

Output()

In [10]:
print(f"Recieved {blast_results} blast results")

Recieved [ProteinRecord(id='WP_297062393.1', uri=None, accession_id=None, name='methionine adenosyltransferase', organism=Organism(id='11cfe46f-fbde-40ba-b670-2e3484c9d40f', taxonomy_id=35749, name='Thermococcus sp.', domain='Archaea', kingdom=None, phylum='Euryarchaeota', tax_class='Thermococci', order='Thermococcales', family='Thermococcaceae', genus='Thermococcus', species=None, json_ld_type=['Organism'], json_ld_context={'Organism': 'https://github.com/PyEED/pyeed/Organism', 'taxonomy_id': 'http://edamontology.org/data_1179', 'name': 'http://edamontology.org/data_2909', 'kingdom': 'http://edamontology.org/data_1044', 'family': 'http://edamontology.org/data_2732', 'genus': 'http://edamontology.org/data_1870', 'species': 'http://edamontology.org/data_1045'}), sequence='MAEKVRNIVVEELVRTPVEMQKVELVERKGIGHPDSIADGIAEAVSRALSREYIKRYGIILHHNTDQVEVVGGRAYPQFGGGEVIKPIYILLSGRAVEMVDREFFPVHEVAIKAAKDYLRKAVRHLDIENHVVIDSRIGQGSVDLVGVFNKAKENPIPLANDTSFGVGYAPLSETEKIVLETEKALNSDDFKKEWPAVGEDIKVMGLRRGDEIDLTIA

In [11]:
print(proteins[-1])

[4mProteinRecord[0m
├── [94mid[0m = MBU7022768.1
├── [94mname[0m = methionine adenosyltransferase
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = 28d6f810-c2ac-412e-aae9-55c165f2826e
│       ├── [94mtaxonomy_id[0m = 2747605
│       ├── [94mname[0m = Theionarchaea archaeon
│       ├── [94mdomain[0m = Archaea
│       ├── [94mphylum[0m = Euryarchaeota
│       └── [94mtax_class[0m = Theionarchaea
├── [94msequence[0m = MNITVETLSRLPLEEQDIEIVERKGIGHPDSICDGIAESVSRSLSRCYIEECGRILHHNTDQVELVGGKSNPFFGGGEILQPMYLLLSGRATMHFNDGEQRSVPTHRIAIQAAREYLEDNLPNCDVDSHVVIDSRMGEGSVDLKENFEEDQVVPRANDTSFGVCFAPLTETEKLVYNAERFLNSADFKKKFPMLGEDIKIMGLRTKNEIQLMVAAAFISSRVDTRQEYERLKEEINEEIIEEFSEQFSRKLKVDINTADTGHSAYLTVTGTSSEMGDDGSVGRGNRVTGLITPYRPMSMEAAAGKNPVSHVGKIYNLLARRIAEEVAELEGVAEVQIYLLSQIGHPINDPAEACARLVTKSHTSARELEPEIKEIMSRNIKSVTQITNLVVEGSLDVF
├── [94mcoding_sequence[0m
│   └── 0
│       └── [4mRegion[0m
│           ├── [94mid[0m = JACVGX010000021.1
│           ├── [94mstart