In [6]:
%reload_ext autoreload
%autoreload 2
import json
from pyeed.core import ProteinRecord

# `pyeed` basics - getting sequence data

## Get single sequence based on accession id

In [15]:
matTS = ProteinRecord.get_id("MBP1912539.1")

[4mProteinRecord[0m
├── [94mid[0m = MBP1912539.1
├── [94mname[0m = S-adenosylmethionine synthetase
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = d15abc06-9337-4103-965b-e49951e619e4
│       ├── [94mtaxonomy_id[0m = 49900
│       ├── [94mname[0m = Thermococcus stetteri
│       ├── [94mdomain[0m = Archaea
│       ├── [94mphylum[0m = Euryarchaeota
│       ├── [94mtax_class[0m = Thermococci
│       ├── [94morder[0m = Thermococcales
│       ├── [94mfamily[0m = Thermococcaceae
│       └── [94mgenus[0m = Thermococcus
├── [94msequence[0m = MLMAEKIRNIVVEEMVRTPVEMQQVELVERKGIGHPDSIADGIAEAVSRALSREYMKRYGIILHHNTDQVEVVGGRAYPQFGGGEVIKPIYILLSGRAVEMVDREFFPVHEVAIKAAKDYLKKAVRHLDIENHVVIDSRIGQGSVDLVGVFNKAKKNPIPLANDTSFGVGYAPLSETERIVLETEKYLNSDEFKKKWPAVGEDIKVMGLRKGDEIDLTIAAAIVDSEVDNPDDYMAVKEAIYEAAKEIVESHTQRPTNIYVNTADDPKEGIYYITVTGTSAEAGDDGSVGRGNRVNGLITPNRHMSMEAAAGKNPVSHVGKIYNILSMLIANDIAEQIEGVEEVYVRILSQIGKPIDEPLVASVQIIPKKGYSIDVLQKPAYEIADEWLANITKIQKMILEDKINVF
├──

## Get many sequences based on accession ids

In [8]:
# Load the saved ids from json
with open("ids.json", "r") as f:
    ids = json.load(f)

print(ids)

['MCU0861666', 'MCG2909168', 'HDO20193', 'QLH74105', 'RLF08155', 'WP_048099998', 'MCC7566513', 'PSQ10925', 'WP_008383527', 'MCJ7607402', 'WP_188877951', 'MCC6009515', 'WP_179919152', 'WP_253480477', 'WP_121819036', 'NPA14273', 'RAP48284', 'MCH3977412', 'WP_058370489', 'MBS7382949', 'MCL5675329', 'MCC7566529', 'MCE4602106', 'WP_188787152', 'NOZ89357', 'MCK4243364', 'MCL2295950', 'WP_006077501', 'WP_148860060', 'WP_255167024', 'NVM55387', 'WP_012963252', 'HHQ45179', 'WP_128693716', 'WP_159485710', 'WP_136715095', 'NPA47376', 'MBU2617480', 'WP_224424997', 'MCL5100680', 'OPY20232', 'WP_274870358', 'WP_263245494', 'MCA9702766', 'WP_255195886', 'MCI4360054', 'MCS7127230', 'WP_013826834', 'MQY68546', 'WP_179268991', 'MCS7132715', 'MCU0859210', 'WP_050048754', 'MCJ7516544', 'MCL2142833', 'MCE4623710', 'KXA95705', 'PSP28062', 'MCJ2520501', 'MCK9441463', 'RLE89012', 'NHW22645', 'CAD7775635', 'WP_245312871', 'WP_066796611', 'WP_250875294', 'MCL4308458', 'MCE4625612', 'WP_193436070', 'NYZ79728', '

In [9]:
# Get the protein info for each id
proteins = ProteinRecord.get_ids(ids)

print(f"Loaded {len(ids)} unique UniProt and NCBI protein accession ids.")

Output()

Loaded 1047 unique UniProt and NCBI protein accession ids.


## Make BLAST search on NCBI server

In [12]:
blast_results = matTS.ncbi_blast(
    n_hits=100,
    e_value=0.05,
    db="swissprot",
    matrix="BLOSUM62",
    identity=0.5,
)

Output()

In [13]:
blast_results

[ProteinRecord(id='05203c1c-c747-434a-9eed-b861406a3f73', uri=None, accession_id=None, name='methionine adenosyltransferase', organism=Organism(id='20ec6396-465d-4c20-9a65-e3792cd66055', taxonomy_id=187880, name='Thermococcus radiotolerans', domain='Archaea', kingdom=None, phylum='Euryarchaeota', tax_class='Thermococci', order='Thermococcales', family='Thermococcaceae', genus='Thermococcus', species=None, json_ld_type=['Organism'], json_ld_context={'Organism': 'https://github.com/PyEED/pyeed/Organism', 'taxonomy_id': 'http://edamontology.org/data_1179', 'name': 'http://edamontology.org/data_2909', 'kingdom': 'http://edamontology.org/data_1044', 'family': 'http://edamontology.org/data_2732', 'genus': 'http://edamontology.org/data_1870', 'species': 'http://edamontology.org/data_1045'}), sequence='MAEKVRNIVVEELMRTPVEMQKVELVERKGIGHPDSIADGIAEAVSRALSREYIKRYGIILHHNTDQVEVVGGRAYPKFGGGEVIKPIYILLSGRAVEMVDREFFPVHEIAIRAAKDYLKKAVRHLDLENHVVIDSRIGQGSVDLVGVFNKAKENPIPLANDTSFGVGYAPLSETERIVLETERLLNSDEFKRK

## Checkout the data

Exemplarily check out the first protein in the `blast_results`

In [None]:
result1 = blast_results[0]

### Organism annotation

In [None]:
print(proteins[-1].regions)

[ProteinRegion(id='proteinregion1385', name='MetK2', spans=[Span(id='span2731', start=0, end=401)], note='Archaeal S-adenosylmethionine synthetase [Coenzyme transport and metabolism]; COG1812', cross_reference='CDD:441417', type=None)]


### Domain annotations

In [None]:
for domain in blast_results[4].regions:
    print(domain)

[4mProteinRegion[0m
├── [94mid[0m = proteinregion727
├── [94mname[0m = MetK2
├── [94mspans[0m
│   └── 0
│       └── [4mSpan[0m
│           ├── [94mid[0m = span1392
│           ├── [94mstart[0m = 5
│           └── [94mend[0m = 405
├── [94mnote[0m = Archaeal S-adenosylmethionine synthetase [Coenzyme transport and metabolism]; COG1812
└── [94mcross_reference[0m = CDD:441417

