In [6]:
import json
from pyeed.core import ProteinInfo

# `pyeed` basics - getting sequence data

## Get a single sequence by accession id

In [7]:
# Fetch the entry for one accession id; `matTS` is the query for the BLAST search further down.
matTS = ProteinInfo.get_id("MBP1912539.1")

## Get many sequences based on accession ids

In [8]:
# Read the previously saved accession ids from disk.
with open("ids.json") as ids_file:
    ids = json.load(ids_file)

# Resolve all accession ids to protein info with a single call.
proteins = ProteinInfo.get_ids(ids)

# The reported count is the number of ids read from the file, not len(proteins).
print(f"Loaded {len(ids)} unique UniProt and NCBI protein accession ids.")


Output()

Loaded 1047 unique UniProt and NCBI protein accession ids.


## Run a BLAST search on the NCBI server

In [5]:
# Run a BLAST search with `matTS` as the query via its `ncbi_blast` method.
# NOTE(review): the parameter meanings below are inferred from their names — confirm against the pyeed docs.
blast_results = matTS.ncbi_blast(
    n_hits=100,  # presumably the maximum number of hits to return
    e_value=0.05,  # presumably the E-value cutoff for reported hits
    db="swissprot",  # database to search
    matrix="BLOSUM62",  # substitution matrix
    identity=0.5,  # presumably the minimum sequence identity as a fraction — verify
)

Output()

## Check out the data

As an example, inspect the first protein in `blast_results`.

In [9]:
# First entry of the BLAST results, kept under its own name for inspection.
result1 = blast_results[0]

### Organism annotation

In [12]:
# NOTE(review): this prints the `regions` of the last entry in `proteins`, not an
# organism field of `result1`, although the section heading says "Organism annotation" —
# confirm which was intended.
print(proteins[-1].regions)

[ProteinRegion(id='proteinregion1385', name='MetK2', spans=[Span(id='span2731', start=0, end=401)], note='Archaeal S-adenosylmethionine synthetase [Coenzyme transport and metabolism]; COG1812', cross_reference='CDD:441417', type=None)]


### Domain annotations

In [15]:
# Print each region annotated on the BLAST result at index 4, one print call per region.
for domain in blast_results[4].regions:
    print(domain)

[4mProteinRegion[0m
├── [94mid[0m = proteinregion727
├── [94mname[0m = MetK2
├── [94mspans[0m
│   └── 0
│       └── [4mSpan[0m
│           ├── [94mid[0m = span1392
│           ├── [94mstart[0m = 5
│           └── [94mend[0m = 405
├── [94mnote[0m = Archaeal S-adenosylmethionine synthetase [Coenzyme transport and metabolism]; COG1812
└── [94mcross_reference[0m = CDD:441417

