In [1]:
%reload_ext autoreload
%autoreload 2
import json
from pyeed.core import ProteinRecord

# `pyeed` basics - getting sequence data

## Get a single sequence by accession id

In [12]:
# Fetch one protein record by its accession id; `matTS` is reused further down
# as the query for the BLAST search.
matTS = ProteinRecord.get_id("MBP1912539.1")

## Get multiple sequences by accession id

In [13]:
# Read the previously saved accession ids back in from the JSON file
with open("ids.json", "r") as f:
    ids = json.loads(f.read())

# Echo everything that was read
print(ids)

['MCU0861666', 'MCG2909168', 'HDO20193', 'QLH74105', 'RLF08155', 'WP_048099998', 'MCC7566513', 'PSQ10925', 'WP_008383527', 'MCJ7607402', 'WP_188877951', 'MCC6009515', 'WP_179919152', 'WP_253480477', 'WP_121819036', 'NPA14273', 'RAP48284', 'MCH3977412', 'WP_058370489', 'MBS7382949', 'MCL5675329', 'MCC7566529', 'MCE4602106', 'WP_188787152', 'NOZ89357', 'MCK4243364', 'MCL2295950', 'WP_006077501', 'WP_148860060', 'WP_255167024', 'NVM55387', 'WP_012963252', 'HHQ45179', 'WP_128693716', 'WP_159485710', 'WP_136715095', 'NPA47376', 'MBU2617480', 'WP_224424997', 'MCL5100680', 'OPY20232', 'WP_274870358', 'WP_263245494', 'MCA9702766', 'WP_255195886', 'MCI4360054', 'MCS7127230', 'WP_013826834', 'MQY68546', 'WP_179268991', 'MCS7132715', 'MCU0859210', 'WP_050048754', 'MCJ7516544', 'MCL2142833', 'MCE4623710', 'KXA95705', 'PSP28062', 'MCJ2520501', 'MCK9441463', 'RLE89012', 'NHW22645', 'CAD7775635', 'WP_245312871', 'WP_066796611', 'WP_250875294', 'MCL4308458', 'MCE4625612', 'WP_193436070', 'NYZ79728', '

In [23]:
# Fetch protein records for the first 100 accession ids only (not the whole list)
first_hundred_ids = ids[:100]
proteins = ProteinRecord.get_ids(first_hundred_ids)

# Reports the length of the full `ids` list, not the number of records fetched above
print(f"Loaded {len(ids)} unique UniProt and NCBI protein accession ids.")

Output()

Loaded 1047 unique UniProt and NCBI protein accession ids.


In [29]:
from pyeed.align.pairwise import PairwiseAligner

# Pairwise aligner configured with mode="global"
aligner = PairwiseAligner(mode="global")

# Map each fetched record's id to its sequence
sequences = dict((record.id, record.sequence) for record in proteins)

# Last expression of the cell, so the alignment results are displayed
aligner.align_multipairwise(sequences)

Output()

[{'score': 108.0,
  'identity': 0.25307125307125306,
  'gaps': 567,
  'mismatches': 41,
  'sequences': [{'id': 'MCU0859210.1',
    'sequence': 'MQRNIFIEELTHTPIEKQPIEFVERKGIGHPDSIADGLSESVSRALCKMYLERYGRILHHNTDETQIVGGQSAPKFGGGAILEPIYILLVGRATTSVNGERLPYRTTAIKAAYEYLQKTCKNLDVDWDVVLDCMIGQGSVDLRGLYETKRGLANDTSFGCGFAPFSETEKITLETERYINGKLCRKMPEIGEDVKVMACRRNDAIDVTVAAAMVDSKIPDKDHYKSVIQECRDDITDFAQKMTKRKVRVFVNTADDYSKDIYYLTVSGLSMENGDDGSVGRGNRSNGLITPMRPMSMEASAGKNPVTHVGKLYNLLSNQVAAEIYKAGRGDIVEVHTRILSQIGKPIDEPQAASANLIFAQGADVKKLEKDARAIFDTYLENIDRLTDQIVSGKLTVF'},
   {'id': 'GCF16199.1',
    'sequence': 'MSHSNFVSASIQQSSIISDSVGLGKSFIGSELLYDYRHDGKHCLLIVPANLTDQWEDLLQDATDEDGNPFFGLEIDETHLEVMSISKFQNLSYDEVQGLREQFDVLLVDEAHRFRNHGRWRSNPDDDDDYKGTRRHANLRLLRDKTMILLTATPLNNSATDLKNPIGLFTEKNEIRNKANLDFGSFDRYIDIAEQRKRVASGKEEMEPADLEDLTEKLQHEAEEISEILNEVMVLRTRKFLRVIFQPRAVPVSYRSGHSKERPMTERNIHVAPESGLPVEDQNVEIVERKGIGHPDSICDGVAESVSRRLAQTYIDRVGKVLHYNTDETQLVAGNAAPAYGGGEVLEPIYFLIVGRATKEYQGTHIPAESIALEAARDYLGEHFPHLDLDTDVIIDVELGEGSGDLQTVFGEE

## Run a BLAST search on the NCBI server

In [5]:
# Search settings, collected in one place and passed as keyword arguments.
# NOTE(review): the meaning of each option is defined by `ncbi_blast`;
# presumably `n_hits` caps the number of hits and `identity` is a minimum
# identity fraction — confirm against the pyeed documentation.
blast_options = {
    "n_hits": 100,
    "e_value": 0.05,
    "db": "swissprot",
    "matrix": "BLOSUM62",
    "identity": 0.5,
}

# Use the record fetched earlier (`matTS`) as the query
blast_results = matTS.ncbi_blast(**blast_options)

Output()

In [6]:
# Bare expression as the cell's last line: the notebook displays the full list of hits
blast_results

[ProteinRecord(id='WP_297091281.1', uri=None, accession_id=None, name='methionine adenosyltransferase', organism=Organism(id='72bee40a-d9e9-4b77-995a-706fec4081f6', taxonomy_id=35749, name='Thermococcus sp.', domain='Archaea', kingdom=None, phylum='Euryarchaeota', tax_class='Thermococci', order='Thermococcales', family='Thermococcaceae', genus='Thermococcus', species=None, json_ld_type=['Organism'], json_ld_context={'Organism': 'https://github.com/PyEED/pyeed/Organism', 'taxonomy_id': 'http://edamontology.org/data_1179', 'name': 'http://edamontology.org/data_2909', 'kingdom': 'http://edamontology.org/data_1044', 'family': 'http://edamontology.org/data_2732', 'genus': 'http://edamontology.org/data_1870', 'species': 'http://edamontology.org/data_1045'}), sequence='MAEKVRNIVVEELMRTPVEMQKVELVERKGIGHPDSIADGIAEAVSRALSREYIKRYGIILHHNTDQVEVVGGKAYPRFGGGEVIKPIYILLSGRAVEIVDRELFPVHEVAIKAAREYLRRAVRHLDLEHHVVIDSRIGQGSVDLVGVFNKAKENPIPLANDTSFGVGYAPLSETERIVLETERLLNSDEFKRKYPAVGEDIKVMGLRKGDEIDLTIAAAIVDSEVD

## Check out the data

As an example, inspect the first protein in `blast_results`.

In [7]:
# Keep the first BLAST hit under its own name for inspection
result1 = blast_results[0]

### Organism annotation

In [8]:
# Print the region annotations of the last record in `proteins`.
# NOTE(review): this section is headed "Organism annotation", but the line prints
# regions of `proteins[-1]` rather than anything from `result1` — confirm intent.
print(proteins[-1].regions)

[Region(id='MetK2', start=0, end=400, json_ld_type=['Region'], json_ld_context={'Region': 'https://github.com/PyEED/pyeed/Region'})]


### Domain annotations

In [9]:
# Take the BLAST hit at index 4 and print each of its annotated regions in turn
selected_hit = blast_results[4]
for domain in selected_hit.regions:
    print(domain)

[4mRegion[0m
├── [94mid[0m = MetK2
├── [94mstart[0m = 5
└── [94mend[0m = 405



In [2]:
# Accession ids to fetch, written out as an explicit list
mat_accessions = [
    "ANG21639",
    "ANG12548",
    "WP_129557708",
    "ANG18776",
    "ANG16013",
    "ANG17639",
    "ANG31811",
    "ANG34149",
    "ANG22502",
]

print(mat_accessions)

# Fetch one record per accession id
mats = ProteinRecord.get_ids(mat_accessions)

Output()

['ANG21639', 'ANG12548', 'WP_129557708', 'ANG18776', 'ANG16013', 'ANG17639', 'ANG31811', 'ANG34149', 'ANG22502']


AttributeError: 'Region' object has no attribute '_repo'

In [11]:
# Fetch a single record by accession id (the same accession that is listed first
# in `mat_accessions`).
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'Region' object has no attribute '_repo'" — the cause is not
# visible in this notebook; check the installed pyeed version.
starting_protein_tem = ProteinRecord.get_id("ANG21639")

Output()

AttributeError: 'Region' object has no attribute '_repo'