In [4]:
from sdRDM.generator import generate_python_api
# generate_python_api("../../specifications/data_model.md", "../../", "pyEED")

In [5]:
from pyEED.core import ProteinInfo
from pyEED.alignment.pairwise import multi_pairwise_alignment
from pyEED.network import pairwise_network
from pyEED.ncbi.utils import load_accessions

# Visualize networks of pairwise sequence alignments

In the following example, a protein BLAST search is conducted based on TEM-1 and TEM-109 variant of beta-lactamase. After pairwise alignment of all sequences, a network is constructed based on the alignment scores and then visualized.

## PBLAST search for seed sequences

Results from the BLAST search are renamed based to the name of query protein variant.

In [6]:
tem1 = ProteinInfo.from_ncbi("QGC48744.1")
tem109 = ProteinInfo.from_ncbi("AAT46413.1")

blast_results = []
for tem in [tem1, tem109][:1]:
    sequences = tem.pblast(
        e_value=0.05, n_hits=15, api_key="161e6eb71dcc94511d2d0e2fc5336c1af709"
    )

    for sequence in sequences:
        sequence.name = tem.name

    blast_results.extend(sequences)
    blast_results.append(tem)

🏃🏼‍♀️ Running PBLAST
╭── protein name: TEM family beta-lactamase
├── accession: QGC48744.1
├── organism: Escherichia coli
├── e-value: 0.05
╰── max hits: 15


⬇️ Fetching protein sequences: 100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


ValidationError: 1 validation error for ProteinInfo
coding_sequence_ref
  Input should be a valid dictionary or instance of DNARegion [type=model_type, input_value=[], input_type=list]
    For further information visit https://errors.pydantic.dev/2.5/v/model_type

## Pairwise alignment of unique sequence pairs

In [None]:
alignments = multi_pairwise_alignment(
    blast_results,
    mode="global",
    match=1,
    mismatch=-1,
    gap_open=-1,
    gap_extend=0,
    n_jobs=8,
)

⛓️ Aligning sequences: 100%|██████████| 5151/5151 [00:02<00:00, 2022.74it/s]


## Visualize the alignment network

In [None]:
pairwise_network(
    alignments=alignments,
    weight="identity",
    color="name",
    label="organism",
    cutoff=0.98,
)

{'TEM family beta-lactamase': 'blue', 'beta-lactamase TEM-109': 'red'}


In [None]:
from pyEED.core import Organism, Substrate, ProteinRegion, Span

s = ProteinInfo(sequence="sdfsfg", organism=Organism(
    name="E. coli", taxonomy_id="83333"))

s.substrates.append(Substrate(name="sucrose"))

region = ProteinRegion(name="TM1", start=1, end=10)

print(s)

KeyError: 'start'

In [None]:
r = ProteinRegion(
    name="wer",
    note="th ksd sv",
    cross_reference="cross",
    spans=[Span(start=1, end=2)])

TypeError: Subscripted generics cannot be used with class and instance checks

In [None]:
r = ProteinRegion(
    name="wer",
    note="th ksd sv",
    cross_reference="cross",
)
r.add_to_spans(start=1, end=2)

r

