# Example 1: Get protein information, run BLAST, and dump the results into a database


In [30]:
from sdRDM.generator import generate_python_api

# generate_python_api("../specifications/data_model.md", "..", "pyEED")

In [1]:
%reload_ext autoreload
%autoreload 2
from pyEED.core import ProteinInfo
from pyEED.ncbi.utils import get_nucleotide_sequences

## Query NCBI

The pyEED library is centered around the `ProteinInfo` object, which integrates available information on the protein sequence, the corresponding nucleotide sequence, as well as regions and sites within the sequences. A `ProteinInfo` object can be initialized directly from NCBI with a protein sequence accession number.

In [38]:
# Build a ProteinInfo object from the NCBI accession NP_001287541.1 and print
# its tree representation (name, sequence, organism, ...).
aldolase = ProteinInfo.from_ncbi("NP_001287541.1")
print(aldolase)

unmatched site type: other
[4mProteinInfo[0m
├── [94mid[0m = proteininfo2
├── [94msource_id[0m = NP_001287541.1
├── [94mname[0m = aldolase 1, isoform M
├── [94msequence[0m = MTTYFNYPSKELQDELREIAQKIVAPGKGILAADESGPTMGKRLQDIGVENTEDNRRAYRQLLFSTDPKLAENISGVILFHETLYQKADDGTPFAEILKKKGIILGIKVDKGVVPLFGSEDEVTTQGLDDLAARCAQYKKDGCDFAKWRCVLKIGKNTPSYQSILENANVLARYASICQSQRIVPIVEPEVLPDGDHDLDRAQKVTETVLAAVYKALSDHHVYLEGTLLKPNMVTAGQSAKKNTPEEIALATVQALRRTVPAAVTGVTFLSGGQSEEEATVNLSAINNVPLIRPWALTFSYGRALQASVLRAWAGKKENIAAGQNELLKRAKANGDAAQGKYVAGSAGAGSGSLFVANHAY
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = organism3
│       ├── [94mname[0m = Drosophila melanogaster
│       ├── [94mtaxonomy_id[0m = taxon:7227
│       ├── [94mdomain[0m = Eukaryota
│       ├── [94mkingdom[0m = Metazoa
│       ├── [94mphylum[0m = Arthropoda
│       ├── [94mtax_class[0m = Insecta
│       ├── [94morder[0m = Diptera
│       ├── [94mfamily[0m = Drosophilidae
│       └── [94mgenus[0m = Dr

In [39]:
# Retrieve the nucleotide record belonging to the protein; the recorded output
# below is a DNAInfo object holding the mRNA name and sequence.
aldolase.get_dna()

[4mDNAInfo[0m
├── [94mid[0m = dnainfo2
├── [94mname[0m = Drosophila melanogaster aldolase 1, transcript variant M (Ald1), mRNA
├── [94msequence[0m = AGTCGGTTGCCCAACACCGTCGGCGACGACCTCCAAACAACCTTTTGGGGTCTCGCTCGGCATGTCTCGCTTATTTCGCTTTGGAATTTCACGTTGACGCCGCAGTCAGCGGTAGCAGAGGCATCGGCAGCGGCAGTAGCAGCGACACGGGCCGAAAATAAAAGCGTTAACCGCTCTCCTCCAGTGCAGCAGCAGCAGCGGACCCAGCCAGTAGCCAGCAGCTAATCACCACATCCCAGATTCAGTTTCCAGTTCGAACTACACTCGAATCTCAAAAATGACGACCTACTTCAACTACCCCAGCAAGGAGCTGCAGGATGAGCTGCGCGAAATCGCCCAGAAAATCGTTGCCCCCGGCAAGGGAATCCTCGCCGCCGATGAGTCCGGCCCAACCATGGGCAAGCGTCTGCAGGACATCGGCGTGGAGAACACCGAGGACAACCGCCGTGCCTACCGTCAGCTGTTGTTCAGCACTGACCCCAAGCTGGCCGAGAACATCTCTGGAGTGATCCTGTTCCACGAGACCCTCTACCAGAAGGCCGATGATGGCACCCCCTTCGCCGAGATCCTGAAGAAGAAGGGAATCATTCTGGGCATCAAGGTCGACAAGGGTGTTGTCCCACTGTTCGGCTCTGAGGATGAGGTCACCACCCAGGGTCTGGATGACCTGGCCGCCCGTTGCGCCCAGTACAAGAAGGACGGTTGCGACTTCGCCAAGTGGCGTTGCGTCCTGAAGATCGGCAAGAACACCCCATCCTACCAGTCGATCCTGGAGAACGCCAATGTCCTGGCCCGCTACGCCTCCATCTGCCAGTCGCAGCGCATCGTCCCAATTGTGGAGCC

## BLAST search

In [26]:
# Run a protein BLAST search with `aldolase` as the query, requesting 10 hits.
# As the recorded progress bar shows, the hit sequences are fetched one by one,
# so this cell takes a while to finish.
blast_results = aldolase.pblast(n_hits=10)

Running pblast search for aldolase 1, isoform M from Drosophila melanogaster...


Fetching protein sequences:   0%|          | 0/10 [00:00<?, ?it/s]

unmatched site type: other


Fetching protein sequences:  10%|█         | 1/10 [00:03<00:27,  3.07s/it]

unmatched site type: other


Fetching protein sequences:  20%|██        | 2/10 [00:05<00:19,  2.43s/it]

unmatched site type: other


Fetching protein sequences:  30%|███       | 3/10 [00:06<00:15,  2.19s/it]

unmatched site type: other


Fetching protein sequences:  40%|████      | 4/10 [00:09<00:13,  2.22s/it]

unmatched site type: other


Fetching protein sequences:  50%|█████     | 5/10 [00:10<00:10,  2.01s/it]

unmatched site type: other


Fetching protein sequences:  60%|██████    | 6/10 [00:12<00:08,  2.02s/it]

unmatched site type: other


Fetching protein sequences:  70%|███████   | 7/10 [00:14<00:05,  1.96s/it]

unmatched site type: other


Fetching protein sequences:  80%|████████  | 8/10 [00:16<00:03,  1.89s/it]

unmatched site type: other


Fetching protein sequences:  90%|█████████ | 9/10 [00:18<00:01,  1.85s/it]

unmatched site type: other


Fetching protein sequences: 100%|██████████| 10/10 [00:20<00:00,  2.02s/it]


In [27]:
# Inspect a single BLAST hit (index 4, i.e. the fifth result)
print(blast_results[4])

[4mProteinInfo[0m
├── [94mid[0m = proteininfo4
├── [94msource_id[0m = 1FBA_A
├── [94mname[0m = Chain A, FRUCTOSE 1,6-BISPHOSPHATE ALDOLASE
├── [94msequence[0m = XTTYFNYPSKELQDELREIAQKIVAPGKGILAADESGPTMGKRLQDIGVENTEDNRRAYRQLLFSTDPKLAENISGVILFHETLYQKADDGTPFAEILKKKGIILGIKVDKGVVPLFGSEDEVTTQGLDDLAARCAQYKKDGCDFAKWRCVLKIGKNTPSYQSILENANVLARYASICQSQRIVPIVEPEVLPDGDHDLDRAQKVTETVLAAVYKALSDHHVYLEGTLLKPNMVTAGQSAKKNTPEEIALATVQALRRTVPAAVTGVTFLSGGQSEEEATVNLSAINNVPLIRPWALTFSYGRALQASVLRAWAGKKENIAAGQNELLKRAKANGDAAQGKYVAGSAGAGSGSLFVANHAY
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = organism13
│       ├── [94mname[0m = Drosophila melanogaster
│       ├── [94mtaxonomy_id[0m = taxon:7227
│       ├── [94mdomain[0m = Eukaryota
│       ├── [94mphylum[0m = Arthropoda
│       ├── [94mtax_class[0m = Insecta
│       ├── [94morder[0m = Diptera
│       ├── [94mfamily[0m = Drosophilidae
│       └── [94mgenus[0m = Drosophila
├── [94mregions[0m
│   └── 0
│       └──

In [2]:
# NOTE(review): the recorded output of this cell is a NameError — it was
# apparently executed (execution count 2) before `aldolase` had been created in
# that kernel session. Run the "Query NCBI" cell first.
# NOTE(review): the earlier cell calls `aldolase.get_dna()`; confirm that
# `get_dna_seq()` exists in the current pyEED API.
aldolase.get_dna_seq()

NameError: name 'aldolase' is not defined

## Storing `ProteinSequence`s in a database



In [1]:
from sdrdm_database import DBConnector

### Setting up a local MySQL database

First, a local MySQL database needs to be set up. To do so, we run a Docker container with a MySQL database.
If Docker is not installed on your system, please follow the instructions on the [Docker website](https://docs.docker.com/get-docker/).


If this notebook is run on a macOS system with an M1 chip, the following command needs to be run in the terminal first:

>```bash
>export DOCKER_DEFAULT_PLATFORM=linux/amd64
>```

Next, navigate to the directory where this notebook is located and run the following command to start the docker container:

>```bash
>docker compose up -d
>```

### Create tables for `ProteinSequence` and `NucleotideSequence`

In [7]:
# Create the database tables for the `ProteinSequence` data model via the
# sdrdm-db command line tool. Per the recorded output, the markdown model is
# fetched from the GitHub repository given by --model-path.
# NOTE(review): presumably env.toml holds the database connection settings — confirm.
!sdrdm-db --root-obj ProteinSequence --model-path https://github.com/PyEED/pyeed.git --env-file env.toml

🎉 Connected                                                                                        

🚀 Creating tables for data model ProteinSequence
│
├── Fetching markdown model from GitHub
├── Table __model_meta__ not existing. Adding to DB!
├── Added table model 'ProteinSequence' to __model_meta__ table
├── Model 'ProteinSequence' already registered. Skipping.
├── Created table 'ProteinSequence'
├── Added table model 'ProteinSequence_coding_sequence' to __model_meta__ table
├── Created table 'ProteinSequence_coding_sequence'
├── Added table model 'NucleotideSequence_regions' to __model_meta__ table
├── Created table 'NucleotideSequence_regions'
├── Added table model 'ProteinSequence_sites' to __model_meta__ table
├── Created table 'ProteinSequence_sites'
├── Created table 'Site_positions'
├── Added table model 'ProteinSequence_regions' to __model_meta__ table
├── Created table 'ProteinSequence_regions'
├── Added table model 'ProteinSequence_organism' to __model_meta__ table
├── Cre

In [8]:
import toml

# Establish a connection to the database.
# The keyword arguments for DBConnector are read from env.toml; the context
# manager makes sure the file handle is closed again instead of being leaked.
with open("./env.toml") as env_file:
    db = DBConnector(**toml.load(env_file))

🎉 Connected                                                                                        


In [9]:
# Get an overview of the tables that were created in the database
db.connection.tables

Tables
------
- NucleotideSequence_regions
- ProteinSequence
- ProteinSequence_coding_sequence
- ProteinSequence_organism
- ProteinSequence_regions
- ProteinSequence_sites
- Site_positions
- __model_meta__

### Populate the database with `ProteinSequence`s

In [10]:
# Insert all blast results into the database.
# `blast_results` is unpacked so that every hit is passed as its own positional
# argument; with verbose=True one "Added dataset ..." line is reported per hit.
db.insert(*blast_results, verbose=True)

Added dataset ProteinSequence (ce49f3d4-03a5-4943-a206-62bc225088f2)
Added dataset ProteinSequence (17c7fa21-18a6-4d61-918a-2ad441863bd8)
Added dataset ProteinSequence (68ac0244-7402-4c3a-a532-1e62e97f64de)
Added dataset ProteinSequence (4075bec0-4faa-4386-9572-64a47ee463dc)
Added dataset ProteinSequence (3254d412-9983-4ffe-abc6-4e1c93cd3047)
Added dataset ProteinSequence (56fd0978-0d54-4392-baa9-abe17856d280)
Added dataset ProteinSequence (158ad489-eef8-46fb-9a72-06eeef3f3034)
Added dataset ProteinSequence (24d03eb8-1645-491b-9278-0ce1c6c09fe2)
Added dataset ProteinSequence (292a1ef2-bc13-4488-aed5-d5ab364e8fe5)
Added dataset ProteinSequence (0eb2870a-460e-4940-8b75-8a30e58037e9)


### Look at entries in the database

In [11]:
# Look up the `ProteinSequence` table through the database connection
db.connection.table("ProteinSequence")

In [13]:
# Let's filter the blast results for a specific organism
target = "Drosophila melanogaster"

# First, join the ProteinSequence table with the ProteinSequence_organism table
# on their shared ProteinSequence_id column. The `rname` pattern is what makes
# the organism's `name` column available as `organism_name` in the joined table.
prot_seqs = db.connection.table("ProteinSequence")
organisms = db.connection.table("ProteinSequence_organism")
joined = prot_seqs.join(
    organisms,
    prot_seqs.ProteinSequence_id == organisms.ProteinSequence_id,
    rname="organism_{name}",
)

# Next, filter the joined table for the target organism
filtered = joined.filter(joined.organism_name == target)

# Finally, we can get the corresponding ProteinSequence objects
# and print the first match
results = db.get("ProteinSequence", filtered)
print(results[0])

[4mProteinSequence[0m
├── [94mid[0m = 3254d412-9983-4ffe-abc6-4e1c93cd3047
├── [94mname[0m = Chain A, FRUCTOSE 1,6-BISPHOSPHATE ALDOLASE
├── [94msequence[0m = XTTYFNYPSKELQDELREIAQKIVAPGKGILAADESGPTMGKRLQDIGVENTEDNRRAYRQLLFSTDPKLAENISGVILFHETLYQKADDGTPFAEILKKKGIILGIKVDKGVVPLFGSEDEVTTQGLDDLAARCAQYKKDGCDFAKWRCVLKIGKNTPSYQSILENANVLARYASICQSQRIVPIVEPEVLPDGDHDLDRAQKVTETVLAAVYKALSDHHVYLEGTLLKPNMVTAGQSAKKNTPEEIALATVQALRRTVPAAVTGVTFLSGGQSEEEATVNLSAINNVPLIRPWALTFSYGRALQASVLRAWAGKKENIAAGQNELLKRAKANGDAAQGKYVAGSAGAGSGSLFVANHAY
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = 6ead49b7-28c7-4c45-b85b-967a7e31a307
│       ├── [94mname[0m = Drosophila melanogaster
│       └── [94mtaxonomy_id[0m = 7227
├── [94mregions[0m
│   └── 0
│       └── [4mRegion[0m
│           ├── [94mid[0m = ae562021-856a-47bc-a59c-65b151090404
│           ├── [94mstart[0m = 14
│           ├── [94mend[0m = 361
│           ├── [94mnote[0m = Fructose-bisphosphate aldolase class-I; p