# Example 1: Get protein information, run BLAST, and dump the results into a database


In [1]:
from sdRDM.generator import generate_python_api

# Generate the Python API from the markdown data model so that the pyEED
# import in the following cell reflects the current specification.
# NOTE(review): the second and third arguments look like the output directory
# and the package name — confirm against the sdRDM generator documentation.
generate_python_api("../../specifications/data_model.md", "../../", "pyEED")

In [2]:
# Reload edited modules automatically before each cell executes, so changes to
# the pyEED sources are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# ProteinInfo is the class the rest of this notebook works with.
from pyEED.core import ProteinInfo

## Query NCBI

The pyEED library is centered around the `ProteinInfo` object, which integrates the available information on a protein sequence, the corresponding nucleotide sequence, and the regions and sites within these sequences. A `ProteinInfo` can be initialized directly from a protein sequence accession number.

In [3]:
# Build a ProteinInfo for the given NCBI protein accession.
aldolase = ProteinInfo.from_ncbi("NP_001287541.1")
# Printing renders the populated fields (name, sequence, organism, ...) as a tree.
print(aldolase)

[4mProteinInfo[0m
├── [94mid[0m = proteininfo0
├── [94msource_id[0m = NP_001287541.1
├── [94mname[0m = aldolase 1, isoform M
├── [94msequence[0m = MTTYFNYPSKELQDELREIAQKIVAPGKGILAADESGPTMGKRLQDIGVENTEDNRRAYRQLLFSTDPKLAENISGVILFHETLYQKADDGTPFAEILKKKGIILGIKVDKGVVPLFGSEDEVTTQGLDDLAARCAQYKKDGCDFAKWRCVLKIGKNTPSYQSILENANVLARYASICQSQRIVPIVEPEVLPDGDHDLDRAQKVTETVLAAVYKALSDHHVYLEGTLLKPNMVTAGQSAKKNTPEEIALATVQALRRTVPAAVTGVTFLSGGQSEEEATVNLSAINNVPLIRPWALTFSYGRALQASVLRAWAGKKENIAAGQNELLKRAKANGDAAQGKYVAGSAGAGSGSLFVANHAY
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = organism0
│       ├── [94mname[0m = Drosophila melanogaster
│       ├── [94mtaxonomy_id[0m = taxon:7227
│       ├── [94mdomain[0m = Eukaryota
│       ├── [94mkingdom[0m = Metazoa
│       ├── [94mphylum[0m = Arthropoda
│       ├── [94mtax_class[0m = Insecta
│       ├── [94morder[0m = Diptera
│       ├── [94mfamily[0m = Drosophilidae
│       ├── [94mgenus[0m = Drosophila
│       └── [94ms

## BLAST search

In [4]:
# Run a protein BLAST with the aldolase as query, limited to 10 hits and an
# e-value of 1e-50.
blast_results = aldolase.pblast(n_hits=10, e_value=1e-50)
# Add the query itself so it is handled together with its hits in later cells.
blast_results.append(aldolase)

🏃🏼‍♀️ Running PBLAST
╭── protein name: aldolase 1, isoform M
├── accession: NP_001287541.1
├── organism: Drosophila melanogaster
├── e-value: 1e-50
╰── max hits: 10


⬇️ Fetching protein sequences: 100%|██████████| 10/10 [00:00<00:00, 57.62it/s]

🎉 Done






## Storing `ProteinInfo` objects in a PostgreSQL database



In [5]:
from sdrdm_database import DBConnector

### Setting up a local MySQL database

First, a local MySQL database needs to be set up. To do so, we run a Docker container with a MySQL database.
If docker is not installed on your system, please follow the instructions on the [docker website](https://docs.docker.com/get-docker/).


If this notebook is run on a macOS system with an M1 chip, the following command needs to be run in the terminal first:

>```bash
>export DOCKER_DEFAULT_PLATFORM=linux/amd64
>```

Next, navigate to the directory where this notebook is located and run the following command to start the docker container:

>```bash
>docker compose up -d
>```

### Delete containers

>```    
>docker rm -vf $(docker ps -aq)
>docker rmi -f $(docker images -aq)
>```

### Connect to the PostgreSQL database

In [6]:
import toml

# Establish a connection to the database.
# Passing the path (instead of an open file object) lets toml.load open and
# close the file itself, so no file handle is left dangling. The parsed
# settings are forwarded to DBConnector as keyword arguments.
db = DBConnector(**toml.load("./env.toml"))

🎉 Connected                                                                                        


### Create tables for `ProteinInfo`

In [7]:
# Create the database tables for the ProteinInfo data model.
# The specification is referenced relative to the notebook (the same file the
# Python API was generated from) so the cell runs on any checkout instead of
# depending on one user's absolute home-directory path.
db.create_tables(
    model=ProteinInfo,
    markdown_path="../../specifications/data_model.md",
)


🚀 Creating tables for data model ProteinInfo
│
├── Table __model_meta__ not existing. Adding to DB!
├── Added table model 'ProteinInfo' to __model_meta__ table
├── Model 'ProteinInfo' already registered. Skipping.
├── Created table 'ProteinInfo'
├── Added table model 'ProteinInfo_citation' to __model_meta__ table
├── Created table 'ProteinInfo_citation'
├── Added table model 'Citation_authors' to __model_meta__ table
├── Created table 'Citation_authors'
├── Added table model 'ProteinInfo_substrates' to __model_meta__ table
├── Created table 'ProteinInfo_substrates'
├── Added table model 'ProteinInfo_coding_sequence_ref' to __model_meta__ table
├── Created table 'ProteinInfo_coding_sequence_ref'
├── Added table model 'DNARegion_spans' to __model_meta__ table
├── Created table 'DNARegion_spans'
├── Added table model 'ProteinInfo_sites' to __model_meta__ table
├── Created table 'ProteinInfo_sites'
├── Created table 'Site_positions'
├── Added table model 'ProteinInfo_regions' to __model_m

In [8]:
# List the names of all tables that now exist in the connected database
db.connection.list_tables()

['Citation_authors',
 'DNARegion_spans',
 'ProteinInfo',
 'ProteinInfo_citation',
 'ProteinInfo_coding_sequence_ref',
 'ProteinInfo_organism',
 'ProteinInfo_regions',
 'ProteinInfo_sites',
 'ProteinInfo_substrates',
 'ProteinRegion_spans',
 'Site_positions',
 '__model_meta__']

### Populate the database with `ProteinInfo` objects

In [9]:
# Insert all BLAST results (the hits plus the query appended earlier) into the
# database; with verbose=True one "Added dataset" line is reported per object.
db.insert(*blast_results, verbose=True)

Added dataset ProteinInfo (f788c966-2df5-4393-a1f0-3dfee82fde20)
Added dataset ProteinInfo (e35241aa-bc18-4acd-b93e-9962ecb2ad32)
Added dataset ProteinInfo (3e9048bc-fac4-4ea1-8084-f592d1b8003a)
Added dataset ProteinInfo (6bfb0751-4c7c-4d6f-ac8f-6dffaad32efa)
Added dataset ProteinInfo (74fd1ced-4901-41c2-8199-f0e6b2802c17)
Added dataset ProteinInfo (04158b95-54d2-4cee-b726-0d551f4057c4)
Added dataset ProteinInfo (b865b87d-0ef9-4567-af9b-6c32817abbf1)
Added dataset ProteinInfo (1133dd16-9e23-4591-9234-3f497b844f03)
Added dataset ProteinInfo (e9251cb1-2281-407e-9664-8662c95ce449)
Added dataset ProteinInfo (0a59c085-feb5-405f-b3d4-8d1447bc2f5a)
Added dataset ProteinInfo (ff58b48a-61a0-4386-96c6-967d32b8fa21)


### Look at entries in the database

In [10]:
# Reference the "ProteinInfo" table through the underlying connection.
# NOTE(review): no output is recorded for this cell — confirm it displays the
# table contents as intended.
db.connection.table("ProteinInfo")

In [11]:
# Let's filter the BLAST results for a specific organism
target = "Drosophila melanogaster"

# First, join the ProteinInfo table with the ProteinInfo_organism table.
# The `rname` pattern renames the organism-side columns, which is why the
# organism's name is addressed as `organism_name` in the filter below.
prot_seqs = db.connection.table("ProteinInfo")
organisms = db.connection.table("ProteinInfo_organism")
joined = prot_seqs.join(
    organisms,
    prot_seqs.ProteinInfo_id == organisms.ProteinInfo_id,
    rname="organism_{name}",
)

# Next, keep only the rows that belong to the target organism
filtered = joined.filter(joined.organism_name == target)

# Finally, get the corresponding ProteinInfo objects and report how many matched
results = db.get("ProteinInfo", filtered)
print(len(results))

5
