# Example 1: Get protein information, run BLAST, and dump the results into a database


In [1]:
%reload_ext autoreload
%autoreload 2
from pyEED.core import ProteinInfo

## Query NCBI

The pyEED library is centered around the `ProteinInfo` object, which integrates the available information on a protein sequence, the corresponding nucleotide sequence, and the regions and sites within these sequences. A `ProteinInfo` can be initialized directly from a protein sequence accession number.

In [2]:
# Build the ProteinInfo entry for the aldolase directly from its NCBI accession
accession = "NP_001287541.1"
aldolase = ProteinInfo.from_ncbi(accession)
print(aldolase)

[4mProteinInfo[0m
├── [94mid[0m = proteininfo0
├── [94msource_id[0m = NP_001287541.1
├── [94mname[0m = aldolase 1, isoform M
├── [94msequence[0m = MTTYFNYPSKELQDELREIAQKIVAPGKGILAADESGPTMGKRLQDIGVENTEDNRRAYRQLLFSTDPKLAENISGVILFHETLYQKADDGTPFAEILKKKGIILGIKVDKGVVPLFGSEDEVTTQGLDDLAARCAQYKKDGCDFAKWRCVLKIGKNTPSYQSILENANVLARYASICQSQRIVPIVEPEVLPDGDHDLDRAQKVTETVLAAVYKALSDHHVYLEGTLLKPNMVTAGQSAKKNTPEEIALATVQALRRTVPAAVTGVTFLSGGQSEEEATVNLSAINNVPLIRPWALTFSYGRALQASVLRAWAGKKENIAAGQNELLKRAKANGDAAQGKYVAGSAGAGSGSLFVANHAY
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = organism0
│       ├── [94mname[0m = Drosophila melanogaster
│       ├── [94mtaxonomy_id[0m = taxon:7227
│       ├── [94mdomain[0m = Eukaryota
│       ├── [94mkingdom[0m = Metazoa
│       ├── [94mphylum[0m = Arthropoda
│       ├── [94mtax_class[0m = Insecta
│       ├── [94morder[0m = Diptera
│       ├── [94mfamily[0m = Drosophilidae
│       ├── [94mgenus[0m = Drosophila
│       └── [94ms

## BLAST search

In [3]:
# Run a protein BLAST with the aldolase as query, limited to 10 hits and
# an e-value setting of 1e-50.
blast_results = aldolase.pblast(
    n_hits=10,
    e_value=1e-50,
)

# Add the query itself so that it is kept together with its hits.
blast_results.append(aldolase)

🏃🏼‍♀️ Running PBLAST
╭── protein name: aldolase 1, isoform M
├── accession: NP_001287541.1
├── organism: Drosophila melanogaster
├── e-value: 1e-50
╰── max hits: 10


⬇️ Fetching protein sequences: 100%|██████████| 10/10 [00:00<00:00, 52.96it/s]

🎉 Done






## Storing `ProteinInfo` objects in a PostgreSQL database



In [4]:
from sdrdm_database import DBConnector

### Setting up a local MySQL database

First, a local MySQL database needs to be set up. To do so, we run a docker container with a MySQL database.
If docker is not installed on your system, please follow the instructions on the [docker website](https://docs.docker.com/get-docker/).


If this notebook is run on a macOS system with an M1 chip, the following command needs to be run in the terminal first:

>```bash
>export DOCKER_DEFAULT_PLATFORM=linux/amd64
>```

Next, navigate to the directory where this notebook is located and run the following command to start the docker container:

>```bash
>docker compose up -d
>```

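### Delete containers

**Warning:** the commands below remove *all* Docker containers and images on this machine, not only the ones created for this example.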

>```    
>docker rm -vf $(docker ps -aq)
>docker rmi -f $(docker images -aq)
>```

### Connect to the PostgreSQL database

In [5]:
import toml

# Read the connection settings from the local TOML file. The `with` block
# closes the file handle as soon as parsing is done instead of leaking it.
with open("./env.toml") as env_file:
    db_settings = toml.load(env_file)

# Establish a connection to the database, passing the parsed settings
# as keyword arguments
db = DBConnector(**db_settings)

🎉 Connected                                                                                        


### Create tables for `ProteinInfo`

In [6]:
# NOTE(review): this is an absolute path on the original author's machine —
# point it at your local copy of the data model specification before running.
data_model_md = "/Users/max/Documents/GitHub/pyeed/specifications/data_model.md"

# Create the database tables for the ProteinInfo data model
db.create_tables(model=ProteinInfo, markdown_path=data_model_md)


🚀 Creating tables for data model ProteinInfo
│
├── Table __model_meta__ not existing. Adding to DB!
├── Added table model 'ProteinInfo' to __model_meta__ table
├── Model 'ProteinInfo' already registered. Skipping.
├── Created table 'ProteinInfo'
├── Added table model 'ProteinInfo_coding_sequence_ref' to __model_meta__ table
├── Created table 'ProteinInfo_coding_sequence_ref'
├── Added table model 'DNARegion_spans' to __model_meta__ table
├── Created table 'DNARegion_spans'
├── Added table model 'ProteinInfo_sites' to __model_meta__ table
├── Created table 'ProteinInfo_sites'
├── Created table 'Site_positions'
├── Added table model 'ProteinInfo_regions' to __model_meta__ table
├── Created table 'ProteinInfo_regions'
├── Added table model 'ProteinRegion_spans' to __model_meta__ table
├── Created table 'ProteinRegion_spans'
├── Added table model 'ProteinInfo_organism' to __model_meta__ table
├── Created table 'ProteinInfo_organism'
├── Added primary key 'ProteinInfo_id' to table ProteinI

In [7]:
# Show the names of all tables returned by the connection, to check that the
# tables created for the ProteinInfo model are present
db.connection.list_tables()

['DNARegion_spans',
 'ProteinInfo',
 'ProteinInfo_coding_sequence_ref',
 'ProteinInfo_organism',
 'ProteinInfo_regions',
 'ProteinInfo_sites',
 'ProteinRegion_spans',
 'Site_positions',
 '__model_meta__']

### Populate the database with `ProteinInfo` objects

In [8]:
# Insert all BLAST results into the database. `blast_results` also contains
# the query protein, which was appended to the hits after the search.
# NOTE(review): re-running this cell presumably inserts the same entries
# again — confirm before re-executing.
db.insert(*blast_results, verbose=True)

Added dataset ProteinInfo (4f426ec3-20b9-4507-b932-b08b56931bea)
Added dataset ProteinInfo (f279b756-1be7-4052-97a0-0b96e33c03d1)
Added dataset ProteinInfo (23f7184f-22e5-4c05-aba1-d078fdd489f9)
Added dataset ProteinInfo (d2eba4c6-e6ef-41d6-a0b7-263f3527988f)
Added dataset ProteinInfo (6581d172-39b2-488a-8930-661aa23c695c)
Added dataset ProteinInfo (49a9524c-ee3b-411c-b461-510739ce8f4d)
Added dataset ProteinInfo (d9c6aed7-d4b4-4a8c-b07d-b103e12e0597)
Added dataset ProteinInfo (cf180852-232f-47fd-8bb8-36de45243afc)
Added dataset ProteinInfo (e20ad027-116a-49c5-a868-9ffb4d4e563d)
Added dataset ProteinInfo (123dbc69-5043-46e6-a8ec-57780d6cfc5d)
Added dataset ProteinInfo (a931df8e-e3b0-441c-bec7-7117e801025a)


### Look at entries in the database

In [9]:
# Display the 'ProteinInfo_organism' table as the cell result
db.connection.table("ProteinInfo_organism")

In [10]:
# Let's filter the BLAST results for a specific organism
target = "Drosophila melanogaster"

# First, join the ProteinInfo table with the ProteinInfo_organism table on
# their shared ProteinInfo_id column. The filter below relies on the organism
# table's `name` column being exposed as `organism_name`; presumably `rname`
# is the template applied to right-hand columns whose names collide — confirm
# against the connection backend's join documentation.
prot_seqs = db.connection.table("ProteinInfo")
organisms = db.connection.table("ProteinInfo_organism")
joined = prot_seqs.join(
    organisms,
    prot_seqs.ProteinInfo_id == organisms.ProteinInfo_id,
    rname="organism_{name}",
)

# Next, keep only the rows of the joined table that match the target organism
filtered = joined.filter(joined.organism_name == target)

# Finally, get the corresponding ProteinInfo objects and report how many matched
results = db.get("ProteinInfo", filtered)
print(len(results))

5
