# Example 1: Get protein information, run BLAST, and dump the results into a database


In [4]:
from sdRDM.generator import generate_python_api

# Regenerate the pyEED Python API from the markdown data model specification.
# NOTE(review): the arguments look like (specification path, output directory,
# package name) — confirm against the sdRDM documentation.
generate_python_api("../../specifications/data_model.md", "../../", "pyEED")

In [5]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# reloaded before each cell runs, then import the central pyEED class.
%reload_ext autoreload
%autoreload 2
from pyEED.core import ProteinInfo

## Query NCBI

The pyEED library is centered around the `ProteinInfo` object, which integrates available information on the protein sequence, the corresponding nucleotide sequence, and the regions and sites within the sequences. A `ProteinInfo` can be initialized directly from a protein sequence accession number.

In [6]:
# Build a ProteinInfo from an NCBI protein accession and print it to inspect
# the populated fields (name, sequence, organism, ...).
aldolase = ProteinInfo.from_ncbi("NP_001287541.1")
print(aldolase)

[4mProteinInfo[0m
├── [94mid[0m = proteininfo0
├── [94msource_id[0m = NP_001287541.1
├── [94mname[0m = aldolase 1, isoform M
├── [94msequence[0m = MTTYFNYPSKELQDELREIAQKIVAPGKGILAADESGPTMGKRLQDIGVENTEDNRRAYRQLLFSTDPKLAENISGVILFHETLYQKADDGTPFAEILKKKGIILGIKVDKGVVPLFGSEDEVTTQGLDDLAARCAQYKKDGCDFAKWRCVLKIGKNTPSYQSILENANVLARYASICQSQRIVPIVEPEVLPDGDHDLDRAQKVTETVLAAVYKALSDHHVYLEGTLLKPNMVTAGQSAKKNTPEEIALATVQALRRTVPAAVTGVTFLSGGQSEEEATVNLSAINNVPLIRPWALTFSYGRALQASVLRAWAGKKENIAAGQNELLKRAKANGDAAQGKYVAGSAGAGSGSLFVANHAY
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = organism0
│       ├── [94mname[0m = Drosophila melanogaster
│       ├── [94mtaxonomy_id[0m = taxon:7227
│       ├── [94mdomain[0m = Eukaryota
│       ├── [94mkingdom[0m = Metazoa
│       ├── [94mphylum[0m = Arthropoda
│       ├── [94mtax_class[0m = Insecta
│       ├── [94morder[0m = Diptera
│       ├── [94mfamily[0m = Drosophilidae
│       ├── [94mgenus[0m = Drosophila
│       └── [94ms

## BLAST search

In [7]:
# Run a protein BLAST with the aldolase as query, limited to 10 hits and an
# e-value threshold of 1e-50 (both values are echoed in the output below).
blast_results = aldolase.pblast(n_hits=10, e_value=1e-50)
# Add the query protein itself so it is stored alongside its hits later on.
blast_results.append(aldolase)

🏃🏼‍♀️ Running PBLAST
╭── protein name: aldolase 1, isoform M
├── accession: NP_001287541.1
├── organism: Drosophila melanogaster
├── e-value: 1e-50
╰── max hits: 10


⬇️ Fetching protein sequences: 100%|██████████| 10/10 [00:00<00:00, 58.49it/s]

🎉 Done






## Storing `ProteinInfo` objects in a PostgreSQL database



In [8]:
from sdrdm_database import DBConnector

### Setting up a local PostgreSQL database

First, a local PostgreSQL database needs to be set up. To do so, we run a Docker container with a PostgreSQL database.
If Docker is not installed on your system, please follow the instructions on the [Docker website](https://docs.docker.com/get-docker/).


If this notebook is run on a macOS system with an M1 chip, the following command needs to be run in the terminal first:

>```bash
>export DOCKER_DEFAULT_PLATFORM=linux/amd64
>```

Next, navigate to the directory where this notebook is located and run the following command to start the docker container:

>```bash
>docker compose up -d
>```

### Delete containers

>```    
>docker rm -vf $(docker ps -aq)
>docker rmi -f $(docker images -aq)
>```

### Connect to the PostgreSQL database

In [9]:
import toml

# Establish a connection to the database.
# The connection settings are read from ./env.toml and passed to DBConnector
# as keyword arguments. The file is opened in a `with` block so the handle is
# closed as soon as the settings are parsed instead of being leaked.
with open("./env.toml") as config_file:
    db = DBConnector(**toml.load(config_file))

🎉 Connected                                                                                        


### Create tables for `ProteinInfo`

In [10]:
# Create the database tables for the ProteinInfo data model.
# The specification is referenced relative to the notebook (the same file the
# Python API is generated from) rather than through an absolute path that
# only exists on one machine.
db.create_tables(
    model=ProteinInfo,
    markdown_path="../../specifications/data_model.md",
)


🚀 Creating tables for data model ProteinInfo
│
├── Model 'ProteinInfo' already registered. Skipping.
├── Table 'ProteinInfo'. Already exists in database. Skipping.
├── Table 'ProteinInfo_citation'. Already exists in database. Skipping.
├── Table 'Citation_authors'. Already exists in database. Skipping.
├── Table 'ProteinInfo_substrates'. Already exists in database. Skipping.
├── Table 'ProteinInfo_coding_sequence_ref'. Already exists in database. Skipping.
├── Table 'DNARegion_spans'. Already exists in database. Skipping.
├── Table 'ProteinInfo_sites'. Already exists in database. Skipping.
├── Table 'Site_positions'. Already exists in database. Skipping.
├── Table 'ProteinInfo_regions'. Already exists in database. Skipping.
├── Table 'ProteinRegion_spans'. Already exists in database. Skipping.
├── Table 'ProteinInfo_organism'. Already exists in database. Skipping.
│
╰── 🎉 Created all tables for data model ProteinInfo



In [11]:
# List the names of all tables available through the database connection;
# as the last expression of the cell, the list is shown as its output.
db.connection.list_tables()

['Citation_authors',
 'DNARegion_spans',
 'ProteinInfo',
 'ProteinInfo_citation',
 'ProteinInfo_coding_sequence_ref',
 'ProteinInfo_organism',
 'ProteinInfo_regions',
 'ProteinInfo_sites',
 'ProteinInfo_substrates',
 'ProteinRegion_spans',
 'Site_positions',
 '__model_meta__']

### Populate the database with `ProteinInfo` objects

In [12]:
# Insert all BLAST results (the hits plus the query protein) into the database.
# The list is unpacked so each ProteinInfo is passed as its own argument;
# verbose=True reports every dataset that was added.
db.insert(*blast_results, verbose=True)

Added dataset ProteinInfo (cd511ca4-357b-4de5-99ec-77f44dfeadff)
Added dataset ProteinInfo (3e8010cd-fb1a-4666-a43a-cecc142dde3a)
Added dataset ProteinInfo (1a7fa8ca-a5f7-43d9-afa7-6d4cab3a83a8)
Added dataset ProteinInfo (d1532780-8014-4bbf-b451-b2168e42edf3)
Added dataset ProteinInfo (a8f63321-84c4-40ff-949a-239b27d35ddb)
Added dataset ProteinInfo (6a3314a6-dab0-41d7-8950-ed5af0c19713)
Added dataset ProteinInfo (d498c7e8-4ba6-4640-bdda-3b2472735142)
Added dataset ProteinInfo (36f34eb9-3e6a-4a70-833d-6dbf127a8cdd)
Added dataset ProteinInfo (a61f987e-b782-4781-86a1-1e0d120c1494)
Added dataset ProteinInfo (895c995d-966e-4014-8bd4-dc2294837407)
Added dataset ProteinInfo (d6f3050b-72d1-481f-a08d-c9c01694ec7b)


### Look at entries in the database

In [16]:
# Look up the main "ProteinInfo" table to inspect the stored entries.
db.connection.table("ProteinInfo")

In [14]:
# Let's filter the BLAST results for a specific organism
target = "Drosophila melanogaster"

# First, join the ProteinInfo table with the ProteinInfo_organism table on the
# shared ProteinInfo_id key. `rname` gives clashing column names from the
# organism table an "organism_" prefix, so the organism's `name` column is
# available as `organism_name` in the joined table.
prot_seqs = db.connection.table("ProteinInfo")
organisms = db.connection.table("ProteinInfo_organism")
joined = prot_seqs.join(
    organisms,
    prot_seqs.ProteinInfo_id == organisms.ProteinInfo_id,
    rname="organism_{name}",
)

# Next, keep only the rows of the joined table that belong to the target organism
filtered = joined.filter(joined.organism_name == target)

# Finally, retrieve the corresponding ProteinInfo objects and report how many matched
results = db.get("ProteinInfo", filtered)
print(len(results))

9
