# Example 1: Get protein information, run BLAST, and dump the results into a database


In [7]:
%reload_ext autoreload
%autoreload 2

import os

from pyEED.core.proteinsequence import ProteinSequence
from pyEED.ncbi.utils import get_nucleotide_sequences

## Query NCBI

The pyEED library is centered around the `ProteinSequence` object, which integrates available information on protein sequence, corresponding nucleotide sequence, as well as regions and sites within the sequences. The `ProteinSequence` can be initialized directly with a protein sequence accession number.

In [42]:
# Build the ProteinSequence straight from its NCBI protein accession number
accession = "NP_001287541.1"
aldolase = ProteinSequence.from_ncbi(accession)
print(aldolase)

[4mProteinSequence[0m
├── [94mid[0m = NP_001287541.1
├── [94mname[0m = aldolase 1, isoform M
├── [94msequence[0m = MTTYFNYPSKELQDELREIAQKIVAPGKGILAADESGPTMGKRLQDIGVENTEDNRRAYRQLLFSTDPKLAENISGVILFHETLYQKADDGTPFAEILKKKGIILGIKVDKGVVPLFGSEDEVTTQGLDDLAARCAQYKKDGCDFAKWRCVLKIGKNTPSYQSILENANVLARYASICQSQRIVPIVEPEVLPDGDHDLDRAQKVTETVLAAVYKALSDHHVYLEGTLLKPNMVTAGQSAKKNTPEEIALATVQALRRTVPAAVTGVTFLSGGQSEEEATVNLSAINNVPLIRPWALTFSYGRALQASVLRAWAGKKENIAAGQNELLKRAKANGDAAQGKYVAGSAGAGSGSLFVANHAY
├── [94morganism[0m
│   └── [4mOrganism[0m
│       ├── [94mid[0m = taxon:7227
│       ├── [94mname[0m = Drosophila melanogaster
│       └── [94mtaxonomy_id[0m = 7227
├── [94mregions[0m
│   └── 0
│       └── [4mRegion[0m
│           ├── [94mid[0m = region32
│           ├── [94mstart[0m = 14
│           ├── [94mend[0m = 361
│           ├── [94mnote[0m = Fructose-bisphosphate aldolase class-I; pfam00274
│           ├── [94mname[0m = Glycolytic
│           └── [94mcross_reference[0m = CDD

In [43]:
# Look up the nucleotide sequence that corresponds to the protein,
# then read it back from the `coding_sequence` attribute and show it.
aldolase.get_nucleotide_seq()
aldolase_cds = aldolase.coding_sequence
print(aldolase_cds)

[4mNucleotideSequence[0m
├── [94mid[0m = NM_001300612.1
├── [94mregions[0m
│   └── 0
│       └── [4mRegion[0m
│           ├── [94mid[0m = NM_001300612.1
│           ├── [94mstart[0m = 278
│           └── [94mend[0m = 1363
├── [94mmolecule_type[0m = mRNA
├── [94mgene_id[0m = Ald1
└── [94msequence[0m = ATGACGACCTACTTCAACTACCCCAGCAAGGAGCTGCAGGATGAGCTGCGCGAAATCGCCCAGAAAATCGTTGCCCCCGGCAAGGGAATCCTCGCCGCCGATGAGTCCGGCCCAACCATGGGCAAGCGTCTGCAGGACATCGGCGTGGAGAACACCGAGGACAACCGCCGTGCCTACCGTCAGCTGTTGTTCAGCACTGACCCCAAGCTGGCCGAGAACATCTCTGGAGTGATCCTGTTCCACGAGACCCTCTACCAGAAGGCCGATGATGGCACCCCCTTCGCCGAGATCCTGAAGAAGAAGGGAATCATTCTGGGCATCAAGGTCGACAAGGGTGTTGTCCCACTGTTCGGCTCTGAGGATGAGGTCACCACCCAGGGTCTGGATGACCTGGCCGCCCGTTGCGCCCAGTACAAGAAGGACGGTTGCGACTTCGCCAAGTGGCGTTGCGTCCTGAAGATCGGCAAGAACACCCCATCCTACCAGTCGATCCTGGAGAACGCCAATGTCCTGGCCCGCTACGCCTCCATCTGCCAGTCGCAGCGCATCGTCCCAATTGTGGAGCCCGAGGTTCTGCCCGATGGCGATCACGATCTGGACCGCGCCCAGAAGGTCACCGAGACCGTCCTGGCCGCCGTCTACAAGGCCCTGAGCGACCACCACGTCTACCTGGAGGGT

## BLAST search

In [44]:
# BLAST search with `aldolase` as the query, limited to this many hits
max_hits = 10
blast_results = aldolase.pblast(n_hits=max_hits)

Running blast search for aldolase 1, isoform M from Drosophila melanogaster...


Fetching protein sequences: 100%|██████████| 10/10 [00:19<00:00,  1.90s/it]


In [45]:
# Fetch the nucleotide sequences for all BLAST hits.
# NOTE(review): the return value is not captured, so this presumably fills in
# `coding_sequence` on the entries of `blast_results` in place — confirm.
get_nucleotide_sequences(blast_results)

Fetching nucleotide sequences: 100%|██████████| 10/10 [00:09<00:00,  1.10it/s]


## Storing `ProteinSequence`s in a database



In [46]:
from sdrdm_database import DBConnector, create_tables

### Setting up a local MySQL database

First, a local MySQL database needs to be set up. To do so, we run a Docker container with a MySQL database.

>[!NOTE]
>
>If Docker is not installed on your system, please follow the instructions on the [docker website](https://docs.docker.com/get-docker/).


In case this notebook is run on a macOS system with an M1 chip, the following command needs to be run in the terminal first:

>```bash
>export DOCKER_DEFAULT_PLATFORM=linux/amd64
>```

Next, navigate to the directory where this notebook is located and run the following command to start the docker container:

>```bash
>docker compose up -d
>```

### Connect to the database

In [47]:
# Establish a connection to the database.
# The credentials are read from the environment variables PYEED_DB_USER and
# PYEED_DB_PASSWORD so they do not have to be edited into the notebook; when
# the variables are unset, the previous defaults ("root" / "root") are used.
# NOTE(review): the defaults are assumed to match the local docker compose
# setup described above — confirm against the compose file.
db = DBConnector(
    username=os.environ.get("PYEED_DB_USER", "root"),
    password=os.environ.get("PYEED_DB_PASSWORD", "root"),
    host="localhost",
    db_name="db",
    port=3306,
    dbtype="mysql",
)

### Create tables for `NucleotideSequence`

Currently, the database functionalities are under construction. Therefore, a simplified example is shown here. Instead of dumping all information of a `ProteinSequence` into the database, only the `NucleotideSequence` object is dumped into the database.

In [49]:
from pyEED.core.nucleotidesequence import NucleotideSequence

# Create the database tables for the `NucleotideSequence` model
# using the connection established above.
create_tables(db_connector=db, model=NucleotideSequence)

In [50]:
# Get an overview of the created tables
db.connection.tables

Tables
------
- NucleotideSequence
- regions

### Populate the database with `NucleotideSequence`s

In [51]:
# Collect the coding sequence of every BLAST hit; hits whose
# `coding_sequence` is None are left out.
coding_sequences = []
for hit in blast_results:
    if hit.coding_sequence is None:
        continue
    coding_sequences.append(hit.coding_sequence)

In [52]:
# Insert all coding sequences into the database.
# The list is unpacked so that each sequence is passed as its own positional
# argument.
# NOTE(review): `verbose=True` presumably produces the per-dataset
# "Added dataset ..." lines — confirm.
db.insert(*coding_sequences, verbose=True)

Added dataset NucleotideSequence (24801bc5-62fd-432d-b5dd-c314d563be05)
Added dataset NucleotideSequence (12cee352-ee0f-468d-9666-8a34bda7d60b)
Added dataset NucleotideSequence (57c4f81d-37e4-48f7-b897-36a0b37f2a37)
Added dataset NucleotideSequence (5a8dd3e6-03ea-43b0-b61c-27dd4bcf2e49)
Added dataset NucleotideSequence (2b8a2f46-b390-4f1e-b8a7-a13ad7c0fc1a)
Added dataset NucleotideSequence (a061941e-0b82-466e-bb6c-ebb753b0b7ae)
Added dataset NucleotideSequence (13f8502f-e7f8-4893-8a22-e7cbafe9f5b5)
Added dataset NucleotideSequence (f913ab86-adda-49fb-ae8f-f0f1423757d7)
Added dataset NucleotideSequence (7c052c82-6b62-40f1-ab89-7b34f93683dd)


### Look at entries in the database

In [53]:
# Look up the "NucleotideSequence" table; as the last expression of the cell,
# the returned object is what the notebook displays.
db.connection.table("NucleotideSequence")

In [54]:
# Look up the "regions" table, the second table listed by
# `db.connection.tables`.
db.connection.table("regions")