In [17]:
%reload_ext autoreload
%autoreload 2
import getpass
import json
import logging
import os

import pandas as pd

from pyeed import Pyeed
from pyeed.model import GOAnnotation, Protein

In [18]:
# Root logging setup for the notebook: INFO and above, each record rendered
# as "<timestamp> - <level> - <message>".
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Logger used by the cells of this notebook.
LOGGER = logging.getLogger(__name__)

In [19]:
LOGGER.info("Setting up test")

# Connection settings are read from the environment so that no credential is
# stored in the notebook. URI and user fall back to the local defaults; the
# password is prompted for when NEO4J_PASSWORD is unset or empty.
uri = os.environ.get("NEO4J_URI", "bolt://127.0.0.1:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

# Create a Pyeed object, automatically connecting to the database
eedb = Pyeed(uri, user, password)

# WARNING: destructive. For testing purposes, we wipe the database at `uri`
# and remove all constraints.
# NOTE: the constraint helpers print the full bolt URL, password included
# (see the "Connecting to bolt://..." lines in the cell output); clear the
# outputs before sharing this notebook.
eedb.db._wipe_database()
eedb.db._remove_db_constraints(user, password)

# DB connector is an attribute of the Pyeed object, type `DatabaseConnector`
LOGGER.info(f"Database stats: {eedb.db.stats()}")

# The first time the pyeed database is initialized, we need to create the
# constraints which are defined in the pyeed graph model
eedb.db._initialize_db_constraints(user=user, password=password)

2024-10-19 16:32:46,616 - INFO - Setting up test


📡 Connected to database.
All data has been wiped from the database.
Connecting to bolt://neo4j:12345678900@127.0.0.1:7687
Dropping constraints...
 - Dropping unique constraint and index on label DNA with property accession_id.
 - Dropping unique constraint and index on label GOAnnotation with property go_id.
 - Dropping unique constraint and index on label Organism with property taxonomy_id.
 - Dropping unique constraint and index on label Protein with property accession_id.
 - Dropping unique constraint and index on label Region with property region_id.
 - Dropping unique constraint and index on label Site with property site_id.

Dropping indexes...
 - Dropping index on labels DNA with properties embedding.
 - Dropping index on labels Protein with properties embedding.

All constraints and indexes have been removed from the database.


2024-10-19 16:32:47,625 - INFO - Database stats: {'nodes': 0, 'relationships': 0}


Loaded /home/niklas/Desktop/Job_Niklas/pyeed/src/pyeed/model.py
Connecting to bolt://neo4j:12345678900@127.0.0.1:7687
Setting up indexes and constraints...

Found model.StrictStructuredNode
 ! Skipping class model.StrictStructuredNode is abstract
Found model.Organism
 + Creating node unique constraint for taxonomy_id on label Organism for class model.Organism
Found model.Site
 + Creating node unique constraint for site_id on label Site for class model.Site
Found model.Region
 + Creating node unique constraint for region_id on label Region for class model.Region
Found model.GOAnnotation
 + Creating node unique constraint for go_id on label GOAnnotation for class model.GOAnnotation
Found model.Protein
 + Creating node unique constraint for accession_id on label Protein for class model.Protein
 + Creating vector index for embedding on label Protein for class model.Protein
Found model.DNA
 + Creating node unique constraint for accession_id on label DNA for class model.DNA
 + Creating vecto

In [20]:
# ok we are ready to go
LOGGER.info("Setup complete")

# Read the TEM_Ids.json file from this directory. JSON is UTF-8 by
# specification, so decode it explicitly instead of relying on the
# platform's locale encoding.
with open("TEM_Ids.json", "r", encoding="utf-8") as f:
    dict_id_name = json.load(f)

# Now fetch all of these proteins from NCBI into the database.
# NOTE(review): the name suggests an id -> name mapping; presumably the
# fetcher iterates it and therefore uses the keys as accession ids — confirm
# against fetch_from_primary_db.
eedb.fetch_from_primary_db(dict_id_name, db='NCBI')

2024-10-19 16:32:48,394 - INFO - Setup complete
[32m2024-10-19 16:32:48.444[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36mmake_request[0m:[36m143[0m - [34m[1mSending 21 requests in batches of 10[0m
[32m2024-10-19 16:32:48.445[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36msend_request[0m:[36m124[0m - [34m[1mSending request to https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein with parameters: {'retmode': 'text', 'rettype': 'genbank', 'id': 'AAP20891,CAJ85677,SAQ02853,CDR98216,WP_109963600,CAA41038,WP_109874025,CAA46344,APG33178,AKC98298'}[0m
[32m2024-10-19 16:32:48.946[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36msend_request[0m:[36m124[0m - [34m[1mSending request to https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein with parameters: {'retmode': 'text', 'rettype': 'genbank', 'id': 'KJO56189,KLP91446,CAA46346,CAA74912,AFN21551,ACB22021,CAA7

In [21]:
# ok now lets test the limits size wise
# NOTE(review): NCBI BLAST hit-table CSV exports usually have no header row.
# If this file has none, read_csv consumes the first hit as the header and
# that hit is never fetched — confirm, and pass header=None if so.
df = pd.read_csv('HitTable_AAL29438_BlastP.csv')
# the second column holds the ids of the hits
ids = df.iloc[:, 1].tolist()

# Now fetch all of these proteins from NCBI, CHUNK_SIZE ids per call.
CHUNK_SIZE = 50
for i in range(0, len(ids), CHUNK_SIZE):
    chunk = ids[i:i + CHUNK_SIZE]
    # Report the real upper bound: the last chunk may be shorter than CHUNK_SIZE.
    print(f"Fetching {i} to {i + len(chunk)}")
    eedb.fetch_from_primary_db(chunk, db='NCBI')

[32m2024-10-19 16:33:18.446[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36mmake_request[0m:[36m143[0m - [34m[1mSending 5 requests in batches of 10[0m
[32m2024-10-19 16:33:18.447[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36msend_request[0m:[36m124[0m - [34m[1mSending request to https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein with parameters: {'retmode': 'text', 'rettype': 'genbank', 'id': 'HBQ2613975.1,EKW4005960.1,EJG7116187.1,AMM70781.1,HCO3480053.1,HAI5030310.1,AII99784.1,WP_000027057.1,WP_215748091.1,WP_261627585.1'}[0m


Fetching 0 to 50


[32m2024-10-19 16:33:18.948[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36msend_request[0m:[36m124[0m - [34m[1mSending request to https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein with parameters: {'retmode': 'text', 'rettype': 'genbank', 'id': 'HDN1137928.1,ARF47333.1,WP_161654968.1,EHC9934517.1,ANG09566.1,ANG13130.1,WP_240078874.1,ELK1047634.1,ANG27598.1,HBC1239896.1'}[0m
2024-10-19 16:33:19,359 - INFO - HTTP Request: GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&retmode=text&rettype=genbank&id=HBQ2613975.1%2CEKW4005960.1%2CEJG7116187.1%2CAMM70781.1%2CHCO3480053.1%2CHAI5030310.1%2CAII99784.1%2CWP_000027057.1%2CWP_215748091.1%2CWP_261627585.1 "HTTP/1.1 200 OK"
[32m2024-10-19 16:33:19.449[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36msend_request[0m:[36m124[0m - [34m[1mSending request to https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein with paramet

Fetching 50 to 100


[32m2024-10-19 16:35:02.152[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36msend_request[0m:[36m124[0m - [34m[1mSending request to https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein with parameters: {'retmode': 'text', 'rettype': 'genbank', 'id': 'EAY6970089.1,ANG23579.1,ANG10564.1,WP_198214710.1,HCP1710245.1,EDC4633317.1,ANG23305.1,ANG19419.1,ANG14490.1,WP_063864904.1'}[0m
[32m2024-10-19 16:35:02.654[0m | [34m[1mDEBUG   [0m | [36mpyeed.adapter.primary_db_adapter[0m:[36msend_request[0m:[36m124[0m - [34m[1mSending request to https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein with parameters: {'retmode': 'text', 'rettype': 'genbank', 'id': 'HCP3616637.1,WP_063864897.1,ANG22525.1,ANG20310.1,ANG09695.1,ANG09509.1,ANG24407.1,EDN1638614.1,MDV1392406.1,WP_153933068.1'}[0m
2024-10-19 16:35:02,733 - INFO - HTTP Request: GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&retmode=text&retty

KeyboardInterrupt: 

In [None]:
# Fetch coding sequences through the Pyeed object; in the recorded run below
# the log reports 246 sequences being requested.
# NOTE(review): presumably these are the coding sequences linked to the
# proteins loaded in the cells above — confirm against pyeed.main.
eedb.fetchRemoteCodingSequences()

[32m2024-10-19 16:40:12.407[0m | [1mINFO    [0m | [36mpyeed.main[0m:[36mfetchRemoteCodingSequences[0m:[36m234[0m - [1mFetching 246 coding sequences.[0m
[32m2024-10-19 16:40:12.409[0m | [1mINFO    [0m | [36mpyeed.main[0m:[36mfetchRemoteCodingSequences[0m:[36m235[0m - [1mFetching coding sequences: ['AY263331.1', 'BN000925.1', 'DQ679961.1', 'FKZZ01000044.1', 'LK391770.1', 'CP014489.1', 'X57972.1', 'KY432403.1', 'X65252.1', 'DQ909059.1', 'KY271103.1', 'JF949915.1', 'KP870110.1', 'EF534736.1', 'LAAD01000027.1', 'LEDF01000033.1', 'X65254.1', 'EF136376.1', 'Y14574.2', 'MG821356.1', 'JX042489.1', 'EF136377.1', 'EU527189.1', 'KT395278.1', 'Y17582.1', 'EF468463.1', 'Y17583.1', 'FO203354.1', 'X98047.1', 'LDCJ01000052.1', 'MG821377.1', 'EU815939.1', 'U37195.1', 'Y17584.1', 'EU274580.1', 'AJ437107.1', 'KY739237.1', 'AAZBZP010000084.1', 'CXLQ01000061.1', 'MG821378.1', 'FJ197316.1', 'GU371926.1', 'KC844056.1', 'FJ360884.1', 'KP860986.1', 'KY305958.1', 'MH079593.1', 'FJ919776.1'