In [1]:
import importlib
import os

import pyphylon
from pyphylon.blast_utils import *
from pyphylon.util import load_config

In [2]:
# Load the workflow configuration shared by every cell below.
CONFIG = load_config("config.yml")

# Later cells build paths by plain string concatenation (WORKDIR + 'processed/...'),
# which silently yields a wrong path when WORKDIR has no trailing separator.
# Joining with an empty component appends the separator only when it is missing,
# so on POSIX both "data" and "data/" become "data/" (an empty WORKDIR stays empty).
WORKDIR = os.path.join(CONFIG["WORKDIR"], "")

# Pangenome name; used below as the file-name prefix of the CD-HIT results.
SPECIES = CONFIG["PG_NAME"]

# Extract representative alleles
If you wish to compare the representative alleles for each gene cluster to external sequences and annotations, you can extract them to a single file using this command.

In [4]:
# Extract the representative allele of every gene cluster into one output file.
# The output lives next to the CD-HIT results and is named after the pangenome.
cdhit_results_dir = WORKDIR + 'processed/cd-hit-results'
representatives_path = cdhit_results_dir + '/' + SPECIES + '_representative_sequences'

extract_reference_sequences(cdhit_results_dir, SPECIES, representatives_path)

# Comparing pangenome against blast database of interest

Requires BLAST to be installed in your environment. If you are using conda, it can be installed with `conda install -c bioconda blast`.

The example below uses the core dataset protein sequences from [VFDB](https://www.mgc.ac.cn/VFs/download.htm), which can be downloaded from the linked page.

In [3]:
# The VFDB protein FASTA was downloaded outside of this notebook and placed in
# the external data directory. Any FASTA of interest can be used the same way.
vfdb_dir = WORKDIR + 'external/VFDB/'

# Build a local BLAST database named "VFDB" from the downloaded FASTA.
make_blast_db(vfdb_dir + 'VFDB_setA_pro.fas', vfdb_dir + 'VFDB')



Building a new DB, current time: 02/18/2025 17:40:54
New DB name:   /mnt/c/Users/joshb/OneDrive/Desktop/pyphylon_testing/pyphylon/examples/data/external/VFDB/VFDB
New DB title:  data/external/VFDB/VFDB_setA_pro.fas
Sequence type: Protein
Deleted existing Protein BLAST database named /mnt/c/Users/joshb/OneDrive/Desktop/pyphylon_testing/pyphylon/examples/data/external/VFDB/VFDB
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 4261 sequences in 1.14859 seconds.
Finished running, database created at data/external/VFDB/VFDB


In [5]:
# BLAST the representative alleles (query) against the local VFDB database;
# the hits are written to results.txt inside the VFDB directory.
vfdb_database = WORKDIR + 'external/VFDB/VFDB'
representatives_query = WORKDIR + 'processed/cd-hit-results/' + SPECIES + '_representative_sequences'
vfdb_hits_path = WORKDIR + 'external/VFDB/results.txt'

blast_localdb_enrichment(vfdb_database, representatives_query, vfdb_hits_path, e_val=1e-5)

Command:  blastp -query data/processed/cd-hit-results/SPyogenes_representative_sequences -out data/external/VFDB/results.txt -db data/external/VFDB/VFDB -outfmt 6 -evalue 1e-05
Completed blast


In [6]:
# Load the VFDB hits written by the previous cell.
# NOTE(review): presumably keeps hits with e-value <= 1e-5 and identity >= 80 % --
# confirm against process_blast_results.
vfdb_hits_path = WORKDIR + 'external/VFDB/results.txt'
blast_results = process_blast_results(
    vfdb_hits_path,
    e_val=1e-5,
    percent_identity=80,
)
blast_results

Unnamed: 0,query,target,identity,len,mismatch,gapopen,qstart,qend,tstart,tend,e_val,bitscore
5331,SPyogenes_C1044A0,VFG005185(gb|WP_002991968),98.251,343,6,0,1,343,1,343,0.000000e+00,694.0
1177,SPyogenes_C1058A0,VFG000961(gb|WP_010922160),83.578,341,54,1,1,341,1,339,0.000000e+00,589.0
3767,SPyogenes_C1069A0,VFG042968(gb|WP_010921811),100.000,340,0,0,1,340,1,340,0.000000e+00,682.0
4065,SPyogenes_C1077A0,VFG000961(gb|WP_010922160),99.112,338,3,0,1,338,1,338,0.000000e+00,699.0
4381,SPyogenes_C107A0,VFG005586(gb|WP_010922241),99.207,757,6,0,1,757,1,757,0.000000e+00,1569.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5154,SPyogenes_C810A2,VFG000976(gb|WP_010921831),99.499,399,2,0,2,400,173,571,0.000000e+00,820.0
5729,SPyogenes_C811A0,VFG000968(gb|WP_010922714),89.686,223,22,1,179,400,262,484,4.170000e-128,375.0
3120,SPyogenes_C825A0,VFG000973(gb|WP_010922720),99.246,398,3,0,1,398,1,398,0.000000e+00,813.0
4656,SPyogenes_C84A15,VFG005586(gb|WP_010922241),98.634,805,11,0,1,805,1,805,0.000000e+00,1659.0


# Make blast database from our pangenome

In [24]:
# CD-HIT output file named after the pangenome: the sequences to index.
INPUT_FILE = f"{WORKDIR}processed/cd-hit-results/{SPECIES}"
# Destination (path prefix) of the pangenome BLAST database.
DATABASE = f"{WORKDIR}external/PangenomeDB/PangenomeDB"

make_blast_db(INPUT_FILE, DATABASE)



Building a new DB, current time: 02/18/2025 17:54:19
New DB name:   /mnt/c/Users/joshb/OneDrive/Desktop/pyphylon_testing/pyphylon/examples/data/external/PangenomeDB/PangenomeDB
New DB title:  data/processed/cd-hit-results/SPyogenes
Sequence type: Protein
Deleted existing Protein BLAST database named /mnt/c/Users/joshb/OneDrive/Desktop/pyphylon_testing/pyphylon/examples/data/external/PangenomeDB/PangenomeDB
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 18581 sequences in 4.44357 seconds.
Finished running, database created at data/external/PangenomeDB/PangenomeDB


Create a query file containing the sequence you want to BLAST against the pangenome database. The example sequence is from https://www.uniprot.org/uniprotkb/A0A4Y6ER29/entry#sequences


In [33]:
# Query and result files sit alongside the pangenome database.
pangenome_db_dir = WORKDIR + 'external/PangenomeDB/'
QUERY_FILE = pangenome_db_dir + 'query.txt'
OUTPUT_FILE = pangenome_db_dir + 'results.txt'

# BLAST the query sequence against the pangenome database built above.
blast_localdb_enrichment(DATABASE, QUERY_FILE, OUTPUT_FILE, e_val=1e-5)

Command:  blastp -query data/external/PangenomeDB/query.txt -out data/external/PangenomeDB/results.txt -db data/external/PangenomeDB/PangenomeDB -outfmt 6 -evalue 1e-05
Completed blast


In [34]:
# Load the hits of the query sequence against the pangenome database.
# NOTE(review): percent_identity=0 presumably disables the identity filter and
# unique=False presumably keeps every hit rather than one per query -- confirm
# against process_blast_results.
blast_results = process_blast_results(
    OUTPUT_FILE,
    e_val=1e-5,
    percent_identity=0,
    unique=False,
)
blast_results

Unnamed: 0,query,target,identity,len,mismatch,gapopen,qstart,qend,tstart,tend,e_val,bitscore
0,tr|A0A4Y6ER29|A0A4Y6ER29_KLEAE,SPyogenes_C582A41,45.956,272,142,3,1,267,149,420,1.18e-79,246
1,tr|A0A4Y6ER29|A0A4Y6ER29_KLEAE,SPyogenes_C582A0,45.956,272,142,3,1,267,149,420,1.2399999999999999e-79,246
2,tr|A0A4Y6ER29|A0A4Y6ER29_KLEAE,SPyogenes_C582A42,45.221,272,144,3,1,267,149,420,8.48e-78,241
