In [2]:
import os
import pandas as pd

from tqdm.notebook import tqdm

from pyphylon.downloads import download_genomes_bvbrc
from pyphylon.util import remove_empty_files, load_config

In [3]:
# Load the workflow configuration and pull out the settings this notebook uses.
CONFIG = load_config("config.yml")
WORKDIR, SPECIES = CONFIG["WORKDIR"], CONFIG["PG_NAME"]

# Strip the '/examples/' segment from the configured paths to accommodate running this in Docker.
GENOMES_FILE, METADATA_FILE = (
    CONFIG[key].replace("/examples/", "") for key in ("GENOMES_FILE", "METADATA_FILE")
)

In [4]:
# Make output directories for the new genomes.
UPDATE_PANGENOME = os.path.join(WORKDIR, "update_pangenome")
RAW_GENOMES = os.path.join(UPDATE_PANGENOME, "genomes")

# os.makedirs creates any missing parent directories, so this one call also
# creates UPDATE_PANGENOME. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(RAW_GENOMES, exist_ok=True)

In [6]:
# New genomes to download. Make sure these sequences have passed quality control
# before you add them to your pangenome.
genomes = ['1314.1009', '1314.994', '1314.2849', '1314.2850', '1314.2848']

# Both tables are tab-separated and indexed by their first column; genome_id is
# requested as str so that IDs such as '1314.2850' are not parsed as numbers.
# NOTE(review): presumably the first (index) column is genome_id — confirm.
summary = pd.read_csv(GENOMES_FILE, index_col=0, dtype={'genome_id': str}, sep='\t')
metadata = pd.read_csv(METADATA_FILE, index_col=0, dtype={'genome_id': str}, sep='\t')

# Write the metadata rows of the new genomes into the update_pangenome directory.
# .loc raises a KeyError if any requested genome is missing from the metadata index.
metadata.loc[genomes].to_csv(os.path.join(UPDATE_PANGENOME, 'new_genomes_metadata.csv'))

In [12]:
# Download the 'fna' file of every new genome into RAW_GENOMES.
# NOTE(review): the return value is presumably the list of genomes that failed to download — confirm.
bad_genomes = download_genomes_bvbrc(genomes=genomes, output_dir=RAW_GENOMES, filetypes=['fna'])

Processing filetypes...:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading selected files...:   0%|          | 0/5 [00:00<?, ?it/s]

2025-02-28 14:37:34,050 - INFO - ftp://ftp.bvbrc.org/genomes/1314.1009/1314.1009.fna -> data/update_pangenome/genomes/fna/1314.1009.fna
2025-02-28 14:37:45,876 - INFO - ftp://ftp.bvbrc.org/genomes/1314.994/1314.994.fna -> data/update_pangenome/genomes/fna/1314.994.fna
2025-02-28 14:37:57,754 - INFO - ftp://ftp.bvbrc.org/genomes/1314.2849/1314.2849.fna -> data/update_pangenome/genomes/fna/1314.2849.fna
2025-02-28 14:38:09,206 - INFO - ftp://ftp.bvbrc.org/genomes/1314.2850/1314.2850.fna -> data/update_pangenome/genomes/fna/1314.2850.fna
2025-02-28 14:38:22,921 - INFO - ftp://ftp.bvbrc.org/genomes/1314.2848/1314.2848.fna -> data/update_pangenome/genomes/fna/1314.2848.fna


Removing bad genome files...: 0it [00:00, ?it/s]

In [13]:
# Run remove_empty_files on every entry of RAW_GENOMES and collect
# everything it returns into a single list.
empty_files = []
for entry in tqdm(os.listdir(RAW_GENOMES)):
    empty_files += remove_empty_files(os.path.join(RAW_GENOMES, entry))

  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
# Report the number of entries in bad_genomes and in empty_files.
print(
    f"bad genomes: {len(bad_genomes)}\n"
    f"empty genomes: {len(empty_files)}"
)

bad genomes: 0
empty genomes: 0


# Extract the representative sequences for each of your pangenome gene clusters (if you haven't already)

In [18]:
# Extract the representative alleles for each gene cluster.
# Import only the helper that is used (instead of `import *`) so it is clear
# where the name comes from.
from pyphylon.blast_utils import extract_reference_sequences

# Build the paths with os.path.join so they are correct whether or not WORKDIR
# ends with a path separator; plain string concatenation would produce
# e.g. 'dataprocessed/cd-hit-results' for WORKDIR = 'data'.
CD_HIT_RESULTS = os.path.join(WORKDIR, 'processed', 'cd-hit-results')
extract_reference_sequences(
    CD_HIT_RESULTS,
    SPECIES,
    os.path.join(CD_HIT_RESULTS, SPECIES + '_representative_sequences'),
)

# Run scripts to update your pangenome
Run the "add_strains_to_pangenome_workflow" to annotate the strains with bakta and mlst, and then add them to the pangenome with CD-HIT-2D