# 1b. Download (first-pass) filtered genomes

In this notebook, we will use __`pyphylon`__'s `downloads` module to download candidate genomes for pangenome generation.

In this example, we will select genomes for download from [BV-BRC](https://www.bv-brc.org/).

## Setup

In [11]:
import os
import pandas as pd

from tqdm.notebook import tqdm

from pyphylon.downloads import download_genomes_bvbrc
from pyphylon.util import remove_empty_files

In [2]:
# Make output directories
# RAW holds raw inputs; RAW_GENOMES is where the downloaded genome files go.
RAW = 'data/raw'
RAW_GENOMES = 'data/raw/genomes'

# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create gap of a separate os.path.exists() test. Missing parent
# directories are created as well.
for directory in (RAW, RAW_GENOMES):
    os.makedirs(directory, exist_ok=True)

In [3]:
# Load the genome summary table pickled under data/interim and display it.
summary_pickle_path = 'data/interim/genome_summary_1a.pickle'
filtered_species_summary = pd.read_pickle(summary_pickle_path)
filtered_species_summary

Unnamed: 0,genome_id,genome_name,taxon_id,genome_status,genome_length,gc_content,contig_l50,contig_n50,chromosomes,plasmids,contigs,patric_cds,refseq_cds,trna,rrnacoarse_consistency,fine_consistency,checkm_completeness,checkm_contamination,genome_qualitydate_created,date_modified
637182,1314.4070,Streptococcus pyogenes 1004,1314,Complete,1842099,38.301960,1,1,1,0,1,1798,1706,67,,99.9,100.0,0.0,,2023-04-21T00:08:53.590Z
637183,1314.4068,Streptococcus pyogenes 1039,1314,Complete,1856557,38.305910,1,1,1,0,1,1827,1734,67,,99.8,100.0,0.0,,2023-04-21T00:00:24.086Z
637184,1314.4067,Streptococcus pyogenes 1042,1314,Complete,1853582,38.259544,1,1,1,0,1,1809,1719,67,,99.9,100.0,0.0,,2023-04-21T00:00:22.678Z
637185,1314.4071,Streptococcus pyogenes 1044,1314,Complete,2008188,38.502570,1,1,1,0,1,2090,1980,67,,99.6,100.0,1.9,,2023-04-21T03:00:45.142Z
637186,1314.3939,Streptococcus pyogenes 1095,1314,Complete,1854315,38.254448,1,1,1,0,1,1813,1718,67,,99.9,100.0,0.0,,2023-02-15T03:12:05.857Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639438,1314.1009,Streptococcus pyogenes strain emmNA,1314,Complete,1788166,38.554363,1,1,1,,1,1834,1748,67,,99.7,,,,2019-07-16T00:21:15.205Z
639439,1314.994,Streptococcus pyogenes strain emmSTG866.1,1314,Complete,1816007,38.416317,1,1,1,,1,1816,1736,67,,99.8,,,,2019-07-16T00:16:56.270Z
639455,1314.2849,Streptococcus pyogenes strain iGAS376,1314,Complete,1897124,38.601380,1,1,1,0,1,1936,1824,66,,99.6,,,,2021-02-01T04:33:04.021Z
639456,1314.2850,Streptococcus pyogenes strain iGAS391,1314,Complete,1897129,38.600906,1,1,1,0,1,1933,1824,66,,99.4,,,,2021-02-01T04:32:39.761Z


In [14]:
# Load the genome metadata table pickled under data/interim and display it.
metadata_pickle_path = 'data/interim/genome_metadata_1a.pickle'
filtered_species_metadata = pd.read_pickle(metadata_pickle_path)
filtered_species_metadata

Unnamed: 0,genome_id,genome_name,organism_name,taxon_id,genome_status,strain,serovar,biovar,pathovar,mlst,...,motility,sporulation,temperature_range,optimal_temperature,salinity,oxygen_requirement,habitat,disease,comments,additional_metadata
637182,1314.4070,Streptococcus pyogenes 1004,,1314,Complete,1004,,,,MLST.Streptococcus_pyogenes.530,...,,,,,,,,,,sample_type:Pure culture
637183,1314.4068,Streptococcus pyogenes 1039,,1314,Complete,1039,,,,,...,,,,,,,,,,sample_type:Pure culture
637184,1314.4067,Streptococcus pyogenes 1042,,1314,Complete,1042,,,,MLST.Streptococcus_pyogenes.530,...,,,,,,,,,,sample_type:Pure culture
637185,1314.4071,Streptococcus pyogenes 1044,,1314,Complete,1044,,,,MLST.Streptococcus_pyogenes.28,...,,,,,,,,,,sample_type:Pure culture
637186,1314.3939,Streptococcus pyogenes 1095,,1314,Complete,1095,,,,MLST.Streptococcus_pyogenes.530,...,,,,,,,,,,sample_type:Pure cultured organism;biomaterial...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639438,1314.1009,Streptococcus pyogenes strain emmNA,,1314,Complete,emmNA,,,,MLST.Streptococcus_pyogenes.612,...,,,,,,,,,Complete genomes of 30 globally distributed Gr...,collected_by:Davies_et_al
639439,1314.994,Streptococcus pyogenes strain emmSTG866.1,,1314,Complete,emmSTG866.1,,,,MLST.Streptococcus_pyogenes.450,...,,,,,,,,,Complete genomes of 30 globally distributed Gr...,collected_by:Davies_et_al
639455,1314.2849,Streptococcus pyogenes strain iGAS376,,1314,Complete,iGAS376,,,,MLST.Streptococcus_pyogenes.99,...,,,,,,,,,Complete genomes of three invasive isolates of...,"collected_by:Scottish Haemophilus, Legionella,..."
639456,1314.2850,Streptococcus pyogenes strain iGAS391,,1314,Complete,iGAS391,,,,MLST.Streptococcus_pyogenes.99,...,,,,,,,,,Complete genomes of three invasive isolates of...,"collected_by:Scottish Haemophilus, Legionella,..."


## Download

In [4]:
# Request the 'fna' file for every genome ID in the summary table, writing
# into RAW_GENOMES. The return value is kept as `bad_genomes`.
genome_ids = filtered_species_summary["genome_id"]
requested_filetypes = ['fna']

bad_genomes = download_genomes_bvbrc(
    genomes=genome_ids,
    output_dir=RAW_GENOMES,
    filetypes=requested_filetypes,
)

Processing filetypes...:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading selected files...:   0%|          | 0/257 [00:00<?, ?it/s]

2024-05-23 16:52:06,494 - INFO - File data/raw/genomes/fna/1314.4070.fna already exists and force is False. Skipping download.
2024-05-23 16:52:06,495 - INFO - File data/raw/genomes/fna/1314.4068.fna already exists and force is False. Skipping download.
2024-05-23 16:52:06,495 - INFO - File data/raw/genomes/fna/1314.4067.fna already exists and force is False. Skipping download.
2024-05-23 16:52:06,495 - INFO - ftp://ftp.bvbrc.org/genomes/1314.4071/1314.4071.fna -> data/raw/genomes/fna/1314.4071.fna
2024-05-23 16:52:19,509 - INFO - ftp://ftp.bvbrc.org/genomes/1314.3939/1314.3939.fna -> data/raw/genomes/fna/1314.3939.fna
2024-05-23 16:52:32,520 - INFO - ftp://ftp.bvbrc.org/genomes/1314.4069/1314.4069.fna -> data/raw/genomes/fna/1314.4069.fna
2024-05-23 16:52:44,800 - INFO - ftp://ftp.bvbrc.org/genomes/1314.134/1314.134.fna -> data/raw/genomes/fna/1314.134.fna
2024-05-23 16:52:57,195 - INFO - ftp://ftp.bvbrc.org/genomes/1314.3322/1314.3322.fna -> data/raw/genomes/fna/1314.3322.fna
2024-05

Removing bad genome files...: 0it [00:00, ?it/s]

In [12]:
# Run remove_empty_files over each entry of RAW_GENOMES and pool whatever it
# returns into one flat list.
# NOTE(review): assumes every entry in RAW_GENOMES is a directory; a stray
# file there would be passed to remove_empty_files as-is. Confirm it copes.
empty_files = []
for entry in tqdm(os.listdir(RAW_GENOMES)):
    empty_files += remove_empty_files(os.path.join(RAW_GENOMES, entry))

  0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Report how many genomes were flagged by the download step and how many
# entries the empty-file sweep returned.
n_bad = len(bad_genomes)
n_empty = len(empty_files)
print(f"bad genomes: {n_bad}")
print(f"empty genomes: {n_empty}")

bad genomes: 0
empty genomes: 0


## Update genome info files

In [15]:
# Cast genome IDs to strings before using them as set members and index labels.
filtered_species_summary['genome_id'] = filtered_species_summary.genome_id.astype('str')
all_genome_ids = set(filtered_species_summary.genome_id)

# Genomes whose file was deleted by the empty-file sweep are not usable, so
# they are excluded here along with `bad_genomes`. Previously `empty_files`
# was reported but never subtracted.
# NOTE(review): assumes remove_empty_files returns file names or paths of the
# form "<genome_id>.<ext>" — confirm against pyphylon.util.
empty_file_names = [os.path.basename(str(path)) for path in empty_files]
empty_genomes = {
    genome_id
    for genome_id in all_genome_ids
    # The trailing "." stops one ID from matching a longer ID that merely
    # shares its leading characters.
    if any(name.startswith(f"{genome_id}.") for name in empty_file_names)
}

downloaded_genomes = all_genome_ids - set(bad_genomes) - empty_genomes

# Keep one row per genome and select the downloaded genomes in sorted ID order.
filtered_species_summary = (filtered_species_summary.
                            drop_duplicates(subset=['genome_id']).
                            set_index('genome_id').
                            loc[sorted(downloaded_genomes)].
                            reset_index())


display(
    filtered_species_summary.shape,
    filtered_species_summary.head()
)

(257, 20)

Unnamed: 0,genome_id,genome_name,taxon_id,genome_status,genome_length,gc_content,contig_l50,contig_n50,chromosomes,plasmids,contigs,patric_cds,refseq_cds,trna,rrnacoarse_consistency,fine_consistency,checkm_completeness,checkm_contamination,genome_qualitydate_created,date_modified
0,1010840.4,Streptococcus pyogenes MGAS1882,1010840,Complete,1781029,38.0,1,1,1,0.0,1,1727,0,57,,99.5,100.0,0.0,,2015-03-16T03:17:09.594Z
1,1048264.3,Streptococcus pyogenes HKU QMH11M0907901,1048264,Complete,1908100,38.45,1,1,1,,1,1909,1865,67,,99.9,100.0,0.9,,2016-01-17T15:29:01.552Z
2,1150773.3,Streptococcus pyogenes JRS4,1150773,Complete,1811968,38.63,1,1,1,,1,1811,1671,67,,99.8,100.0,0.0,,2016-01-17T16:03:54.402Z
3,1150773.4,Streptococcus pyogenes JRS4,1150773,Complete,1811124,38.64,1,1,1,,1,1886,1890,66,,99.9,100.0,0.0,,2016-03-01T06:31:23.641Z
4,1207470.4,Streptococcus pyogenes M1 476,1207470,Complete,1831079,38.5,1,1,1,0.0,1,1929,1849,57,,97.9,100.0,5.0,,2015-03-16T03:17:09.594Z


In [16]:
# Cast genome IDs to strings so they can be looked up against `downloaded_genomes`.
filtered_species_metadata['genome_id'] = filtered_species_metadata['genome_id'].astype('str')

# Keep one row per genome, then select the downloaded genomes in sorted ID order.
keep_ids = sorted(downloaded_genomes)
metadata_unique = filtered_species_metadata.drop_duplicates(subset=['genome_id'])
filtered_species_metadata = (
    metadata_unique
    .set_index('genome_id')
    .loc[keep_ids]
    .reset_index()
)

# Show the resulting shape and the first rows as a quick check.
display(
    filtered_species_metadata.shape,
    filtered_species_metadata.head(),
)

(257, 66)

Unnamed: 0,genome_id,genome_name,organism_name,taxon_id,genome_status,strain,serovar,biovar,pathovar,mlst,...,motility,sporulation,temperature_range,optimal_temperature,salinity,oxygen_requirement,habitat,disease,comments,additional_metadata
0,1010840.4,Streptococcus pyogenes MGAS1882,Streptococcus pyogenes MGAS1882,1010840,Complete,MGAS1882,,,,MLST.Streptococcus_pyogenes.172,...,,,,,,,Host,,-,
1,1048264.3,Streptococcus pyogenes HKU QMH11M0907901,,1048264,Complete,HKU QMH11M0907901,,,,MLST.Streptococcus_pyogenes.36,...,,,,,,,,,Clinical use of next generation sequencing for...,
2,1150773.3,Streptococcus pyogenes JRS4,,1150773,Complete,JRS4,serovar emm6,,,MLST.Streptococcus_pyogenes.37,...,,,,C,,,,,We report the complete genome assemblies of th...,collected_by:Rockefeller University Lancefield...
3,1150773.4,Streptococcus pyogenes JRS4,,1150773,Complete,JRS4,,,,MLST.Streptococcus_pyogenes.37,...,No,,,,,,,Pharyngitis,Complete genome sequence of the highly invasiv...,
4,1207470.4,Streptococcus pyogenes M1 476,Streptococcus pyogenes M1 476,1207470,Complete,476,,,,MLST.Streptococcus_pyogenes.28,...,,,,,,,,Toxic shock syndrome,We report the completely annotated genome sequ...,


In [17]:
# Save files
# Each table goes to its own 1b pickle. The summary file was previously
# written from `filtered_species_metadata`, so the filtered summary table was
# never saved and genome_summary_1b.pickle held metadata instead.
filtered_species_summary.to_pickle('data/interim/genome_summary_1b.pickle')
filtered_species_metadata.to_pickle('data/interim/genome_metadata_1b.pickle')