# 1a. Filter for high-quality genomes to download

In this notebook, we will use __`pyphylon`__'s `downloads` and `qcqa` modules to select candidate genomes to download for pangenome generation.

In this example, we will select genomes for download from [BV-BRC](https://www.bv-brc.org/).

## Setup

In [None]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt

from pyphylon.downloads import get_scaffold_n50_for_species
from pyphylon.util import load_config
import pyphylon.qcqa as qcqa
import os

In [None]:
# Notebook-wide plotting defaults: 200 dpi figures plus seaborn's
# "deep" palette, "paper" context and "whitegrid" style.
plt.rcParams.update({"figure.dpi": 200})
sns.set_palette("deep")
sns.set_context("paper")
sns.set_style("whitegrid")


In [None]:
# Read the workflow configuration; every setting used below comes from it.
CONFIG = load_config("config.yml")

WORKDIR, SPECIES_NAME = CONFIG["WORKDIR"], CONFIG["SPECIES_NAME"]

# The '/examples/' fragment is removed from both input paths
# to accommodate running this notebook in Docker.
GENOMES_FILE, METADATA_FILE = (
    CONFIG[path_key].replace('/examples/', '')
    for path_key in ("GENOMES_FILE", "METADATA_FILE")
)

DEBUG = CONFIG["DEBUG"]

In [None]:
# The genome summary and metadata tables are parsed with the same options:
# tab-separated, first column as the index, genome_id kept as a string.
tsv_read_options = dict(sep='\t', index_col=0, dtype={'genome_id': str})

summary = pd.read_csv(GENOMES_FILE, **tsv_read_options)
metadata = pd.read_csv(METADATA_FILE, **tsv_read_options)

# Size of the full (unfiltered) genome summary table
summary.shape

In [None]:
# Number of genome records whose genome_name contains SPECIES_NAME
# (str.contains interprets the pattern as a regular expression by default).
name_matches_species = summary["genome_name"].str.contains(SPECIES_NAME)
name_matches_species.sum()

# Filter metadata for species of interest

In [None]:
# Restrict both tables to the species/genus of interest and show how many
# strains are available, together with a preview of the summary table.
species_summary, metadata_summary = (
    qcqa.filter_by_species(table, SPECIES_NAME)
    for table in (summary, metadata)
)

display(species_summary.shape, species_summary.head())


## Plot unfiltered dataset

In [None]:
# Scaffold N50 score of the reference genome for the organism of interest.
# It can be looked up on the NCBI website, or retrieved with the call below
# (~20 seconds). The lookup uses the most frequent taxon_id among the
# species' genomes.
most_common_taxon_id = species_summary.taxon_id.mode().values[0]
scaffold_n50 = get_scaffold_n50_for_species(most_common_taxon_id)
scaffold_n50

In [None]:
# Initial unfiltered strain plot: genome length against the BV-BRC predicted
# gene count, coloured by genome status.
unfiltered_plot_options = dict(
    x="genome_length",
    y="patric_cds",
    hue="genome_status",
    alpha=0.75,
    height=4,
)
h = sns.jointplot(data=species_summary, **unfiltered_plot_options)

h.ax_joint.legend(title='BV-BRC\nstrain type')
h.ax_joint.set(
    xlabel="genome length",
    ylabel="BV-BRC predicted gene count",
)
plt.show()

In [None]:
# Derive the minimum N50 threshold for Complete genomes: take the reference
# strain's scaffold N50 (from NCBI Genome) and multiply by 0.85.
# If your species/genus has multiple reference strains, pick the smallest by genome length.
N50_THRESHOLD_FRACTION = 0.85
species_ref_n50 = scaffold_n50
min_thresh_n50 = int(N50_THRESHOLD_FRACTION * species_ref_n50)

# The N50 threshold only applies to Complete sequences
species_complete_summary = species_summary[species_summary.genome_status == 'Complete']

# Most (if not all) Complete sequences are expected to pass this threshold
fig, ax = plt.subplots()
sns.histplot(species_complete_summary.contig_n50.dropna().astype('int'), ax=ax)

# Draw the cutoff on the same Axes the histogram uses (instead of relying on
# pyplot's "current axes") and label it so the figure stands on its own.
ax.axvline(
    x=min_thresh_n50,
    color='#ff00ff',
    linestyle='--',
    label=f"{N50_THRESHOLD_FRACTION} x reference scaffold N50 ({min_thresh_n50:,})",
)
ax.legend()
plt.show()

## Initial Filtration Report

In [None]:
# Complete sequences are screened on their N50 and L50 scores.
# Other WGS sequences are screened on their contig count and CheckM
# contamination & completeness metrics.
# NOTE(review): the None cutoffs presumably fall back to pyphylon's defaults — confirm.
quality_filter_options = dict(
    min_thresh_n50=min_thresh_n50,
    max_contig=None,
    contamination_cutoff=None,
    completeness_cutoff=None,
    return_stats=True,
)
filtered_species_summary, df_filtration = qcqa.filter_by_genome_quality(
    species_summary, **quality_filter_options
)

# Show how many strains remain, followed by the filtration report
display(
    'Filtered Strains:',
    filtered_species_summary.shape,
    '------------------------------',
    'Filtration Report',
    df_filtration,
)

In [None]:
# Repeat the genome-length vs. gene-count plot, now with only the
# (first-pass) filtered strains.
# Check that your WGS sequences form a nice line; Complete sequences may be
# all over the place.
# For this example, no further filtering is needed based on the
# distribution shown below.
filtered_plot_options = dict(
    x="genome_length",
    y="patric_cds",
    hue="genome_status",
    alpha=0.75,
    height=4,
)
h = sns.jointplot(data=filtered_species_summary, **filtered_plot_options)

h.ax_joint.legend(title='BV-BRC\nstrain type')
h.ax_joint.set(
    xlabel="genome length",
    ylabel="BV-BRC predicted gene count",
)
plt.show()

In [None]:
# Sanity-check the GC content against the number of contigs and
# remove any big outliers (in this case nothing needs filtering).
gc_plot_options = dict(
    x="gc_content",
    y="contigs",
    hue="genome_status",
    alpha=0.75,
    height=4,
)
h = sns.jointplot(data=filtered_species_summary, **gc_plot_options)

# The legend is anchored outside the joint axes, up and to the right
h.ax_joint.legend(title='BV-BRC\nstrain type', bbox_to_anchor=(1.45, 1.4))
h.ax_joint.set(xlabel="GC Content", ylabel="number of contigs")
plt.show()

## Save (first-pass) filtered genome info files for download

In [None]:
# In DEBUG mode, keep only the first DEBUG_MAX_GENOMES filtered genomes
# (presumably to keep debug runs small and fast).
DEBUG_MAX_GENOMES = 50

if DEBUG:
    # .iloc makes the positional "first N rows" intent explicit,
    # independent of the index dtype.
    filtered_species_summary = filtered_species_summary.iloc[:DEBUG_MAX_GENOMES]

In [None]:
# Select the metadata rows for the genomes that passed filtering,
# looked up by the index labels of the filtered summary table.
kept_genome_ids = filtered_species_summary.index
filtered_species_metadata = metadata.loc[kept_genome_ids]
filtered_species_metadata

In [None]:
# Write the filtered genome summary and metadata tables to WORKDIR/interim.
newpath = os.path.join(WORKDIR, 'interim')

# exist_ok=True creates the directory only when it is missing, without a
# separate (race-prone) existence check.
os.makedirs(newpath, exist_ok=True)

filtered_species_summary.to_csv(os.path.join(newpath, 'genome_summary_1a.csv'))
filtered_species_metadata.to_csv(os.path.join(newpath, 'genome_metadata_1a.csv'))

In [None]:
# Save the filtration report under WORKDIR/interim as well.
filtration_report_path = os.path.join(WORKDIR, 'interim/df_filtration_1a.csv')
df_filtration.to_csv(filtration_report_path)