In [None]:
import pickle
import os

import pandas as pd
from tqdm.notebook import tqdm

from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
# Global plotting defaults for every figure rendered in this notebook.
plt.rcParams.update({"figure.dpi": 200})  # higher-resolution inline figures
sns.set_palette(palette="deep")
sns.set_context(context="paper")
sns.set_style(style="whitegrid")
from pyphylon.util import load_config

In [None]:
# Read the project configuration once, then pull out the two settings used
# throughout the notebook: the working directory and the pangenome name.
CONFIG = load_config("config.yml")
WORKDIR, SPECIES = CONFIG["WORKDIR"], CONFIG["PG_NAME"]

In [None]:
import gzip

In [None]:
# Load the Mash-scrubbed species metadata (presumably written by an earlier
# notebook in this workflow -- verify). The first CSV column becomes the index
# and every column is kept as a generic object column (no numeric inference).
metadata_csv = os.path.join(WORKDIR, 'interim/mash_scrubbed_species_metadata_2b.csv')
mash_scrubbed_metadata = pd.read_csv(metadata_csv, dtype='object', index_col=0)

# Show the table dimensions followed by a preview of its first rows.
display(mash_scrubbed_metadata.shape, mash_scrubbed_metadata.head())

In [None]:
# List of bakta-annotated .faa files (needed as input for CD-HIT).
#
# Background:
# - bakta is a tool that annotates bacterial genomes, MAGs and plasmids.
# - MAG: metagenome-assembled genome, i.e. a genome reconstructed from genetic
#   material taken directly from environmental samples (the collective
#   material of a microbial community).
# - An advantage of bakta is that its annotations are dbxref-rich
#   (dbxref = database cross-reference).
# - sORF: small open reading frame; it has an AUG start codon and a stop codon
#   and is a short sequence with the potential to encode a small peptide.

BAKTA = os.path.join(WORKDIR, 'processed/bakta/')

# Expected layout: <BAKTA>/<folder>/<folder>.faa
# - sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
#   path list deterministic across runs and filesystems.
# - Only real, non-hidden sub-directories are considered, so stray entries such
#   as .DS_Store or .ipynb_checkpoints do not produce bogus .faa paths.
bakta_faa_paths = [
    os.path.join(BAKTA, bakta_folder, bakta_folder + '.faa')
    for bakta_folder in sorted(os.listdir(BAKTA))
    if not bakta_folder.startswith('.')
    and os.path.isdir(os.path.join(BAKTA, bakta_folder))
]

bakta_faa_paths[:5]

In [None]:
# Sanity check: every expected .faa file must actually exist on disk.
# All paths are checked (instead of stopping at the first failure) so the
# error message can say how many files are missing and name some of them.
missing_faa_paths = [
    path for path in tqdm(bakta_faa_paths) if not os.path.isfile(path)
]
assert not missing_faa_paths, (
    f"{len(missing_faa_paths)} bakta .faa file(s) not found, "
    f"e.g. {missing_faa_paths[:5]}"
)

In [None]:
# Keep only the bakta .faa files that belong to a genome in our pangenome (PG),
# i.e. whose folder name contains a genome_id listed in the scrubbed metadata.

# Materialise the id list once instead of once per path. dropna/astype(str):
# the metadata is read as object columns, so a missing genome_id would be a
# float NaN and make the substring test raise TypeError.
pg_genome_ids = mash_scrubbed_metadata['genome_id'].dropna().astype(str).tolist()

# - The id is searched in the genome folder name only, not the whole path, so
#   an id can never match by accident inside WORKDIR.
# - any() keeps each path at most once, even when several ids match it
#   (the nested-loop version appended the path once per matching id).
# NOTE(review): this is still a substring match, so id "562.1" also selects
# folder "562.12". If folder names are exactly the genome ids, switch to an
# exact set-membership test -- confirm the bakta folder naming first.
real_paths = [
    faa_path
    for faa_path in bakta_faa_paths
    if any(
        genome_id in os.path.basename(os.path.dirname(faa_path))
        for genome_id in pg_genome_ids
    )
]

# Preview instead of printing every selected path (which floods the output).
real_paths[:5]

In [None]:
# Number of .faa files selected for the pangenome. Fail here, with a clear
# message, rather than inside the CD-HIT run if nothing was selected.
assert real_paths, "no bakta .faa file matched any genome_id in the metadata"
len(real_paths)

In [None]:
from pyphylon.pangenome import build_cds_pangenome

# Where the CD-HIT clustering results are written.
cdhit_output_dir = os.path.join(WORKDIR, 'processed/cd-hit-results/')

# Options forwarded to CD-HIT (meanings as given in the CD-HIT user guide):
cdhit_options = {
    '-n': 5,     # word length
    '-c': 0.8,   # sequence identity threshold
    '-aL': 0.8,  # alignment coverage required for the longer sequence
    '-T': 0,     # threads; 0 = use all available CPUs
    '-M': 0,     # memory limit in MB; 0 = unlimited
}

# Build the CDS pangenome from the selected .faa files. Three objects come
# back; judging by their names they are an allele-level table, a gene-level
# table and a header-to-allele mapping -- confirm against the pyphylon docs.
df_alleles, df_genes, header_to_allele = build_cds_pangenome(
    genome_faa_paths=real_paths,
    output_dir=cdhit_output_dir,
    name=SPECIES,
    cdhit_args=cdhit_options,
    fastasort_path=None,
    save_csv=False,
)

In [None]:
# Column-wise totals of the gene table (presumably one column per genome, so
# this would be the gene count per genome -- confirm the orientation).
df_genes.sum(axis=0)

In [None]:
# Hierarchically clustered heatmap of the transposed gene table; missing
# entries are filled with 0 before plotting.
sns.clustermap(data=df_genes.fillna(value=0).T)