In [None]:
# Base imports
import os
import pickle

# Compute imports
import numpy as np
import pandas as pd
import scipy
from tqdm.notebook import tqdm, trange

# Plotting imports
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
from matplotlib import pyplot as plt
import seaborn as sns
from plotly import express as px

import pandas as pd
# ML import
#from sklearn.decomposition import NMF
#from sklearn.metrics import mean_squared_error, median_absolute_error

from pyphylon.pangenome import find_pangenome_segments
from pyphylon.util import load_config

In [None]:
# Load the run configuration; "config.yml" is a relative path.
CONFIG = load_config("config.yml")
# Base directory that every input/output path in this notebook is joined onto.
WORKDIR = CONFIG["WORKDIR"]
# Value of the PG_NAME config key; used below as the prefix of the gene-matrix filename.
SPECIES = CONFIG["PG_NAME"]

In [None]:
# Location of the pickled gene matrix inside the cd-hit-results directory.
# NOTE(security): pd.read_pickle can execute arbitrary code while unpickling;
# only load files that were generated by this pipeline.
genes_pickle_path = os.path.join(
    WORKDIR, f'processed/cd-hit-results/{SPECIES}_strain_by_gene.pickle.gz'
)

# Build df_genes in one chained expression instead of mutating the loaded
# frame with inplace=True, so re-running the cell always starts from the file:
#   1. replace missing entries with 0
#   2. convert the sparse columns to dense ones
#   3. store the values as int8
df_genes = (
    pd.read_pickle(genes_pickle_path)
    .fillna(0)
    .sparse.to_dense()
    .astype('int8')
)

display(
    df_genes.shape,
    df_genes.head()
)

In [None]:
# Read the metadata table; the first CSV column becomes the index and every
# remaining column is kept as dtype 'object' (no numeric inference).
metadata_csv_path = os.path.join(WORKDIR, 'interim/mash_scrubbed_species_metadata_2b.csv')
metadata = pd.read_csv(metadata_csv_path, index_col=0, dtype='object')

display(metadata.shape, metadata.head())

In [None]:
# Keep only the metadata rows whose genome_status is exactly 'Complete'
is_complete = metadata['genome_status'] == 'Complete'
metadata_complete = metadata.loc[is_complete]

# Restrict the gene matrix to the columns named by those genome_ids
df_genes_complete = df_genes[metadata_complete['genome_id']]

# Drop rows whose sum over the retained columns is zero, i.e. genes that do
# not occur in any of the Complete genomes
inCompleteseqs = df_genes_complete.sum(axis=1).gt(0)
df_genes_complete = df_genes_complete.loc[inCompleteseqs]

# NOTE(review): the later cells visible in this notebook keep working on
# df_genes, not df_genes_complete — confirm that is intended.
df_genes_complete.shape

In [None]:
# Per-gene frequency: sum of each row of df_genes across all genome columns.
df_gene_freq = df_genes.sum(axis=1)

# Linear-scale overview. Drawn on an explicit axes: a bare `.hist()` call
# plots onto an implicit pyplot figure that is left orphaned (and unlabeled)
# as soon as the next `plt.subplots()` call creates a new one.
fig_linear, ax_linear = plt.subplots()
df_gene_freq.hist(ax=ax_linear)
ax_linear.set(
    title='Gene frequency distribution (linear scale)',
    xlabel='Gene frequency (row sum across genomes)',
    ylabel='Number of genes',
)

# Same distribution with fixed-width bins and a log-scaled count axis, so the
# sparsely populated frequency ranges remain visible.
fig, ax = plt.subplots()
sns.histplot(df_gene_freq, binwidth=50, ax=ax)
ax.set_yscale('log')
ax.set(
    title='Gene frequency distribution (log scale)',
    xlabel='Gene frequency (row sum across genomes)',
    ylabel='Number of genes',
)
plt.show()

In [None]:
# Fresh figure/axes handed to find_pangenome_segments via ax= (presumably the
# helper draws its fit onto it — TODO confirm against the pyphylon docs).
fig, ax = plt.subplots()

# segments[0] and segments[1] are consumed by the following cell as the core
# and rare frequency cutoffs. popt, r_squared and mae look like fit parameters
# and goodness-of-fit metrics — NOTE(review): confirm their exact meaning, and
# that of threshold=0.1, in pyphylon.pangenome.
segments, popt, r_squared, mae, ax = find_pangenome_segments(df_genes, threshold=0.1, ax=ax)

In [None]:
# Per-gene frequency: sum of each row of df_genes across all genome columns.
df_freq = df_genes.sum(axis=1)

# Frequency cutoffs taken from find_pangenome_segments: rows strictly above
# floor(segments[0]) are core, rows strictly below ceil(segments[1]) are rare.
is_core = df_freq > np.floor(segments[0])
is_rare = df_freq < np.ceil(segments[1])

df_core = df_genes[is_core]
df_rare = df_genes[is_rare]

# Accessory genes are everything that is neither core nor rare. Selecting them
# with a boolean mask keeps the row order of df_genes; the previous
# set-difference approach produced a hash-dependent order, so the rows of
# df_acc (and of the CSV written from it) could change between kernel runs.
df_acc = df_genes[~(is_core | is_rare)].copy()
acc_gene_list = df_acc.index.tolist()

display(
    df_core.shape,
    df_acc.shape,
    df_rare.shape
)

# Save Results


In [None]:
# Write the core / accessory / rare gene matrices as CSV files into
# <WORKDIR>/processed/CAR_genomes/.
newpath = os.path.join(WORKDIR, 'processed/CAR_genomes/')

# exist_ok=True creates the directory when missing and is a no-op otherwise,
# replacing the check-then-create pattern.
os.makedirs(newpath, exist_ok=True)

# Reuse newpath for every output so the directory is spelled out only once.
for csv_name, frame in (
    ('df_core.csv', df_core),
    ('df_acc.csv', df_acc),
    ('df_rare.csv', df_rare),
):
    frame.to_csv(os.path.join(newpath, csv_name))