# Setup

In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import scipy

import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px

from tqdm.notebook import tqdm, trange


from pyphylon.pangenome import estimate_pan_core_size, fit_heaps_by_iteration
from pyphylon.util import load_config

In [None]:
# Read the notebook settings from config.yml and pull out the two entries the
# later cells use: WORKDIR (base directory for the data files) and SPECIES
# (the "PG_NAME" entry, which is embedded in the gene-matrix file name).
CONFIG = load_config("config.yml")
WORKDIR, SPECIES = CONFIG["WORKDIR"], CONFIG["PG_NAME"]

In [None]:
import random

SEED = 42  # single fixed seed shared by every random number generator below

# Python's `random` module and NumPy's legacy global RNG keep independent
# state, so seeding only one of them leaves the other non-deterministic.
# NOTE(review): estimate_pan_core_size presumably shuffles genome order through
# NumPy's global RNG -- confirm against the pyphylon source.
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Global figure defaults for the whole notebook.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,   # embed fonts as Type 42 (TrueType) in PDF output
    "figure.dpi": 300,    # figure resolution
})

# Seaborn theme pieces, applied in the same order as before.
sns.set_palette("deep")
sns.set_context("paper")
sns.set_style("whitegrid")

In [None]:
# Load the pickled gene presence matrix, replace missing entries with 0 and
# convert it from sparse storage to a dense int8 DataFrame.
# NOTE(review): read_pickle can execute arbitrary code while unpickling --
# only point this at files produced by this pipeline.
strain_by_gene_path = os.path.join(
    WORKDIR, f'processed/cd-hit-results/{SPECIES}_strain_by_gene.pickle.gz'
)
df_genes = (
    pd.read_pickle(strain_by_gene_path)
    .fillna(0)
    .sparse.to_dense()
    .astype('int8')
)

# Show the matrix dimensions followed by its first rows.
display(df_genes.shape, df_genes.head())

In [None]:
# Load the genome metadata table. The first CSV column becomes the index and
# every column is read with dtype 'object' (no numeric type inference).
metadata_path = os.path.join(WORKDIR, 'interim/mash_scrubbed_species_metadata_2b.csv')
metadata = pd.read_csv(metadata_path, index_col=0, dtype='object')

# Show the table dimensions followed by its first rows.
display(metadata.shape, metadata.head())

In [None]:
# Keep only the metadata rows whose genome_status is exactly 'Complete'.
metadata_complete = metadata.loc[metadata['genome_status'] == 'Complete']

# Select the gene-matrix columns named by those genome ids; copy so the
# subset is independent of df_genes.
df_genes_complete = df_genes.loc[:, metadata_complete['genome_id']].copy()

# Row mask: True for genes present in at least one complete genome.
inCompleteseqs = df_genes_complete.sum(axis=1).gt(0)
df_genes_complete = df_genes_complete.loc[inCompleteseqs]

df_genes_complete.shape

# Heaps' Law Plot for CAR genomes

## Total (Complete + WGS)

In [None]:
# Re-encode the dense matrix as sparse int8 columns with fill value 0.
# The original note says estimate_pan_core_size needs sparse input.
sparse_int8 = pd.SparseDtype("int8", 0)
df_genes_sparse = df_genes.astype(sparse_int8)

# Generate the pan/core size curves (20 iterations, log_batch of 1).
df_pan_core = estimate_pan_core_size(
    df_genes_sparse,
    num_iter=20,
    log_batch=1,
)

In [None]:
# Inspect the table returned by estimate_pan_core_size.
df_pan_core

In [None]:
# Run fit_heaps_by_iteration once per section, in the order pan, acc, rare,
# and bind each result to its own name.
# NOTE(review): 'acc' presumably means the accessory genome -- confirm in pyphylon.
output_pan, output_acc, output_rare = (
    fit_heaps_by_iteration(df_pan_core, section=section_name)
    for section_name in ('pan', 'acc', 'rare')
)

In [None]:
# Same fit as for the other sections, here for the 'core' section.
output_core = fit_heaps_by_iteration(
    df_pan_core,
    section='core',
)


In [None]:
# Mean of the fitted lambda_ values for the 'pan' section. Per the original
# note, this Heaps' Law coefficient indicates how open or closed the
# pangenome is.
mean_pan_lambda = output_pan.lambda_.mean()
mean_pan_lambda

In [None]:
def _mean_heaps_curve(fit, genome_counts):
    """Evaluate kappa * N ** lambda_ using the mean of each fitted parameter."""
    return fit.kappa.mean() * genome_counts ** fit.lambda_.mean()


# 1, 2, ..., number of columns in df_genes.
x = list(range(1, df_genes.shape[1] + 1))
genome_counts = np.array(x)

# One fitted curve per section, evaluated over the same x values.
y_core = _mean_heaps_curve(output_core, genome_counts)
y_acc = _mean_heaps_curve(output_acc, genome_counts)
y_rare = _mean_heaps_curve(output_rare, genome_counts)

In [None]:
# Log-linear plot: the three fitted curves stacked on top of each other
# (core at the bottom, then acc, then rare) with a logarithmic y axis.
fig, ax = plt.subplots()

# Label each stacked series so the legend can identify it.
# NOTE(review): "Accessory" assumes section 'acc' is the accessory genome -- confirm.
ax.stackplot(x, y_core, y_acc, y_rare, labels=["Core", "Accessory", "Rare"])
ax.set_yscale('log')

# Axis labels, title and legend so the figure can be read on its own.
# NOTE(review): x runs over the columns of df_genes, assumed to be one genome each -- confirm.
ax.set_xlabel("Number of genomes")
ax.set_ylabel("Number of genes (fitted, stacked)")
ax.set_title("Heaps' Law fits: Total (Complete + WGS)")
ax.legend(loc="lower right")

ax.grid(False)
plt.show()