In [1]:
import pickle
import os

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
# Notebook-wide plotting defaults; these mutate global matplotlib/seaborn state,
# so every figure created after this cell picks them up.
plt.rcParams["figure.dpi"] = 200  # figure resolution in dots per inch
sns.set_palette("deep")  # seaborn colour palette named "deep"
sns.set_context("paper")  # seaborn plotting context named "paper"
sns.set_style("whitegrid")  # seaborn axes style named "whitegrid"
from pyphylon.util import load_config

import gzip

In [2]:
# Load the run configuration once and pull out the two settings used below:
# the working directory and the pangenome (species) name.
CONFIG = load_config("config.yml")
WORKDIR, SPECIES = CONFIG["WORKDIR"], CONFIG["PG_NAME"]

In [3]:
# Metadata table read from the interim directory; every column is kept as
# object dtype and the first CSV column becomes the index.
mash_scrubbed_metadata = pd.read_csv(
    os.path.join(WORKDIR, 'interim/mash_scrubbed_species_metadata_2b.csv'),
    dtype='object',
    index_col=0,
)

# Show the table dimensions followed by a preview of the first rows.
display(mash_scrubbed_metadata.shape, mash_scrubbed_metadata.head())

(3, 66)

Unnamed: 0,genome_id,genome_name,organism_name,taxon_id,genome_status,strain,serovar,biovar,pathovar,mlst,...,motility,sporulation,temperature_range,optimal_temperature,salinity,oxygen_requirement,habitat,disease,comments,additional_metadata
0,1314.4067,Streptococcus pyogenes 1042,,1314,Complete,1042,,,,MLST.Streptococcus_pyogenes.530,...,,,,,,,,,,sample_type:Pure culture
1,1314.4068,Streptococcus pyogenes 1039,,1314,Complete,1039,,,,,...,,,,,,,,,,sample_type:Pure culture
2,1314.407,Streptococcus pyogenes 1004,,1314,Complete,1004,,,,MLST.Streptococcus_pyogenes.530,...,,,,,,,,,,sample_type:Pure culture


# Run Panaroo
Run the annotation and Panaroo pipeline provided in the workflows before continuing. This notebook then processes the resulting Panaroo output files and reformats them for the rest of the pipeline.

# Process Panaroo Results

In [4]:
import networkx as nx
from pyphylon.pangenome import *

In [61]:
def update_pangenome_faa_file(filepath, output_filepath, panaroo_to_pangneome_map, centroids):
    """Rewrite a FASTA file with renamed headers and extract representative records.

    Two files are written:

    * ``output_filepath`` -- every record of the input, with each header
      replaced by ``panaroo_to_pangneome_map[<original header>]``.
    * ``output_filepath + '_representative_sequences'`` -- only the records
      whose original header is a key of ``centroids``; their header is
      replaced by the corresponding ``centroids`` value.

    Parameters
    ----------
    filepath : str
        Path of the input FASTA file.
    output_filepath : str
        Path of the renamed FASTA file; also the prefix of the
        representative-sequence file.
    panaroo_to_pangneome_map : dict
        Maps each original header (text after ``>``, stripped) to its new name.
    centroids : dict
        Maps the original header of each representative record to the name
        written for it in the representative file.

    Raises
    ------
    KeyError
        If a header of the input file is missing from ``panaroo_to_pangneome_map``.
    """
    representative_path = output_filepath + '_representative_sequences'

    with open(filepath, 'r') as fasta_in, \
            open(output_filepath, 'w') as fasta_out, \
            open(representative_path, 'w') as representative_out:
        # Whether the record currently being read belongs in the representative file.
        keep_for_representative = False

        for record_line in fasta_in:
            if not record_line.startswith('>'):
                # Sequence line: always copied, and mirrored for representatives.
                fasta_out.write(record_line)
                if keep_for_representative:
                    representative_out.write(record_line)
                continue

            # Header line: rename it and decide where the record's sequence goes.
            original_id = record_line[1:].strip()
            fasta_out.write('>' + panaroo_to_pangneome_map[original_id] + '\n')
            keep_for_representative = original_id in centroids
            if keep_for_representative:
                representative_out.write('>' + centroids[original_id] + '\n')

def update_gff(filepath, output_filepath, panaroo_to_pangneome_map):
    """Copy a GFF file, tagging each Panaroo-annotated feature with pangenome names.

    Lines starting with ``#`` are copied unchanged. Every other line that
    contains a ``panaroo_ID=`` attribute is written with two extra attributes
    appended: ``pangenome_allele`` (looked up from the Panaroo ID) and
    ``pangenome_cluster`` (the allele name up to its last ``A``). Non-comment
    lines without a ``panaroo_ID=`` attribute are not written to the output.

    Parameters
    ----------
    filepath : str
        Path of the input GFF file.
    output_filepath : str
        Path of the GFF file to write.
    panaroo_to_pangneome_map : dict
        Maps a Panaroo gene ID to its pangenome allele name.

    Raises
    ------
    KeyError
        If a ``panaroo_ID`` value is missing from ``panaroo_to_pangneome_map``.
    """
    with open(filepath, 'r') as input_gff, open(output_filepath, 'w') as output_gff:
        for line in input_gff:
            if line[0] == '#':
                output_gff.write(line)
                continue
            if 'panaroo_ID=' not in line:
                continue

            line = line.strip()

            # Read the ID straight from the attribute text. Splitting the line
            # on whitespace first breaks as soon as any attribute value
            # (e.g. a product description) contains a space.
            _, _, after_key = line.partition('panaroo_ID=')
            panaroo_ID = after_key.split(';', 1)[0].strip()

            pangenome_allele = panaroo_to_pangneome_map[panaroo_ID]
            # The cluster name is everything before the final 'A'; if there is
            # no 'A' at all, keep the whole name rather than dropping a character.
            pangenome_cluster, separator, _ = pangenome_allele.rpartition('A')
            if not separator:
                pangenome_cluster = pangenome_allele

            line = line + ';' + 'pangenome_allele=' + pangenome_allele + ';' + 'pangenome_cluster=' + pangenome_cluster

            output_gff.write(line + '\n')

def process_panaroo_results(SPECIES, panaroo_results_path):
    """Convert a Panaroo results directory into renamed pangenome matrices and files.

    Reads ``gene_presence_absence.csv``, ``gene_data.csv``, ``final_graph.gml``,
    ``combined_protein_CDS.fasta`` and ``postpanaroo_gffs/<genome>_panaroo.gff``
    from ``panaroo_results_path``. Each graph node becomes a gene cluster named
    ``<SPECIES>_<n>``; each distinct protein sequence within a cluster becomes
    an allele named ``<SPECIES>_<n>A<m>``.

    Written into ``panaroo_results_path``:

    * ``<SPECIES>`` and ``<SPECIES>_representative_sequences`` -- renamed FASTA files
    * ``gffs/<genome>.gff`` -- GFF files tagged with allele/cluster names
    * ``<SPECIES>_strain_by_gene.pickle.gz`` -- sparse cluster x genome matrix
    * ``<SPECIES>_strain_by_allele.pickle.gz`` -- sparse allele x genome matrix
    * ``<SPECIES>_panaroo_names_to_pangenome.csv`` -- node name -> cluster name
    * ``<SPECIES>_pangenome_alleles_to_tags.csv`` -- allele name -> ';'-joined gene IDs
    * ``<SPECIES>_graph.gml`` -- the graph with the added node attributes

    Parameters
    ----------
    SPECIES : str
        Prefix used for cluster, allele and output file names.
    panaroo_results_path : str
        Directory holding the Panaroo output files listed above.

    Returns
    -------
    tuple
        ``(P, P_allele, graph)``: the cluster x genome DataFrame, the
        allele x genome DataFrame and the annotated networkx graph.

    Notes
    -----
    Assumes the integers in each node's ``genomeIDs`` index the genome columns
    of ``gene_presence_absence.csv`` in file order, and that the ``gff_file``
    values of ``gene_data.csv`` equal those column names -- TODO confirm
    against the Panaroo version in use.
    """
    print('Loading Panaroo data ...')
    panaroo_df_genes = pd.read_csv(
        os.path.join(panaroo_results_path, 'gene_presence_absence.csv'),
        index_col='Gene', low_memory=False, dtype=object,
    ).drop(['Non-unique Gene name', 'Annotation'], axis=1)
    gene_information = pd.read_csv(os.path.join(panaroo_results_path, 'gene_data.csv'), dtype=object)
    genomes = list(panaroo_df_genes.columns)
    graph = nx.read_gml(os.path.join(panaroo_results_path, 'final_graph.gml'))

    print("Generating P matrix ...")
    ## generate P matrix from graph ##
    panaroo_to_clusters = {} # map of names of genes in panaroo output to new names for pangenome
    panaroo_to_ids = {} # map of panaroo gene names to ids
    cluster_count = 0

    P = np.zeros((len(graph.nodes), len(genomes)))

    # Nodes are visited in reverse graph order; the visit order fixes the cluster numbering.
    for node in tqdm(list(graph.nodes)[::-1]):
        node = graph.nodes[node]
        panaroo_to_clusters[node['name']] = SPECIES + '_' + str(cluster_count)
        panaroo_to_ids[node['name']] = node['geneIDs'].split(';')

        # mark every genome carrying this cluster
        genome_indices = [int(genome_id) for genome_id in node['genomeIDs'].split(';')]
        P[cluster_count, genome_indices] = 1

        cluster_count += 1 # increment the clusters for naming

    P = pd.DataFrame(P, index = panaroo_to_clusters.values(), columns = genomes, dtype=int)

    print("Retrieving allele information ...")
    ## Generate P_allele matrix from graph ##
    p_allele_dict = {}   # cluster name -> {allele name -> list of gff_file values}
    alleles_to_tags = {} # allele name -> ';'-joined clustering ids
    for group, clustering_ids in tqdm(panaroo_to_ids.items()):
        pangenome_id = panaroo_to_clusters[group]
        p_allele_dict[pangenome_id] = {}
        gene_info_temp = gene_information[gene_information.clustering_id.isin(clustering_ids)]

        # one allele per distinct protein sequence, numbered in groupby order
        allele_count = 0
        for _, allele_group in gene_info_temp.groupby('prot_sequence'):
            allele_name = pangenome_id + 'A' + str(allele_count)
            p_allele_dict[pangenome_id][allele_name] = list(allele_group.gff_file.values)
            alleles_to_tags[allele_name] = (';').join(list(allele_group.clustering_id.values))
            allele_count += 1

    # Row order of P_allele: clusters in insertion order, alleles within each cluster.
    allele_list = [allele for alleles in p_allele_dict.values() for allele in alleles]

    print("Generating P_allele matrix ...")
    P_allele = np.zeros((len(allele_list), len(genomes)))
    # Column lookup built once, instead of scanning every genome for every allele.
    genome_to_index = {genome: index for index, genome in enumerate(genomes)}
    i = 0
    for alleles in tqdm(p_allele_dict.values()):
        for allele_genomes in alleles.values():
            indices = [genome_to_index[genome] for genome in set(allele_genomes) if genome in genome_to_index]
            P_allele[i, indices] = 1
            i+=1

    P_allele = pd.DataFrame(P_allele, index = allele_list, columns = genomes, dtype = int)

    ##  Update panaroo pangenome faa files with the updated pangenome names
    print('Updating sequence names in pangenome .faa files ...')
    tags_to_pangenome_alleles = {x:a for a, tags in alleles_to_tags.items() for x in tags.split(';')}
    centroids = {}
    for node in graph.nodes:
        node = graph.nodes[node]
        # 'centroid' may hold several ';'-separated IDs. Key on the first one
        # (as the graph update below does) so the lookup against single-ID
        # FASTA headers can match and the cluster gets its representative.
        centroids[node['centroid'].split(';')[0]] = panaroo_to_clusters[node['name']]
    update_pangenome_faa_file(os.path.join(panaroo_results_path, 'combined_protein_CDS.fasta'),
                              os.path.join(panaroo_results_path, SPECIES), tags_to_pangenome_alleles, centroids)

    ## Update the gff files for panaroo
    print('Updating panaroo gff files with updated sequence names ...')
    os.makedirs(os.path.join(panaroo_results_path, 'gffs'), exist_ok=True)

    for genome in P.columns:
        update_gff(os.path.join(panaroo_results_path, 'postpanaroo_gffs', genome + '_panaroo.gff'),
                  os.path.join(panaroo_results_path, 'gffs', genome + '.gff'),
                  tags_to_pangenome_alleles)

    ## Update pangenome graph
    # add new pangenome cluster names to graph
    # TODO consider adding list of allele names as well (updated with pangenome allele structure)
    print('Updating Graph ...')
    for node in graph.nodes:
        node = graph.nodes[node]
        node['pangenome_name'] = panaroo_to_clusters[node['name']]
        node['genomes'] = ';'.join([genomes[int(i)] for i  in node['genomeIDs'].split(';')])
        node['alleles'] = ';'.join([tags_to_pangenome_alleles[x] for x in node['geneIDs'].split(';')])
        node['centroid_allele'] = tags_to_pangenome_alleles[node['centroid'].split(';')[0]]

    ## output the relevant matrices and tables
    print('Saving results to:', panaroo_results_path)
    # P matrix of genes by strains, sparsely
    P.astype(pd.SparseDtype(int, fill_value=0)).to_pickle(os.path.join(panaroo_results_path, SPECIES + '_strain_by_gene.pickle.gz'))
    # P Allele matrix of alleles by strains, sparsely
    P_allele.astype(pd.SparseDtype(int, fill_value=0)).to_pickle(os.path.join(panaroo_results_path, SPECIES + '_strain_by_allele.pickle.gz'))
    # mapping of panaroo group names to pangenome names for gene clusters
    pd.Series(panaroo_to_clusters).to_csv(os.path.join(panaroo_results_path, SPECIES + '_panaroo_names_to_pangenome.csv'), header=None)
    # mapping of pangenome allele names to panaroo ids for genes
    pd.Series(alleles_to_tags).to_csv(os.path.join(panaroo_results_path, SPECIES + '_pangenome_alleles_to_tags.csv'), header=None)
    # output updated graph
    nx.write_gml(graph, os.path.join(panaroo_results_path, SPECIES + '_graph.gml'))

    return P, P_allele, graph

In [62]:
# Process the Panaroo output for the configured species; the results
# directory is <WORKDIR>/processed/panaroo_results/<SPECIES>.
df_genes, df_alleles, pangenome_graph = process_panaroo_results(
    SPECIES,
    os.path.join(WORKDIR, 'processed', 'panaroo_results', SPECIES),
)

Loading Panaroo data ...
Generating P matrix ...


  0%|          | 0/1815 [00:00<?, ?it/s]

Retrieving allele information ...


  0%|          | 0/1815 [00:00<?, ?it/s]

Generating P_allele matrix ...


  0%|          | 0/1815 [00:00<?, ?it/s]

Updating sequence names in pangenome .faa files ...
Updating panaroo gff files with updated sequence names ...
Updating Graph ...
Saving results to: data/processed/panaroo_results/SPyogenes
