In [1]:
import pandas as pd
import numpy as np
import os
import json
import copy
from matplotlib import pyplot as plt
from matplotlib_venn import venn2
import Dokdonia_code as Dc
# plt.style.use('plt_styles.mplstyle')
%matplotlib inline

In [2]:
# Prepare plot data: for each data type (TC and TPM) collect the
# permutation-analysis results, the processed clustering input table and the
# cluster assignments, keyed by data type.
data_types = ['TC', 'TPM']
res_ids = ['CLUSTER_ALL_GENES_TRANSCRIPT_CELL', 'CLUSTER_ALL_GENES_TPM']
p_Data_paths, cluster_data, clusters = {}, {}, {}

for data_type, res_id in zip(data_types, res_ids):

    # Permutation-analysis pickles, one per annotation database.
    kegg_pickle = f'Results/Permutation_analysis/p_KEGG_paths_500000_{res_id}.pkl'
    patric_pickle = f'Results/Permutation_analysis/p_PATRIC_paths_500000_{res_id}.pkl'
    p_KEGG_paths = Dc.readFromPickleFile(path_to_file=kegg_pickle)
    p_PATRIC_paths = Dc.readFromPickleFile(path_to_file=patric_pickle)

    # Processed clustering input table, indexed by the 'Genes' column.
    clust_input_path = os.path.join(
        os.getcwd(),
        f'Results/{res_id}/Processed_Data/clust_input.tsv_processed.tsv')
    c_data = pd.read_csv(clust_input_path, sep='\t', index_col='Genes')
    # Unnamed all-NaN column inserted at position 4 to separate datasets.
    c_data.insert(4, '', np.full(c_data.shape[0], np.nan))

    # Cluster assignments, without the 'No_cluster_assigned' entry.
    clusts = Dc.readFromPickleFile(path_to_file=f'Results/Clusters_{res_id}.pkl')
    clusts = {
        cluster_id: members
        for cluster_id, members in clusts.items()
        if cluster_id != 'No_cluster_assigned'
    }

    p_Data_paths[data_type] = {'KEGG': p_KEGG_paths, 'PATRIC': p_PATRIC_paths}
    cluster_data[data_type] = c_data
    clusters[data_type] = clusts

In [3]:
# Plot both annotation databases at the same time, from the per-data-type
# clusters, cluster data and permutation results.
web_page_kwargs = {
    'plot_first_N': 10,
    'color': '#dd912d',
    'img_folder_name': 'iplots_imgs',
}
Dc.plotSystemsAndSubsystemsWebPage(clusters, cluster_data, p_Data_paths,
                                   **web_page_kwargs)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


# Analyzing genes in pathways

In [6]:
# KEGG pathway hierarchy: only the top-level 'children' entry of the JSON is kept.
kegg_json_path = 'Data/Function_Annotations/KEGG/kegg_pathways.json'
with open(kegg_json_path) as json_file:
    kegg_json = json.load(json_file)
kegg_pathways = kegg_json['children']

# Genome object built from the Dokdonia MED134 .gbk file.
gbk = Dc.GenomeGBK('Data/DokdoniaMED134.gbk')

kegg_dict = Dc.assignSystemsToEnzymes(kegg_pathways)

# eggNOG-mapper results; header=2 takes the column labels from the third row
# of the sheet (0-indexed row 2).
eggnog_xlsx_path = 'Data/Function_Annotations/KEGG/result_eggNOGMapper.xlsx'
eggNOG = pd.read_excel(eggnog_xlsx_path, header=2)

# Lookup tables used to find the genes belonging to a KEGG pathway.
ko_pathway_dict = Dc.getKEGGPathwayDict(kegg_pathways)
gene_ko_dict = Dc.getGeneKOs(eggNOG)

The KEGG pathway "Peroxisome" doesn't make much sense in Dokdonia: peroxisomes are eukaryotic organelles, and Dokdonia is a bacterium. Let's look at the genes that are assigned to this pathway.

In [9]:
# Genes assigned to the KEGG subsystem 'Peroxisome'.
peroxigenes = Dc.getGenesInKEGGsystem(
    ko_pathway_dict,
    gene_ko_dict,
    'Peroxisome',
    system_type='subsystem',
)

# Pair each gene id with the first 'product' entry returned by gbk.getGeneInfo.
[(locus_tag, gbk.getGeneInfo(locus_tag)['product'][0]) for locus_tag in peroxigenes]

[('MED134_06289', 'hydroxymethylglutaryl-CoA lyase'),
 ('MED134_05414', 'long-chain-fatty-acid-CoA ligase'),
 ('MED134_04964', 'superoxide dismutase [Cu-Zn]'),
 ('MED134_01785', 'aminotransferase class-V'),
 ('MED134_09101', 'superoxide dismutase'),
 ('MED134_10845', 'mevalonate kinase'),
 ('MED134_14141', 'isocitrate dehydrogenase')]

In [4]:
# Correspondence between clusters of the two data types, as
# (TPM cluster id, TC cluster id) pairs. Matched clusters are drawn with the
# same colour, so this table is the single source of truth for the TPM palette.
cluster_match_TPM_TC = [
    ('C0', 'C4'), ('C1', 'C5'),
    ('C2', 'C6'), ('C3', 'C1'),
    ('C4', 'C0'), ('C5', 'C2')
]

# Colour assigned to each TC cluster; 'No cluster' is drawn in white.
TC_cluster_colors = {
    'C0': '#3d03fc', 'C1': '#036bfc', 'C2': '#03adfc',
    'C3': '#fc4e03', 'C4': '#fc0303', 'C5': '#fc7703',
    'C6': '#03fc7b', 'No cluster': 'white'
}

# Each TPM cluster inherits the colour of its matched TC cluster. Deriving the
# palette from cluster_match_TPM_TC (instead of repeating the pairs by hand)
# keeps the two from drifting apart; a pair that names an unknown TC cluster
# raises KeyError here rather than silently producing a wrong colour.
TPM_cluster_colors = {
    tpm_id: TC_cluster_colors[tc_id] for tpm_id, tc_id in cluster_match_TPM_TC
}
TPM_cluster_colors['No cluster'] = 'white'

cluster_colors = {
    'TC': TC_cluster_colors,
    'TPM': TPM_cluster_colors,
}

Dc.plotSystemsAndSubsystemsStacked(p_Data_paths, cluster_colors, 'iplots_Stacked')

INFO:numexpr.utils:NumExpr defaulting to 8 threads.
