In [1]:
import pandas as pd
import numpy as np
import os
import json
import copy
from matplotlib import pyplot as plt
from matplotlib_venn import venn2
import Dokdonia_code as Dc
# plt.style.use('plt_styles.mplstyle')
%matplotlib inline

# Analysis of differential expression

In [19]:
# Load the raw count table, drop every column whose name contains a 'T',
# and keep only the genes whose count exceeds min_count in all remaining samples.
min_count = 10

counts = pd.read_csv('Data/DokdoniaCounts.csv', index_col=0).filter(regex='^[^T]+$')
# Column names keep only the part before '.sam'
conditions = [name.split('.sam')[0] for name in counts.columns]
counts.columns = conditions
# Row filter and index reset are chained so the cell is idempotent on re-run
counts = counts.loc[counts.gt(min_count).all(axis=1)].reset_index(level=0)
counts.head()

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Unnamed: 0,index,D_10_R1,D_10_R2,D_10_R3,D_18_R1,D_18_R2,D_18_R3,D_18_R4,D_25_R1,D_25_R2,...,L_18_R2,L_18_R3,L_18_R4,L_25_R1,L_25_R2,L_25_R3,L_34_R1,L_34_R2,L_34_R3,L_34_R4
0,MED134_07389,12973,11426,13251,30624,30618,41014,22449,53840,32369,...,16255,23945,20102,39444,41721,37020,39662,30983,27280,57834
1,MED134_07384,1557,1555,1552,4160,4385,5588,2940,7550,4509,...,2564,3003,2894,5419,5536,5318,5396,4751,4285,7818
2,MED134_07379,3400,3319,3273,7218,7903,9539,5040,13268,7572,...,4575,5361,4923,9089,9016,8529,10179,7617,7207,13827
3,MED134_07374,1987,2124,1404,3509,3772,4337,2602,4607,3316,...,1798,2558,2637,3586,3594,3488,4158,3096,2623,5323
4,MED134_07369,2087,2010,2111,3774,3809,4486,2901,4668,3456,...,2545,3456,3008,3705,4158,3760,5518,3563,2713,5150


## Light vs dark conditions

## Across temperatures

In [3]:
# Load the pickled differential-expression results and show how many
# entries the 'all' result set holds.
DE_results = Dc.readFromPickleFile('Results/DE_result.pkl')
len(DE_results['all'])

1930

In [2]:
# Prepare plot data
p_Data_paths, cluster_data, clusters = {}, {}, {}
data_types = ['TC', 'TPM']
res_ids = ['CLUSTER_ALL_GENES_TRANSCRIPT_CELL', 'CLUSTER_ALL_GENES_TPM']

for data_type, res_id in zip(data_types, res_ids):

    # Permutation-analysis results, one pickle per pathway database
    pathway_results = {
        db: Dc.readFromPickleFile(
            path_to_file=f'Results/Permutation_analysis/p_{db}_paths_500000_{res_id}.pkl')
        for db in ('KEGG', 'PATRIC')
    }

    processed_path = os.path.join(
        os.getcwd(), f'Results/{res_id}/Processed_Data/clust_input.tsv_processed.tsv')
    c_data = pd.read_csv(processed_path, sep='\t', index_col='Genes')
    # Add fake (all-NaN, unnamed) column at position 4 to separate datasets
    c_data.insert(4, '', [np.nan] * c_data.shape[0])

    # Keep every cluster except the bucket of unassigned genes
    clusts = {
        cluster_id: genes
        for cluster_id, genes in Dc.readFromPickleFile(
            path_to_file=f'Results/Clusters_{res_id}.pkl').items()
        if cluster_id != 'No_cluster_assigned'
    }

    p_Data_paths[data_type] = pathway_results
    cluster_data[data_type] = c_data
    clusters[data_type] = clusts

## Using the silhouette to rank genes within cluster

We can calculate the silhouette of each gene in a cluster to rank them based on how well they fit in the cluster. We can also use the silhouette to assess how good the clusters are.

From Wikipedia:

"The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters."

The silhouette compares a gene's mean distance to the other members of its own cluster with its mean distance to the members of the nearest neighboring cluster (https://en.wikipedia.org/wiki/Silhouette_(clustering)). It was originally described in the paper: https://www.sciencedirect.com/science/article/pii/0377042787901257

In [3]:
# Rank cluster genes by silhouettes
ranked_clusters = {}
data_types = ['TC', 'TPM']
res_ids = ['CLUSTER_ALL_GENES_TRANSCRIPT_CELL', 'CLUSTER_ALL_GENES_TPM']

for data_type, res_id in zip(data_types, res_ids):

    # Read the processed expression table for this result set from disk
    processed_path = os.path.join(
        os.getcwd(), f'Results/{res_id}/Processed_Data/clust_input.tsv_processed.tsv')
    c_data = pd.read_csv(processed_path, sep='\t', index_col='Genes')

    ranked_clusters[data_type] = Dc.rankGenesWithinClusters(clusters[data_type], c_data)

# Making Dokdonia-specific KEGG JSON

In [57]:
# Prepare txt file to read in pandas
import re

def prepareKEGGtxtFile(path_to_file, output_path):
    """
    Prepare txt file containing strain-specific pathways
    to be read by pandas.

    Every input line that contains an identifier of the form 'dok'
    followed by five digits becomes one output row: the identifier with
    'dok' replaced by 'ko', a tab, and the text that follows the
    identifier (surrounding whitespace stripped). The output starts
    with the header row 'KEGG ID<TAB>Pathway'.

    Arguments:
    path_to_file: path to the raw text file to convert
    output_path: path of the tab-separated file to write

    Lines without a 'dokNNNNN' identifier (e.g. blank lines) are skipped
    instead of raising AttributeError.
    """
    # Raw string: '\d' inside a normal string literal is an invalid escape
    dok_pattern = re.compile(r'dok\d{5}')
    new_lines = ['KEGG ID\tPathway\n']

    # Context managers guarantee both files are closed, even on error
    with open(path_to_file, 'r') as file:
        for line in file:
            match = dok_pattern.search(line)
            if match is None:
                continue
            ko = match.group(0).replace('dok', 'ko')
            # Everything after the identifier is the pathway name
            pathway = line[match.end():].strip()
            new_lines.append(f'{ko}\t{pathway}\n')

    with open(output_path, 'w') as new_file:
        new_file.writelines(new_lines)
    
# Convert the raw Dokdonia pathway listing into a tab-separated file
# ('KEGG ID', 'Pathway') that pandas can read.
prepareKEGGtxtFile('Data/Function_Annotations/KEGG/Dokdonia_KEGG_pathways.txt',
                   'Data/Function_Annotations/KEGG/new_Dokdonia_KEGG_pathways.txt')

In [64]:
# Deleting entries from a list while iterating over it shifts the indices, so empty systems are removed by rebuilding each 'children' list with only the entries to keep.
def removeEmptySystems(kegg_pathways):
    """
    Remove empty systems from kegg pathways.

    For every supersystem in the list, its 'children' entry is replaced
    by a new list holding only the systems whose own 'children' is
    non-empty. The supersystem dicts are modified in place and the same
    list object is returned.
    """
    for supersystem in kegg_pathways:
        supersystem['children'] = [
            system for system in supersystem['children']
            if len(system['children']) > 0
        ]
    return kegg_pathways

Pathway: "09113 Global maps only" without ko number


In [None]:
# Obtain KEGG pathways only for Dokdonia sp. MED134
def pruneKEGGpathwaysForStrain(path_to_ALL_KEGG, path_to_Strain_pathways, output_path):
    """
    Remove non Strain-specific KEGG entries from the KEGG pathway database.
    Data from KEGG entry: https://www.genome.jp/dbget-bin/www_bget?gn:T03275

    Arguments:
    path_to_ALL_KEGG: JSON file with the complete KEGG pathway hierarchy;
        the list under its top-level 'children' key is read
        (e.g. 'Data/Function_Annotations/KEGG/kegg_pathways.json')
    path_to_Strain_pathways: tab-separated file with a 'KEGG ID' column
        listing the ko identifiers present in the strain, as written by
        prepareKEGGtxtFile
        (e.g. 'Data/Function_Annotations/KEGG/new_Dokdonia_KEGG_pathways.txt')
    output_path: JSON file to write with the pruned hierarchy
        (e.g. 'Data/Function_Annotations/KEGG/Dokdonia_KEGG_pathways.json')

    The three paths were previously ignored in favour of the hard-coded
    example locations above; they are now honoured.
    """
    strain_table = pd.read_csv(path_to_Strain_pathways, delimiter='\t')
    strain_kos = set(strain_table['KEGG ID'].values)

    with open(path_to_ALL_KEGG) as json_file:
        kegg_pathways = json.load(json_file)['children']

    supersystems = ['09100 Metabolism', '09120 Genetic Information Processing',
                    '09130 Environmental Information Processing', '09140 Cellular Processes']

    strain_kegg_pathways = [supersystem for supersystem in kegg_pathways
                            if supersystem['name'] in supersystems]

    for supersystem in strain_kegg_pathways:
        for system in supersystem['children']:

            subsystems_to_keep = []
            for subsystem in system['children']:
                try:
                    ko_id = Dc.extractKoID(subsystem['name'])
                except Exception:
                    # Best effort: entries whose name yields no ko id are
                    # reported and dropped
                    print(f'Pathway: "{subsystem["name"]}" without ko number')
                    continue

                if ko_id in strain_kos:
                    subsystems_to_keep.append(subsystem)

            # Replace the subsystem list once per system, after scanning it
            system['children'] = subsystems_to_keep

    strain_kegg_pathways = removeEmptySystems(strain_kegg_pathways)

    with open(output_path, 'w') as outfile:
        json.dump({'name': 'ko00001', 'children': strain_kegg_pathways}, outfile)

In [4]:
# Write Excel with cluster results

kegg_dir = 'Data/Function_Annotations/KEGG'
patric_dir = 'Data/Function_Annotations/PATRIC'

with open(f'{kegg_dir}/Dokdonia_KEGG_pathways.json') as json_file:
    kegg_pathways = json.load(json_file)['children']

gbk = Dc.GenomeGBK('Data/DokdoniaMED134.gbk')

eggNOG = pd.read_excel(f'{kegg_dir}/result_eggNOGMapper.xlsx', header=2)
ko_pathway_dict = Dc.getKEGGPathwayDict(kegg_pathways)
gene_ko_dict = Dc.getGeneKOs(eggNOG)

patric_features = pd.read_csv(f'{patric_dir}/Dokdonia_MED134_Craig_PATRIC_genome_feature.csv')
patric_pathways = pd.read_csv(f'{patric_dir}/Dokdonia_MED134_PATRIC_pathways.csv')
patric_pathways_genes = pd.read_csv(f'{patric_dir}/Dokdonia_MED134_Craig_PATRIC_pathways_genes.csv')

# One workbook per data type: TPM first, then TC
for excel_data_type in ('TPM', 'TC'):
    Dc.writeExcelOfClusterGenes(
        ranked_clusters[excel_data_type],
        f'Results/{excel_data_type}_clusters_genes_pathways.xlsx',
        gbk, patric_features, patric_pathways_genes, patric_pathways,
        gene_ko_dict, ko_pathway_dict)

## Analyzing which genes within clusters are DE across temperatures (LRT test)

In [10]:
# Overall share of clustered genes that appear in the 'all' DE result set
cluster_stats = {}
for data_type in data_types:
    # Flatten the gene lists of every cluster into one list
    total_genes_in_clusters = []
    for cluster_genes in clusters[data_type].values():
        total_genes_in_clusters.extend(cluster_genes)

    n_total = len(total_genes_in_clusters)
    DE_genes_in_clusters = np.intersect1d(DE_results['all'].index, total_genes_in_clusters)
    cluster_stats[data_type] = {
        'total genes': n_total,
        'DE fraction': len(DE_genes_in_clusters) / n_total,
    }

pd.DataFrame(cluster_stats)

Unnamed: 0,TC,TPM
total genes,1052.0,948.0
DE fraction,0.908745,0.931435


In [11]:
# Same statistics as above, but broken down per cluster
cluster_stats = {}
for data_type in data_types:
    cluster_stats[data_type] = {
        cluster_id: {
            'total genes': len(cluster),
            'DE fraction': len(np.intersect1d(DE_results['all'].index, cluster)) / len(cluster),
        }
        for cluster_id, cluster in clusters[data_type].items()
    }

In [26]:
# Show the table returned by Dc.getMetaMatrix for the columns of counts_T.
# NOTE(review): counts_T is only assigned in a later cell (the Wald-test cell),
# so on a fresh kernel this cell raises NameError unless that cell runs first.
Dc.getMetaMatrix(counts_T)

Unnamed: 0,lighting,temperature,replicate
D_10_R1,D,10,R1
D_10_R2,D,10,R2
D_10_R3,D,10,R3
D_34_R1,D,34,R1
D_34_R2,D,34,R2
D_34_R3,D,34,R3
D_34_R4,D,34,R4


In [18]:
# Per-cluster statistics ('total genes', 'DE fraction') for cluster C6 of the TC dataset
cluster_stats['TC']['C6']

{'total genes': 75, 'DE fraction': 0.7733333333333333}

## Evaluating fold change in clusters $C_1$, $C_3$ and $C_2$, $C_5$.

Basically, these genes are not DE according to the Wald test in DESeq2. However, the data clearly show a pattern... Is DESeq2 being very conservative?

In [62]:
# Wald test between two temperatures, restricted to the genes of TC cluster C3
p_value_cutoff = 0.05
fold_cutoff = k = 0.1

T1, T2 = 34, 25
# Keep the gene-id column plus every sample column whose name matches T1 or T2
temperature_columns = counts.filter(regex=f'{T1}|{T2}|index').columns
counts_T = counts[temperature_columns]
in_cluster_C3 = counts_T['index'].isin(clusters['TC']['C3'])
counts_T_C3 = counts_T[in_cluster_C3]

T_res, T_stats = Dc.runDEtest(
    counts_T_C3,
    test='Wald',
    alpha=p_value_cutoff,
    formula='~ temperature',
    log2fold_cutoff=k,
)

In [63]:
# Show the first object returned by Dc.runDEtest for the cluster-C3 genes
T_res

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,index
MED134_04624,1239.139427,0.197042,0.015828,6.131097,8.727541e-10,1.274221e-08,MED134_04624
MED134_03499,2219.167085,0.185841,0.0134,6.406053,1.493354e-10,2.725371e-09,MED134_03499
MED134_07626,853.915948,0.4187,0.026521,12.017057,2.890864e-33,2.1103300000000003e-31,MED134_07626
MED134_10595,689.031967,0.206668,0.014264,7.478255,7.53159e-14,2.74903e-12,MED134_10595
MED134_11826,704.764486,0.132876,0.010406,3.159426,0.001580803,0.0192331,MED134_11826
MED134_11831,1004.845845,0.192995,0.013938,6.672189,2.520156e-11,6.132379e-10,MED134_11831


In [None]:
# Plotting both databases at the same time
Dc.plotSystemsAndSubsystemsWebPage(clusters, cluster_data, p_Data_paths,
                                   plot_first_N=10, color='#dd912d',
                                   img_folder_name='iplots_imgs')

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


# Analyzing genes in pathways

TODO: remove the KEGG pathways that don't make sense in a prokaryote, since most genes have redundant pathways assigned. Candidates are Peroxisome and perhaps others, such as some signaling pathways.

In [84]:
# Print the ids of the TC cluster C3 genes whose first 'note' entry
# mentions "heat shock" (case-insensitive)
gene_list = clusters['TC']['C3']

for gene_id in gene_list:
    g_info = gbk.getGeneInfo(gene_id)
    if 'note' not in g_info.keys():
        continue
    first_note = g_info['note'][0].lower()
    if 'heat shock' in first_note:
        print(gene_id)
        

In [3]:
# Load the complete KEGG hierarchy (kegg_pathways.json) and rebuild the
# lookup tables from it.
# NOTE(review): this rebinds kegg_pathways, gbk, eggNOG, ko_pathway_dict and
# gene_ko_dict, which the Excel-export cell also assigns (there from
# Dokdonia_KEGG_pathways.json); results downstream depend on which cell ran last.
with open('Data/Function_Annotations/KEGG/kegg_pathways.json') as json_file:
    kegg_pathways = json.load(json_file)['children']
gbk = Dc.GenomeGBK('Data/DokdoniaMED134.gbk')

kegg_dict = Dc.assignSystemsToEnzymes(kegg_pathways)
# header=2: the column names are on the third row of the sheet
eggNOG = pd.read_excel('Data/Function_Annotations/KEGG/result_eggNOGMapper.xlsx', header=2)
ko_pathway_dict = Dc.getKEGGPathwayDict(kegg_pathways)
gene_ko_dict= Dc.getGeneKOs(eggNOG)

The KEGG pathway Peroxisome doesn't make much sense in Dokdonia. Let's look at the genes that form this pathway.

In [4]:
# Genes assigned to the KEGG subsystem 'Peroxisome'
peroxigenes = Dc.getGenesInKEGGsystem(ko_pathway_dict, gene_ko_dict,
                                      'Peroxisome', system_type='subsystem')

# Pair each gene id with the first entry of its 'product' annotation
[(gene_id, gbk.getGeneInfo(gene_id)['product'][0]) for gene_id in peroxigenes]

[('MED134_06289', 'hydroxymethylglutaryl-CoA lyase'),
 ('MED134_05414', 'long-chain-fatty-acid-CoA ligase'),
 ('MED134_04964', 'superoxide dismutase [Cu-Zn]'),
 ('MED134_01785', 'aminotransferase class-V'),
 ('MED134_09101', 'superoxide dismutase'),
 ('MED134_10845', 'mevalonate kinase'),
 ('MED134_14141', 'isocitrate dehydrogenase')]

In [11]:
subsystem_name = 'Peroxisome'
subsystem_genes = Dc.getGenesInKEGGsystem(
    ko_pathway_dict, gene_ko_dict, subsystem_name, system_type='subsystem')

# (gene id, first product annotation) for every gene of the subsystem
total_genes = [(gene_id, gbk.getGeneInfo(gene_id)['product'][0])
               for gene_id in subsystem_genes]
# The same pairs, restricted to the genes that belong to TC cluster C0
genes_in_cluster = [pair for pair in total_genes
                    if pair[0] in clusters['TC']['C0']]

print(total_genes)
print(genes_in_cluster)

[('MED134_06289', 'hydroxymethylglutaryl-CoA lyase'), ('MED134_05414', 'long-chain-fatty-acid-CoA ligase'), ('MED134_04964', 'superoxide dismutase [Cu-Zn]'), ('MED134_01785', 'aminotransferase class-V'), ('MED134_09101', 'superoxide dismutase'), ('MED134_10845', 'mevalonate kinase'), ('MED134_14141', 'isocitrate dehydrogenase')]
[]


In [4]:
# (TPM cluster id, matching TC cluster id) pairs
cluster_match_TPM_TC = [
    ('C0', 'C4'), ('C1', 'C5'),
    ('C2', 'C6'), ('C3', 'C1'),
    ('C4', 'C0'), ('C5', 'C2')
]

TC_cluster_colors = {
    'C0': '#3d03fc', 'C1': '#036bfc', 'C2': '#03adfc',
    'C3': '#fc4e03', 'C4': '#fc0303', 'C5': '#fc7703',
    'C6': '#03fc7b', 'No cluster': 'white'
}

# Each TPM cluster takes the colour of its matching TC cluster
TPM_cluster_colors = {
    tpm_id: TC_cluster_colors[tc_id] for tpm_id, tc_id in cluster_match_TPM_TC
}
TPM_cluster_colors['No cluster'] = 'white'

cluster_colors = {'TC': TC_cluster_colors, 'TPM': TPM_cluster_colors}
    
# Stacked systems/subsystems plot using the per-data-type colour maps defined above.
# NOTE(review): 'iplots_Stacked' is presumably the output folder name - confirm in Dokdonia_code.
Dc.plotSystemsAndSubsystemsStacked(p_Data_paths, cluster_colors, 'iplots_Stacked')

INFO:numexpr.utils:NumExpr defaulting to 8 threads.
