In [83]:
import numpy as np
import os
import os.path
from functools import reduce

import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import shortest_path
from scipy.stats import pearsonr, spearmanr
from pandas import DataFrame, concat

from anytree import AnyNode
from anytree import RenderTree
from anytree import PostOrderIter
from anytree import PreOrderIter

from pprint import pprint

import gsd.gene_sets
import gsd.immune_cells
from gsd.distance import calc_pairwise_distances
import plotly.io as pio

# Switch plotly's offline module into notebook mode once, right after the imports.
init_notebook_mode(connected=True)

In [84]:
# Method families that were evaluated. 'subdir' is the directory name used for
# the family's result files; 'label' is its human-readable name.
categories = [
    {'subdir': subdir, 'label': label}
    for subdir, label in (
        ("benchmark", 'Benchmark methods'),
        ("general",   'Statistical methods'),
        ("go",        'Gene Ontology methods'),
        ("ppi",       'PPI methods'),
        ("nlp",       'NLP methods'),
        ("tree_path", 'Tree path lengths'),
    )
]


# Short LaTeX-style labels keyed by metric name. Every metric currently maps
# to the same placeholder label "$A$".
label_trans = dict.fromkeys(
    [
        'Cosine distance GO BP description W2V',
        'Cosine distance GO CC description W2V',
        'Cosine distance GO MF description W2V',
        'Cosine distance over gene symbols W2V',
        'Cosine distance over gene trait frequency',
        'Cosine distance over over NCBI summary W2V',
        'Cosine distance over over summary W2V',
        'GO-distance (go_type=BP, measure=Wang, combine=BMA)',
        'GO-distance (go_type=CC, measure=Wang, combine=BMA)',
        'GO-distance (go_type=MF, measure=Wang, combine=BMA)',
        'Jaccard distance over extended gene set',
        'Jaccard distance over gene traits',
        'Jaccard distance over genes',
        'Kappa distance over gene traits',
        'Kappa distance over genes',
        'Minkowski distance (p=1) over gene trait frequency',
        'Minkowski distance (p=1) over gene traits',
        'Minkowski distance (p=1) over genes',
        'Minkowski distance (p=2) over gene trait frequency',
        'Minkowski distance (p=2) over gene traits',
        'Minkowski distance (p=2) over genes',
        'Overlap distance over gene traits',
        'Overlap distance over genes',
        'Pairwise path length in reference tree',
        'Random (uniform, (0,1))',
        'WM distance over gene symbols W2V',
        'WM distance over summary W2V',
    ],
    "$A$",
)


def extract_scores(record):
    """Return the ``results`` attribute of ``record`` unchanged."""
    scores = record.results
    return scores

def extract_exec_time(record):
    """Return ``record.exec_time`` wrapped in a one-element list."""
    elapsed = record.exec_time
    return [elapsed]

def extract_category_df(evaluation_data_name, category, fun=extract_scores):
    """Gather one column per matching record found below a category directory.

    Walks ``experiment_data/<category['subdir']>`` recursively. Every file named
    ``<evaluation_data_name>.json`` is loaded with
    ``gsd.gene_sets.load_gene_sets`` and ``fun(record)`` is stored in a column
    named ``record.name``.

    Returns:
        A ``(category['subdir'], DataFrame)`` tuple; the frame's columns are
        in sorted order.
    """
    wanted_file = "%s.json" % evaluation_data_name
    category_root = os.path.join("experiment_data", category['subdir'])

    score_df = DataFrame()
    for directory, _subdirs, filenames in os.walk(category_root):
        for filename in sorted(filenames):
            if filename != wanted_file:
                continue
            record = gsd.gene_sets.load_gene_sets(os.path.join(directory, filename))
            score_df[record.name] = fun(record)

    ordered_columns = sorted(score_df.columns)
    return category['subdir'], score_df.reindex(ordered_columns, axis=1)

def extract_score_df(evaluation_data_name, fun=extract_scores):
    """Build one wide frame holding the per-category frames side by side.

    Calls ``extract_category_df`` for every entry of the module-level
    ``categories`` list and concatenates the results along the columns, giving
    a two-level column index named ``("Category", "Metric")``.
    """
    frames_by_category = dict(
        extract_category_df(evaluation_data_name, category, fun)
        for category in categories
    )
    return concat(frames_by_category, axis=1, names=["Category", "Metric"])

def create_heatmap(df: DataFrame, method='pearson', min_periods=1):
    """Build an annotated heatmap of the pairwise correlations between columns.

    Args:
        df: Frame whose columns are ``(category, metric)`` pairs; only the
            metric part is used for the axis labels.
        method: Correlation method forwarded to ``DataFrame.corr``.
        min_periods: Minimum number of observations forwarded to
            ``DataFrame.corr``.

    Returns:
        The plotly figure produced by ``ff.create_annotated_heatmap`` with
        enlarged left and top margins.
    """
    # Drop columns that are entirely NaN, then rows that still contain a NaN.
    cor = (df.corr(method=method, min_periods=min_periods)
             .dropna(axis=1, how='all')
             .dropna())

    # Rows and columns are filtered independently above, so the y labels must
    # come from the surviving rows (index), not from the columns.
    fig = ff.create_annotated_heatmap(cor.round(decimals=3).values,
                                      x=[metric for cat, metric in cor.columns],
                                      y=[metric for cat, metric in cor.index])
    fig.layout.margin.l = 380
    fig.layout.margin.t = 225
    return fig

def calculate_pvalues(df, cor_fun=pearsonr):
    """Compute the matrix of pairwise correlation p-values between columns.

    Columns that are entirely NaN are dropped, then rows containing any NaN,
    and only numeric (including boolean) columns are compared.

    Args:
        df: Input frame; its columns label both axes of the result.
        cor_fun: Callable ``cor_fun(x, y)`` whose result holds the p-value at
            position 1 (as ``pearsonr`` and ``spearmanr`` do).

    Returns:
        A float DataFrame indexed and labelled by ``df.columns``. Entries for
        columns that were filtered out are NaN. If fewer than two complete
        rows remain, every entry is NaN.
    """
    filtered_df = (df.dropna(axis=1, how='all')
                     .dropna()
                     .select_dtypes(include=['number', 'bool']))

    n_labels = len(df.columns)
    matrix = np.full((n_labels, n_labels), np.nan)

    # A correlation needs at least two observations; otherwise leave all NaN.
    if len(filtered_df) >= 2:
        # Write by position into a plain array: this avoids chained assignment
        # (``frame[r][c] = v``), which pandas may apply to a temporary copy.
        positions = df.columns.get_indexer(filtered_df.columns)
        series = [filtered_df.iloc[:, k] for k in range(filtered_df.shape[1])]
        for pos_r, values_r in zip(positions, series):
            for pos_c, values_c in zip(positions, series):
                # Column r, row c.
                matrix[pos_c, pos_r] = cor_fun(values_r, values_c)[1]

    return DataFrame(matrix, index=df.columns, columns=df.columns)

In [86]:
# Write the full metric-by-metric correlation heatmap for every evaluation
# target, one PDF per correlation method: plots/full/<target>/<method>.pdf
for target in ['R-HSA-8982491','R-HSA-1474290','R-HSA-373755','R-HSA-422475','immune_only']:
    # The scores do not depend on the correlation method, so load them once.
    target_scores = extract_score_df(target)

    target_out_dir = os.path.join('plots/full', target)
    os.makedirs(target_out_dir, exist_ok=True)

    for cor_method in ['pearson', 'spearman']:
        # Use the loop's method so that <method>.pdf really holds that correlation.
        fig = create_heatmap(target_scores, cor_method)

        target_out_file = os.path.join(target_out_dir, "%s.pdf" % cor_method)

        pio.write_image(fig, target_out_file, width=1550, height=700)

In [87]:
# Summary heatmap: correlation of every metric with the reference tree path
# length, one column per evaluation target, one PDF per correlation method.
reference_metric = ("tree_path", "Pairwise path length in reference tree")

# The scores do not depend on the correlation method, so load each target once.
target_scores = {target: extract_score_df(target)
                 for target in ['R-HSA-8982491', 'R-HSA-1474290', 'R-HSA-373755', 'R-HSA-422475', 'immune_only']}

for cor_method in ['pearson', 'spearman']:
    score_df = {target: scores.corr(method=cor_method, min_periods=1)
                for target, scores in target_scores.items()}
    common_index = score_df['R-HSA-8982491'][reference_metric].index

    # Passing Series (not plain lists) makes pandas align each target on the
    # metric labels of common_index; a metric a target lacks becomes NaN
    # instead of shifting the values below it.
    cor = DataFrame({target: elem[reference_metric] for target, elem in score_df.items()},
                    index=common_index)

    # (row, column) positions of missing correlations, in row-major order.
    na_idx = [(int(i), int(j)) for i, j in np.argwhere(cor.isna().values)]

    # The heatmap is drawn with 0 in place of NaN; the cell text is fixed below.
    cor = cor.fillna(0)

    fig = ff.create_annotated_heatmap(cor.round(decimals=3).values,
                                      x=[target for target in cor.columns],
                                      y=[metric for cat, metric in cor.index])
    fig.layout.margin.l = 380

    # Annotation index i * n_columns + j assumes row-by-row annotation order.
    for i,j in na_idx:
        fig.layout.annotations[i*cor.shape[1]+j]['text'] = "NaN"

    target_out_dir = 'plots/summary'
    os.makedirs(target_out_dir, exist_ok=True)

    target_out_file = os.path.join(target_out_dir, "%s.pdf" % cor_method)

    pio.write_image(fig, target_out_file, width=700, height=700)

In [88]:
# Summary heatmap of execution times: one row per metric, one column per
# evaluation target, written to plots/summary/times.pdf
times = {target: extract_score_df(target,extract_exec_time)
            for target in ['R-HSA-8982491','R-HSA-1474290','R-HSA-373755','R-HSA-422475','immune_only']}

# Row 0 of each frame holds the single exec_time value per metric. Passing the
# rows as Series aligns every target on the first target's (Category, Metric)
# labels; a metric missing for a target becomes NaN.
time_df = DataFrame({target: elem.iloc[0] for target, elem in times.items()},
                    index=times['R-HSA-8982491'].columns)

# Missing timings are located in this table itself, so the cell does not rely
# on variables left behind by other cells.
time_na_idx = [(int(i), int(j)) for i, j in np.argwhere(time_df.isna().values)]

# Draw missing cells as 0 and relabel them as "NaN" below.
fig = ff.create_annotated_heatmap(time_df.fillna(0).round(decimals=3).values,
                                  x=[target for target in time_df.columns],
                                  y=[metric for cat, metric in time_df.index])
fig.layout.margin.l = 380

# Annotation index i * n_columns + j assumes row-by-row annotation order.
for i,j in time_na_idx:
    fig.layout.annotations[i*time_df.shape[1]+j]['text'] = "NaN"

target_out_dir = 'plots/summary'
os.makedirs(target_out_dir, exist_ok=True)

target_out_file = os.path.join(target_out_dir, "times.pdf")

pio.write_image(fig, target_out_file, width=900, height=700)

In [89]:
# Summary heatmap of p-values (default correlation function of
# calculate_pvalues) between every metric and the reference tree path length,
# written to plots/summary/pearson_pval.pdf
reference_metric = ("tree_path", "Pairwise path length in reference tree")

score_df = {target: calculate_pvalues(extract_score_df(target))
            for target in ['R-HSA-8982491', 'R-HSA-1474290', 'R-HSA-373755', 'R-HSA-422475','immune_only']}
common_index = score_df['R-HSA-8982491'][reference_metric].index

# Passing Series aligns each target on the metric labels of common_index; a
# metric a target lacks becomes NaN. The float cast keeps the columns numeric
# so that fillna/round below act on them.
col = DataFrame({target: elem[reference_metric].astype(float) for target, elem in score_df.items()},
                index=common_index)

# (row, column) positions of missing p-values, in row-major order.
na_idx = [(int(i), int(j)) for i, j in np.argwhere(col.isna().values)]

# The heatmap is drawn with 0 in place of NaN; the cell text is fixed below.
col = col.fillna(0)

fig = ff.create_annotated_heatmap(col.round(decimals=4).values,
                                  x=[target for target in col.columns],
                                  y=[metric for cat, metric in col.index])
fig.layout.margin.l = 380

# Annotation index i * n_columns + j assumes row-by-row annotation order.
for i,j in na_idx:
    fig.layout.annotations[i*col.shape[1]+j]['text'] = "NaN"

target_out_dir = 'plots/summary'
os.makedirs(target_out_dir, exist_ok=True)

target_out_file = os.path.join(target_out_dir, "pearson_pval.pdf")

pio.write_image(fig, target_out_file, width=900, height=700)


invalid value encountered in subtract


invalid value encountered in reduce



In [90]:
# Summary heatmaps of p-values between every metric and the reference tree
# path length, one PDF per correlation function:
# plots/summary/<function name>_pval.pdf
reference_metric = ("tree_path", "Pairwise path length in reference tree")

# The scores do not depend on the correlation function, so load each target once.
target_scores = {target: extract_score_df(target)
                 for target in ['R-HSA-8982491', 'R-HSA-1474290', 'R-HSA-373755', 'R-HSA-422475','immune_only']}

for cor_method in [pearsonr, spearmanr]:
    score_df = {target: calculate_pvalues(scores, cor_fun=cor_method)
                for target, scores in target_scores.items()}
    common_index = score_df['R-HSA-8982491'][reference_metric].index

    # Passing Series aligns each target on the metric labels of common_index;
    # a metric a target lacks becomes NaN. The float cast keeps the columns
    # numeric so that fillna/round below act on them.
    col = DataFrame({target: elem[reference_metric].astype(float) for target, elem in score_df.items()},
                    index=common_index)

    # (row, column) positions of missing p-values, in row-major order.
    na_idx = [(int(i), int(j)) for i, j in np.argwhere(col.isna().values)]

    # The heatmap is drawn with 0 in place of NaN; the cell text is fixed below.
    col = col.fillna(0)

    fig = ff.create_annotated_heatmap(col.round(decimals=4).values,
                                      x=[target for target in col.columns],
                                      y=[metric for cat, metric in col.index])
    fig.layout.margin.l = 380

    # Annotation index i * n_columns + j assumes row-by-row annotation order.
    for i,j in na_idx:
        fig.layout.annotations[i*col.shape[1]+j]['text'] = "NaN"

    target_out_dir = 'plots/summary'
    os.makedirs(target_out_dir, exist_ok=True)

    target_out_file = os.path.join(target_out_dir, "%s_pval.pdf" % cor_method.__name__)

    pio.write_image(fig, target_out_file, width=900, height=700)


invalid value encountered in subtract


invalid value encountered in reduce

