In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap
from scipy import stats

In [None]:
# Root directory searched by get_model_data for a sub-directory named after the model.
data_path = "real_data_experiment_files/outputs"
# Directory the plotting cells join with a file name when saving figures.
# NOTE(review): nothing in this notebook creates it — confirm it exists before running.
export_path = "real_data_experiment_files/figures"

In [None]:
def _read_method_predictions(predictions_dir):
    """Read every ``<method>.csv`` in ``predictions_dir`` into a ``{method: DataFrame}`` dict."""
    return {
        method_file[:-4]: pd.read_csv(os.path.join(predictions_dir, method_file), index_col=0)
        for method_file in sorted(os.listdir(predictions_dir))
        if method_file.endswith(".csv")
    }


def get_model_data(model_name):
    """Load the flux tables of one model together with the predictions made from them.

    Files are looked up under ``<data_path>/<model_name>``:

    * ``fluxes/<key>.csv`` -> ``flux_data[key]``
    * ``predictions/leave_one_out/<key>/<method>.csv`` -> ``leave_one_out_predictions[key][method]``
    * ``predictions/full/<key>/<method>.csv`` -> ``full_predictions[key][method]``

    Every CSV is read with its first column as the index. A flux key without a
    matching prediction directory only triggers a printed warning; that key is
    then absent from the corresponding predictions dict.

    Parameters
    ----------
    model_name : str
        Name of the model directory directly under ``data_path``.

    Returns
    -------
    tuple of dict
        ``(flux_data, leave_one_out_predictions, full_predictions)``.

    Raises
    ------
    ValueError
        If ``data_path`` has no entry called ``model_name``.
    """
    if model_name not in os.listdir(data_path):
        raise ValueError(f"Model {model_name} not found in data path {data_path}")
    model_dir = os.path.join(data_path, model_name)
    fluxes_dir = os.path.join(model_dir, "fluxes")
    predictions_root = os.path.join(model_dir, "predictions")

    flux_data = {}
    leave_one_out_predictions = {}
    full_predictions = {}
    # Sorted listing: os.listdir order is arbitrary, sorting keeps the dict order reproducible.
    for flux_file in sorted(os.listdir(fluxes_dir)):
        if not flux_file.endswith(".csv"):
            continue
        key = flux_file[:-4]
        flux_data[key] = pd.read_csv(os.path.join(fluxes_dir, flux_file), index_col=0)
        # Both prediction kinds share the same <kind>/<key>/<method>.csv layout.
        for kind, store in (("leave_one_out", leave_one_out_predictions), ("full", full_predictions)):
            kind_dir = os.path.join(predictions_root, kind, key)
            if os.path.isdir(kind_dir):
                store[key] = _read_method_predictions(kind_dir)
            else:
                print(f"Warning: No {kind} predictions found for key {key} in model {model_name}")
    return flux_data, leave_one_out_predictions, full_predictions

In [None]:
def _loo_axis_entries(predicted, true, axis_name, method, data_label, min_pairs=4):
    """Spearman-correlate ``predicted`` with ``true`` label by label along one axis.

    ``axis_name`` is ``"reaction"`` (one correlation per column) or ``"sample"``
    (one correlation per row). A label is skipped when fewer than ``min_pairs``
    positions hold a value in both frames, because ``nan_policy='omit'`` drops
    incomplete pairs before correlating. Returns a list of result dicts with the
    keys ``method``, ``data``, ``axis``, ``correlation`` and ``p-val``.
    """
    by_column = axis_name == "reaction"
    labels = predicted.columns if by_column else predicted.index
    entries = []
    for label in labels:
        predicted_values = predicted[label] if by_column else predicted.loc[label]
        true_values = true[label] if by_column else true.loc[label]
        n_pairs = (predicted_values.notna() & true_values.notna()).sum()
        if n_pairs < min_pairs:
            continue
        corr_value, p_value = stats.spearmanr(predicted_values, true_values, nan_policy='omit')
        entries.append({
            "method": method,
            "data": data_label,
            "axis": axis_name,
            "correlation": corr_value,
            "p-val": p_value
        })
    return entries


def compute_loo_correlations(flux_data, predictions, abs_values):
    """Correlate each method's leave-one-out predictions with the fluxes they predict.

    For every data source in ``flux_data`` whose name (underscores replaced by
    spaces) is listed in ``relevant_loos``, and every prediction method except
    "RawInput" and "Reference Values", the predictions are aligned with the true
    fluxes on their shared index and columns. Spearman correlations are then
    computed per column (axis "reaction") and per row (axis "sample").

    Parameters
    ----------
    flux_data : dict
        Maps data-source name to the DataFrame of true fluxes.
    predictions : dict
        Maps data-source name to ``{method name: DataFrame of predicted fluxes}``.
        Missing predicted values are treated as 0.
    abs_values : bool
        If true, correlate absolute values of both frames.

    Returns
    -------
    pandas.DataFrame
        One row per computed correlation with columns ``method``, ``data``,
        ``axis``, ``correlation`` and ``p-val``. The columns are present even
        when no correlation could be computed.

    Raises
    ------
    AssertionError
        If a method's predictions share no index label or no column with the
        true fluxes of their data source.
    """
    entries = []
    for data_source in flux_data:
        print("data source:", data_source)
        data_label = data_source.replace("_", " ")
        if data_label not in relevant_loos:
            print("Skipping data source")
            continue
        true_fluxes = flux_data[data_source]
        if data_source not in predictions:
            print(f"Warning: No predictions for data source {data_source}")
            continue
        for method, raw_predictions in predictions[data_source].items():
            # These two entries are not predictions of a method under evaluation.
            if method in ("RawInput", "Reference Values"):
                continue
            predicted_fluxes = raw_predictions.fillna(0)
            if predicted_fluxes.shape[1] == 0:
                print("Empty predictions for method {}, data source {}".format(method, data_source))
                continue
            common_indices = true_fluxes.index.intersection(predicted_fluxes.index)
            common_columns = true_fluxes.columns.intersection(predicted_fluxes.columns)
            assert len(common_indices) > 0, f"No common indices between true fluxes and predictions for data source {data_source} using method {method}"
            assert len(common_columns) > 0, f"No common columns between true fluxes and predictions for data source {data_source} using method {method}"
            true_aligned = true_fluxes.loc[common_indices, common_columns]
            predicted_aligned = predicted_fluxes.loc[common_indices, common_columns]
            if abs_values:
                true_aligned = true_aligned.abs()
                predicted_aligned = predicted_aligned.abs()

            entries.extend(_loo_axis_entries(predicted_aligned, true_aligned, "reaction", method, data_label))

            sample_entries = _loo_axis_entries(predicted_aligned, true_aligned, "sample", method, data_label)
            sample_p_values = [entry["p-val"] for entry in sample_entries]
            if sample_p_values:
                print("data source {}, method {}, samples mean p-value {:.2e}, min p-value {:.2e}, max p-value {:.2e}".format(
                    data_source, method, np.mean(sample_p_values), np.min(sample_p_values), np.max(sample_p_values)))
            else:
                # np.min / np.max raise on an empty sequence, so report the situation instead.
                print("data source {}, method {}: no sample had enough paired values for a correlation".format(data_source, method))
            entries.extend(sample_entries)
    return pd.DataFrame(entries, columns=["method", "data", "axis", "correlation", "p-val"])

In [None]:
def compute_full_correlations(flux_data, predictions, abs_values):
    """Correlate full predictions with true fluxes over every input/target pair.

    Unlike the leave-one-out variant, predictions made from one data source are
    compared against the fluxes of every target source. Source names have
    underscores replaced by spaces; only targets listed in ``relevant_targets``
    and inputs listed in ``relevant_inputs`` are used. Pairs whose frames share
    no index label or no column are skipped with a message.

    Spearman correlations are computed per column (axis "reaction", only when
    more than 3 rows are shared) and per row (axis "sample"), with missing
    predictions treated as 0 and, if ``abs_values`` is true, on absolute values.

    Returns a DataFrame with one row per correlation, holding the method, the
    input data, the target data, the axis, the correlation and its p-value.
    """
    rows = []
    predictions_by_label = {name.replace("_", " "): frames for name, frames in predictions.items()}
    fluxes_by_label = {name.replace("_", " "): frame for name, frame in flux_data.items()}
    target_names = sorted(set(fluxes_by_label.keys()).intersection(relevant_targets))
    input_names = sorted(set(predictions_by_label.keys()).intersection(relevant_inputs))

    for target_source in target_names:
        print("target source:", target_source)
        true_fluxes = fluxes_by_label[target_source]
        for input_source in input_names:
            print("input source:", input_source)
            for method, method_frame in predictions_by_label[input_source].items():
                print("method:", method)
                predicted_fluxes_method = method_frame.fillna(0)
                shared_rows = true_fluxes.index.intersection(predicted_fluxes_method.index)
                shared_cols = true_fluxes.columns.intersection(predicted_fluxes_method.columns)
                print(f"Computing correlations for input source: {input_source}, target_source: {target_source}, method: {method}")
                if len(shared_rows) == 0 or len(shared_cols) == 0:
                    print(f"Skipping input source: {input_source}, target_source: {target_source}, method: {method} due to no common indices or columns")
                    continue
                true_aligned = true_fluxes.loc[shared_rows, shared_cols]
                predicted_aligned = predicted_fluxes_method.loc[shared_rows, shared_cols]
                if abs_values:
                    true_aligned = abs(true_aligned)
                    predicted_aligned = abs(predicted_aligned)

                # Per-reaction correlations need more than 3 rows; the row count is the
                # same for every column, so it is checked once.
                reaction_stats = {}
                if len(predicted_aligned) > 3:
                    for reaction in predicted_aligned.columns:
                        corr_value, p_value = stats.spearmanr(predicted_aligned[reaction], true_aligned[reaction], nan_policy='omit')
                        reaction_stats[reaction] = (corr_value, p_value)
                for reaction, (corr_value, p_value) in reaction_stats.items():
                    rows.append({
                        "method": method,
                        "input data": input_source,
                        "target data": target_source,
                        "axis": "reaction",
                        "correlation": corr_value,
                        "p-val": p_value
                    })

                # Per-sample correlations are computed for every shared row.
                sample_stats = {}
                for sample in predicted_aligned.index:
                    corr_value, p_value = stats.spearmanr(predicted_aligned.loc[sample], true_aligned.loc[sample], nan_policy='omit')
                    sample_stats[sample] = (corr_value, p_value)
                sample_p_values = [p_value for _, p_value in sample_stats.values()]
                print("target source {}, input source {}, method {}, samples mean p-value {:.2e}, min p-value {:.2e}, max p-value {:.2e}".format(
                    target_source, input_source, method,
                    np.mean(sample_p_values), np.min(sample_p_values), np.max(sample_p_values)))
                for sample, (corr_value, p_value) in sample_stats.items():
                    rows.append({
                        "method": method,
                        "input data": input_source,
                        "target data": target_source,
                        "axis": "sample",
                        "p-val": p_value,
                        "correlation": corr_value
                    })
    return pd.DataFrame(rows)

In [None]:
def plot_full_corrs(df, model_name):
    """Show one figure per (target, input) pair, first for reactions, then for samples.

    ``df`` is split by its ``axis`` column. For each axis, the target and input
    names present in the data and listed in ``relevant_targets`` /
    ``relevant_inputs`` are visited in sorted order. Each pair gets its own
    8x6 figure of ``correlation`` with ``method`` as hue: a boxenplot when the
    pair has more than 10 rows, a barplot otherwise. A file name built from
    ``model_name`` and the pair is printed for every figure; nothing is saved.
    """
    # (axis value, file-name template) in plotting order.
    panels = (
        ("reaction", "{}_rxns_full_{}_to_{}"),
        ("sample", "{}_samples_full_{}_to_{}"),
    )
    for axis_name, name_template in panels:
        is_reaction_pass = axis_name == "reaction"
        axis_df = df[df['axis'] == axis_name]
        targets = sorted(set(axis_df['target data'].unique()).intersection(relevant_targets))
        inputs = sorted(set(axis_df['input data'].unique()).intersection(relevant_inputs))
        if is_reaction_pass:
            print("Targets: ", targets)
            print("Inputs: ", inputs)
        for target_data in targets:
            for input_data in inputs:
                plt.figure(figsize=(8, 6))
                plt.xlabel("Method")
                pair_df = axis_df[(axis_df['input data'] == input_data) & (axis_df['target data'] == target_data)]
                if not is_reaction_pass:
                    print(pair_df)
                # Few observations: fall back from a distribution plot to plain bars.
                draw = sns.boxenplot if len(pair_df) > 10 else sns.barplot
                draw(data=pair_df.sort_values(by=['method']), hue='method', y='correlation')
                filename = name_template.format(model_name, input_data, target_data) + ".png"
                print(filename)
                plt.show()

In [None]:
def filter_pathways(pathways_to_rxns, name_filter, min_length):
    """Keep the pathways accepted by ``name_filter`` that have at least ``min_length`` reactions.

    ``pathways_to_rxns`` maps a pathway to its collection of reactions and
    ``name_filter`` is called with each pathway key. Returns a new dict; the
    input mapping is not modified.
    """
    return {
        pathway: reactions
        for pathway, reactions in pathways_to_rxns.items()
        if name_filter(pathway) and len(reactions) >= min_length
    }

In [None]:
# Data sources (names with spaces instead of underscores) evaluated by compute_loo_correlations.
relevant_loos = [
    "activities",
    "intracellular",
    "fluxmapped activities",
    "intracellular ACH-000019",
    "intracellular ACH-000681",
    "fold intracellular",
    "NCI60 exchanges",
    "joint intracellular and exchanges ACH-000019",
    "joint intracellular and exchanges ACH-000681",
]
# Prediction input sources used by compute_full_correlations and plot_full_corrs.
relevant_inputs = [
    "activities",
    "fluxmapped activities",
    "NCI60 exchanges",
]
# Flux data sets used as comparison targets by compute_full_correlations and plot_full_corrs.
relevant_targets = [
    "NCI60 exchanges",
    "intracellular ACH-000019",
    "intracellular ACH-000681",
    "intracellular",
    "fold intracellular",
    "joint intracellular and exchanges ACH-000019",
    "joint intracellular and exchanges ACH-000681",
]
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
relevant_methods = [
    "FBA",
    "FBApro",
    "FBAproLowMid",
    "FBAproHighMid",
    "iMAT",
    "GIMME",
    "Reference Values",
]


In [None]:
# Load the true fluxes plus the leave-one-out and full predictions stored under <data_path>/recon1.
recon1_flux_data, recon1_loo_predictions, recon1_full_predictions = get_model_data("recon1")

In [None]:
# Seaborn theme with enlarged fonts; some later plotting cells call sns.set again with their own style and scale.
sns.set(font_scale=1.5)

# Recon1

## Leave-one-out predictions

In [None]:
# Signed (abs_values=False) per-reaction and per-sample Spearman correlations of the leave-one-out predictions.
loo_correlation_df = compute_loo_correlations(recon1_flux_data, recon1_loo_predictions, abs_values=False)

In [None]:
# Leave-one-out correlations as boxen (letter-value) plots: one panel per correlation
# axis ("reaction" / "sample"), data source on x, method as hue.
sns.set_style('whitegrid')
g = sns.catplot(
    data=loo_correlation_df.sort_values(by=['data', 'method']),
    x="data",
    y="correlation",
    hue="method",
    col="axis",
    kind="boxen",
    height=5,
    aspect=1.5,
    sharey=False
)
g.set_axis_labels("Data Source", "Spearman Correlation")
g.set_titles(col_template="Recon1 {col_name} signed correlations")
# Wrap long tick labels instead of rotating them; done once per panel.
for ax in g.axes.flat:
    labels = [textwrap.fill(label.get_text(), 15) for label in ax.get_xticklabels()]
    ax.set_xticklabels(labels)
g._legend.set_title("Method")
g._legend.set_bbox_to_anchor((1.1, 0.75))
plt.tight_layout()

# The boxen version gets its own file name: the bar-chart cell that follows saves
# "recon1_loo_signed.png", so sharing that name would overwrite this figure on disk.
filename = "recon1_loo_signed_boxen.png"
g.savefig(os.path.join(export_path, filename))

plt.show()


In [None]:
# Leave-one-out correlations as a grouped bar chart (kind="bar"): one panel per correlation
# axis ("reaction" / "sample"), data source on x, method as hue.
sns.set_style('whitegrid')
g = sns.catplot(
    data=loo_correlation_df.sort_values(by=['data', 'method']),
    x="data",
    y="correlation",
    hue="method",
    col="axis",
    kind="bar",
    height=5,
    aspect=1.5,
    sharey=False
)
g.set_axis_labels("Data Source", "Spearman Correlation")
g.set_titles(col_template="Recon1 {col_name} signed correlations")
# Wrap long x tick labels at 15 characters instead of rotating them; done once per panel.
for ax in g.axes.flat:
    labels = [textwrap.fill(label.get_text(), 15) for label in ax.get_xticklabels()]
    ax.set_xticklabels(labels)
# Retitle the figure-level legend and move it via its bbox anchor.
g._legend.set_title("Method")
g._legend.set_bbox_to_anchor((1.1, 0.75))
plt.tight_layout()

filename =  "recon1_loo_signed.png"
g.savefig(os.path.join(export_path, filename))

plt.show()


In [None]:
# Bar chart of per-sample leave-one-out correlations for the joint and NCI60 data sources,
# with display-friendly data-source and method labels.
sns.set(style="darkgrid", font_scale=2)

# Select rows on the original source names first, so the mask and the frame it
# indexes always share the same rows; the relabelling below no longer contains 'joint'.
is_sample = loo_correlation_df['axis'] == 'sample'
is_shown_source = (loo_correlation_df['data'].str.contains('joint')
                   | loo_correlation_df['data'].str.contains('NCI60'))
replaced = loo_correlation_df.loc[is_sample & is_shown_source].copy()
replaced['data'] = (
    replaced['data']
    .str.replace("joint intracellular and exchanges", "Joined fluxes", regex=False)
    .str.replace("ACH-000681", "A549", regex=False)
    .str.replace("ACH-000019", "MCF7", regex=False)
)
replaced['method'] = (
    replaced['method']
    .str.replace("LowMid", "Partial", regex=False)
    .str.replace("HighMid", "Fixed", regex=False)
)
# Drop every method whose name contains 'Scale'.
replaced = replaced.loc[~replaced['method'].str.contains('Scale')]

g = sns.catplot(
    data=replaced.sort_values(by=['data', 'method']),
    x="data",
    y="correlation",
    hue="method",
    kind="bar",
    height=7,
    aspect=1.5,
    sharey=True
)
plt.ylabel("Per-Sample Spearman Correlation")
plt.xlabel("")
# NOTE(review): no `col` facet is requested above, so this title template appears to
# have nothing to apply to — confirm and drop.
g.set_titles(col_template="Core recon {col_name} signed correlations")
# Wrap long tick labels instead of rotating them.
for ax in g.axes.flat:
    labels = [textwrap.fill(label.get_text(), 15) for label in ax.get_xticklabels()]
    ax.set_xticklabels(labels)
g._legend.set_title("Method")
g._legend.set_bbox_to_anchor((1.2, 0.7))
plt.tight_layout()

# Own file name: the next cell saves its reordered version of this figure as
# "recon1_loo.png", so sharing that name would overwrite this one on disk.
filename = "recon1_loo_default_order.png"
g.savefig(os.path.join(export_path, filename))

plt.show()


In [None]:
# Bar chart of per-sample leave-one-out correlations for the joint and NCI60 data sources,
# with display labels and a fixed method order.
sns.set(style="darkgrid", font_scale=2)

# Select rows on the original source names first, so the mask and the frame it
# indexes always share the same rows; the relabelling below no longer contains 'joint'.
is_sample = loo_correlation_df['axis'] == 'sample'
is_shown_source = (loo_correlation_df['data'].str.contains('joint')
                   | loo_correlation_df['data'].str.contains('NCI60'))
replaced = loo_correlation_df.loc[is_sample & is_shown_source].copy()
replaced['data'] = (
    replaced['data']
    .str.replace("joint intracellular and exchanges", "Exchange & Intracellular", regex=False)
    .str.replace("ACH-000681", "A549", regex=False)
    .str.replace("ACH-000019", "MCF7", regex=False)
)
replaced['method'] = (
    replaced['method']
    .str.replace("LowMid", "Partial", regex=False)
    .str.replace("HighMid", "Fixed", regex=False)
    .str.replace("Reference Values", "Mid-bound Benchmark", regex=False)
)
# Drop every method whose name contains 'Scale' or 'Clip'; copy so the column
# assignment below works on an independent frame.
replaced = replaced.loc[~replaced['method'].str.contains('Scale') & ~replaced['method'].str.contains('Clip')].copy()

# Fixed category order for the hue; a method name that is not listed here becomes
# NaN in this column.
replaced['method'] = pd.Categorical(replaced['method'],
                                    ["FBApro",
                                     "FBAproPartial",
                                     "FBAproFixed",
                                     "FBA",
                                     "MoMA",
                                     "iMAT",
                                     ])

g = sns.catplot(
    data=replaced.sort_values(by=['data', 'method']),
    x="data",
    y="correlation",
    hue="method",
    kind="bar",
    height=8,
    aspect=1.5,
    sharey=True
)
plt.ylabel("Per-Sample Spearman Correlation")
plt.xlabel("")
# NOTE(review): no `col` facet is requested above, so this title template appears to
# have nothing to apply to — confirm and drop.
g.set_titles(col_template="Core recon {col_name} signed correlations")
# Wrap long tick labels instead of rotating them.
for ax in g.axes.flat:
    labels = [textwrap.fill(label.get_text(), 25) for label in ax.get_xticklabels()]
    ax.set_xticklabels(labels)
g._legend.set_title("Method")
g._legend.set_bbox_to_anchor((1.12, 0.7))
plt.tight_layout()

filename = "recon1_loo.png"
g.savefig(os.path.join(export_path, filename))

plt.show()


## Full predictions

In [None]:
# Signed (abs_values=False) correlations of the full predictions against every relevant target data set.
full_correlation_df = compute_full_correlations(recon1_flux_data, recon1_full_predictions, abs_values=False)

In [None]:
# Input -> target pairs shown in the next figure, written with the display labels and the
# '\n->\n' separator that the plotting cell builds.
# Each entry is a plain string (the outer parentheses do not create tuples). The parentheses
# inside the quotes act as regex groups once the entries are joined with '|' and passed to
# str.contains, which matches them as substrings.
input_target_pairs = [('(Mapped Activities\n->\nNCI60 exchanges)'), 
                      # ('(Mapped Activities\n->\nFluxes MCF7)'),
                      # ('(Mapped Activities\n->\nFluxes A549)'),
                      ('(Mapped Activities\n->\nJoined Fluxes MCF7)'),
                      ('(Mapped Activities\n->\nJoined Fluxes A549)'),                      
                      ('(Activities\n->\nJoined Fluxes MCF7)'),
                      ('(Activities\n->\nJoined Fluxes A549)'),                      
                      # ('(NCI60 exchanges\n->\nFluxes A549)'),
                      # ('(NCI60 exchanges\n->\nFluxes MCF7)'),
                     ]
# full_correlation_df['input data'].unique(), full_correlation_df['target data'].unique() 

In [None]:
# Per-sample full-prediction correlations for the pairs listed in input_target_pairs,
# as a grouped bar chart with one group per input -> target pair.
data = full_correlation_df.copy()
data['target data'] = (
    data['target data']
    .str.replace("intracellular ACH-000019", "Fluxes MCF7", regex=False)
    .str.replace("intracellular ACH-000681", "Fluxes A549", regex=False)
    .str.replace("joint intracellular and exchanges ACH-000019", "Joined Fluxes MCF7", regex=False)
    .str.replace("joint intracellular and exchanges ACH-000681", "Joined Fluxes A549", regex=False)
)
data['input data'] = (
    data['input data']
    .str.replace("fluxmapped activities", "Mapped Activities", regex=False)
    .str.replace("activities", "Activities", regex=False)
)
data['input target pair'] = data['input data'] + '\n->\n' + data['target data']
print(data['input target pair'].unique())

# One combined mask built from the frame it is applied to: listed pair, not GIMME,
# and no method whose name contains 'Scale'.
pair_mask = data['input target pair'].str.contains('|'.join(input_target_pairs))
method_mask = (data['method'] != 'GIMME') & ~data['method'].str.contains('Scale')
data = data.loc[pair_mask & method_mask].copy()

g = sns.catplot(
    data=data.loc[data['axis'] == 'sample'].sort_values(by=['method', 'input target pair']),
    x="input target pair",
    y="correlation",
    hue="method",
    kind="bar",
    height=7,
    aspect=1.5,
    sharey=False
)
g.set_axis_labels("Input Source -> Target Source", "Per-Sample Spearman Correlation")
g._legend.set_title("Method")
g._legend.set_bbox_to_anchor((1.2, 0.7))
plt.tight_layout()

# Own file name: a later cell saves the final mapping figure as "recon1_mapping.png",
# so sharing that name would overwrite this one on disk.
filename = "recon1_mapping_all_inputs.png"
g.savefig(os.path.join(export_path, filename))

plt.show()


In [None]:
# Input -> target pairs for the final mapping figure, using the "Exchange & Intracellular"
# target labels and the '\n->\n' separator that the plotting cell builds.
# Each entry is a plain string (the outer parentheses do not create tuples). The parentheses
# inside the quotes act as regex groups once the entries are joined with '|' and passed to
# str.contains, which matches them as substrings.
input_target_pairs = [('(Mapped Activities\n->\nNCI60 exchanges)'), 
                      # ('(Mapped Activities\n->\nFluxes MCF7)'),
                      # ('(Mapped Activities\n->\nFluxes A549)'),
                      ('(Mapped Activities\n->\nExchange & Intracellular MCF7)'),
                      ('(Mapped Activities\n->\nExchange & Intracellular A549)'),                      
                      # ('(Activities\n->\nJoined Fluxes MCF7)'),
                      # ('(Activities\n->\nJoined Fluxes A549)'),                      
                      # ('(NCI60 exchanges\n->\nFluxes A549)'),
                      # ('(NCI60 exchanges\n->\nFluxes MCF7)'),
                     ]
# full_correlation_df['input data'].unique(), full_correlation_df['target data'].unique() 

In [None]:
# Final mapping figure: per-sample full-prediction correlations for the pairs listed in
# input_target_pairs, with display labels and a fixed method order.
data = full_correlation_df.copy()
data['target data'] = (
    data['target data']
    .str.replace("intracellular ACH-000019", "Fluxes MCF7", regex=False)
    .str.replace("intracellular ACH-000681", "Fluxes A549", regex=False)
    .str.replace("joint intracellular and exchanges ACH-000019", "Exchange & Intracellular MCF7", regex=False)
    .str.replace("joint intracellular and exchanges ACH-000681", "Exchange & Intracellular A549", regex=False)
)
data['input data'] = (
    data['input data']
    .str.replace("fluxmapped activities", "Mapped Activities", regex=False)
    .str.replace("activities", "Activities", regex=False)
)
data['input target pair'] = data['input data'] + '\n->\n' + data['target data']
print(data['input target pair'].unique())

# Keep the listed pairs and drop GIMME; copy so the column assignments below work
# on an independent frame instead of a slice of the previous one.
pair_mask = data['input target pair'].str.contains('|'.join(input_target_pairs))
data = data.loc[pair_mask & (data['method'] != 'GIMME')].copy()
data['method'] = (
    data['method']
    .str.replace("LowMid", "Partial", regex=False)
    .str.replace("HighMid", "Fixed", regex=False)
    .str.replace("Reference Values", "Mid-bound Benchmark", regex=False)
)
# Drop every method whose name contains 'Scale' or 'Clip'.
data = data.loc[~data['method'].str.contains('Scale') & ~data['method'].str.contains('Clip')].copy()

# Fixed category order for the hue; a method name that is not listed here becomes
# NaN in this column.
data['method'] = pd.Categorical(data['method'],
                                ["FBApro",
                                 "FBAproPartial",
                                 "FBAproFixed",
                                 "FBA",
                                 "MoMA",
                                 "iMAT",
                                 "Mid-bound Benchmark",
                                 ])

sns.set(style="darkgrid", font_scale=2)
g = sns.catplot(
    data=data.loc[data['axis'] == 'sample'].sort_values(by=['method', 'input target pair']),
    x="input target pair",
    y="correlation",
    hue="method",
    kind="bar",
    height=8,
    aspect=1.5,
    sharey=False
)
plt.xlabel("")
plt.ylabel("Per-Sample Spearman Correlation")
g._legend.set_title("Method")
g._legend.set_bbox_to_anchor((1.22, 0.7))
plt.tight_layout()

filename = "recon1_mapping.png"
g.savefig(os.path.join(export_path, filename))

plt.show()


In [None]:
# Draw one figure per input -> target pair listed in input_target_pairs.
# The patterns in input_target_pairs use display labels joined by '\n->\n', so the raw
# source names are relabelled the same way before matching; matching raw names joined
# by ', ' can never hit those patterns and leaves nothing to plot.
# The rows passed on keep their raw names, because plot_full_corrs selects pairs via
# relevant_inputs / relevant_targets.
target_labels = (
    full_correlation_df['target data']
    .str.replace("intracellular ACH-000019", "Fluxes MCF7", regex=False)
    .str.replace("intracellular ACH-000681", "Fluxes A549", regex=False)
    .str.replace("joint intracellular and exchanges ACH-000019", "Exchange & Intracellular MCF7", regex=False)
    .str.replace("joint intracellular and exchanges ACH-000681", "Exchange & Intracellular A549", regex=False)
)
input_labels = (
    full_correlation_df['input data']
    .str.replace("fluxmapped activities", "Mapped Activities", regex=False)
    .str.replace("activities", "Activities", regex=False)
)
input_target_values = input_labels + '\n->\n' + target_labels
pair_mask = input_target_values.str.contains('|'.join(input_target_pairs))
plot_full_corrs(full_correlation_df.loc[pair_mask], "Recon1")