In [None]:
import os, sys
import numpy as np
import pandas as pd
import cobra
from matplotlib import pyplot as plt
import requests
import projection_methods
import torch
import time
from ge_processing import SampleNormalizationMethod, ArithmetizationMethod, process_full_ge, ge_to_reaction_activities
import swifter

In [None]:
# Set up gurobi or other optimizer license file if needed
# os.environ['GRB_LICENSE_FILE'] = "/path/to/your/gurobi.lic"

# Load data

## Model

In [None]:
model = cobra.io.read_sbml_model("real_data_experiment_files/data/RECON1.xml") # a metabolic model in sbml format

In [None]:
len(model.metabolites), len(model.reactions), len(model.genes), len(model.compartments)

## Flux data (NCI60 exchanges and intracellular fluxes for A549, MCF7)

In [None]:
# Expected CSV layout: a "sample_id" column (ids agreeing with the gene expression and
# intracellular tables) plus one column per reaction id.
exchange_flux_csv = "real_data_experiment_files/data/nci60_exchange_data.csv"
exchange_flux_data = pd.read_csv(exchange_flux_csv).set_index('sample_id')

In [None]:
# Expected CSV layout: a "sample_id" column (ids agreeing with the gene expression and
# exchange tables) plus one column per reaction id. Unmeasured reactions are NaN.
intracellular_flux_data = pd.read_csv("real_data_experiment_files/data/intracellular_data.csv").set_index('sample_id')

# intracellular_flux_data = intracellular_flux_data.rename(columns={c: c.replace("_L(e)", "__L_e") for c in intracellular_flux_data.columns}) # conversions specific to name formats in data and Recon1 model.

# Summarize how many reactions are measured per sample. The labels are derived from the ids that are
# actually looked up, so the printed text cannot drift from the rows being counted.
summary_sample_ids = ['ACH-000019', 'ACH-000681']
measured_per_sample = [int(intracellular_flux_data.loc[sample_id].notna().sum()) for sample_id in summary_sample_ids]
# "intersection" = reactions with a value in every row of the table.
measured_in_all_samples = int(intracellular_flux_data.notna().all(axis=0).sum())

print("intracellular #rxns: {}, #non-nan {}: {}, # non-nan {}: {}, intersection: {}".format(
    len(intracellular_flux_data.columns),
    summary_sample_ids[0], measured_per_sample[0],
    summary_sample_ids[1], measured_per_sample[1],
    measured_in_all_samples
))

In [None]:
# Exchange measurements for the samples that also have intracellular measurements,
# joined column-wise on the sample id index.
exchange_for_intracellular_samples = exchange_flux_data.loc[intracellular_flux_data.index]
joint_intracellular_and_exchanges = pd.merge(
    intracellular_flux_data,
    exchange_for_intracellular_samples,
    left_index=True,
    right_index=True,
)

## Expression

In [None]:
# Expected CSV layout: a "sample_id" column; every other column is the id of a model gene
# (up to a transcript variant separator suffix).
expression_csv = "real_data_experiment_files/data/ccle_gene_expression.csv"
expression_data = pd.read_csv(expression_csv).set_index('sample_id')
# Keep only the samples that also appear in the exchange flux table.
has_exchange_measurements = expression_data.index.isin(exchange_flux_data.index)
expression_data = expression_data.loc[has_exchange_measurements]

In [None]:
expression_data

In [None]:
gene_transcript_variant_separator = "_"
# Model gene ids are compared by the part before the first separator.
model_base_gene_ids = [gene.id.split(gene_transcript_variant_separator)[0] for gene in model.genes]
common_genes = set(expression_data.columns) & set(model_base_gene_ids)
len(expression_data.columns), len(model.genes), len(common_genes)

## Convert expression to activities

In [None]:
# Keep only the expression columns whose name matches a model gene id
# (model ids compared by the part before the transcript-variant separator).
model_gene_names = [gene.id.split(gene_transcript_variant_separator)[0] for gene in model.genes]
matching_columns = expression_data.columns.intersection(model_gene_names)
expression_data = expression_data[matching_columns]
expression_data

In [None]:
expression_data.transpose().duplicated().sum()

In [None]:
# take mean of duplicated columns (replicas)
# `DataFrame.groupby(..., axis=1)` is deprecated in recent pandas releases; grouping the transposed
# frame by its index labels and transposing back averages same-named columns in the same way.
expression_data = expression_data.T.groupby(level=0).mean().T

In [None]:
activities_cache_path = "real_data_experiment_files/caching/activities.csv"
model_suffix = "_AT[0-9]+" # a regex version of the gene suffix, to join different transcript variants of a gene

# NOTE: the cache is keyed by path only. Delete the file after changing the model, the expression
# data or any process_full_ge argument, otherwise stale activities are silently reloaded.
if os.path.exists(activities_cache_path):
    print("Loading activities from cache...")
    activities = pd.read_csv(activities_cache_path, index_col=0)
else:
    # Create the cache directory before the expensive computation, so a missing or unwritable
    # directory fails immediately instead of discarding the result at to_csv time.
    os.makedirs(os.path.dirname(activities_cache_path), exist_ok=True)
    print("Computing activities...")
    activities = process_full_ge(model, expression_data, verbose=True, fixed_range_activities=True, model_suffix=model_suffix,
                                 gene_zero_fraction_threshold=0.50, sample_normalization_method=SampleNormalizationMethod.quantile, post_processing_centering=False, arithmetization_mode=ArithmetizationMethod.arithmeangeomean)
    activities.to_csv(activities_cache_path)
    print(f"Activities saved to cache at {activities_cache_path}")

In [None]:
# Positional indices of the reaction columns that hold an activity value for at least one sample.
activities_confident_indices = [
    position
    for position, has_any_value in enumerate(activities.notna().any(axis=0))
    if has_any_value
]

In [None]:
print("reactions with non-nan activity mapping: ", (activities.isna().sum(axis=0) == 0).sum(), "/", len(activities.columns))

In [None]:
plt.hist(activities.values.flatten())

In [None]:
# Affine rescale x -> (x + 1) / 2 (presumably taking activities from [-1, 1] to [0, 1] —
# TODO confirm the range returned by process_full_ge with fixed_range_activities=True).
# NOTE: this overwrites `activities` in place, so running the cell twice rescales twice;
# re-run the cell that loads/computes `activities` before re-running this one.
activities = (activities + 1) / 2
# The rescaled values are later mapped linearly onto each reaction's bound interval (flux-map cell).
# map an activity in range [0, 1] to the corresponding reaction's [lower_bound, upper_bound]

In [None]:
# assert all reactions are included
assert (activities.columns == [r.id for r in model.reactions]).all()

In [None]:
# set model objective manually if desired (or if model has none)
model_objective = "S6T14g"
model.objective = model_objective
# maximize sense
model.objective.sense = 'max'

In [None]:
# Optional: uncomment to relax every reaction's bounds before running FVA.
# for r in model.reactions:
#     r.bounds = (-1000, 1000)

# Margin by which every FVA-derived bound is widened on each side.
bounds_epsilon = 1e-5

fva = cobra.flux_analysis.flux_variability_analysis(
    model,
    fraction_of_optimum=0.9,
    # pfba_factor=1.0,
    loopless=True,
)

widened_minimum = fva.minimum.to_numpy() - bounds_epsilon
widened_maximum = fva.maximum.to_numpy() + bounds_epsilon

# Per-reaction bounds with a positional (0..n-1) index, in FVA row order.
l_bounds_series = pd.Series(widened_minimum)
u_bounds_series = pd.Series(widened_maximum)

# Per-sample bounds: the same row repeated once per sample in `activities`.
reaction_id_columns = [r.id for r in model.reactions]
sample_count = len(activities)
l_bounds = pd.DataFrame(
    data=np.tile(widened_minimum, (sample_count, 1)),
    index=activities.index,
    columns=reaction_id_columns,
)
u_bounds = pd.DataFrame(
    data=np.tile(widened_maximum, (sample_count, 1)),
    index=activities.index,
    columns=reaction_id_columns,
)

# Tighten the model itself to the widened FVA range.
for r in model.reactions:
    fva_row = fva.loc[r.id]
    r.bounds = (fva_row['minimum'] - bounds_epsilon, fva_row['maximum'] + bounds_epsilon)


In [None]:
# Count reactions whose FVA range is exactly [0, 0].
fva_range_is_zero = (fva['minimum'] == 0) & (fva['maximum'] == 0)
x = int(fva_range_is_zero.sum())
print("Blocked: {}/{}".format(x, len(model.reactions)))


In [None]:
# Among reactions that have an activity value for at least one sample, count those whose FVA
# range is exactly [0, 0]. Numerator and denominator use the same "has an activity" criterion
# (column not entirely NaN), so the printed ratio refers to a single population.
x = 0
reactions_with_activity = 0
for r in model.reactions:
    if activities[r.id].isna().all():
        continue
    reactions_with_activity += 1
    if fva.loc[r.id, 'minimum'] == 0 and fva.loc[r.id, 'maximum'] == 0:
        x += 1
print("Blocked with activities: {}/ {}".format(x, reactions_with_activity))


In [None]:
# flux map
# Map each reaction's [0, 1] activity linearly onto its (epsilon-widened) FVA flux interval.
fluxmapped_activities = activities.copy()
assert (fluxmapped_activities.columns == [r.id for r in model.reactions]).all()
bis = 0
negs = 0
pos = 0
for i, r in enumerate(model.reactions):
    lb = l_bounds_series[i]
    ub = u_bounds_series[i]
    if (lb < -bounds_epsilon) and (ub > bounds_epsilon):
        # Can run in both directions: activity carries no sign information, so use 0 flux.
        fluxmapped_activities[r.id] = 0
        bis += 1
    elif lb < -bounds_epsilon:
        # Reverse-only reaction (not bidirectional, yet it reaches negative flux): reverse the activity
        # meaning so activity 0 -> ub and activity 1 -> lb (the most negative flux).
        # The bounds were widened by bounds_epsilon, so a reaction whose FVA maximum is exactly 0 has
        # ub == +bounds_epsilon; testing `ub < -bounds_epsilon` would misclassify it as forward-only
        # and map high activity to ~0 flux.
        fluxmapped_activities[r.id] = fluxmapped_activities[r.id] * (lb - ub) + ub
        negs += 1
    else:
        # Forward-only (or blocked) reaction: activity 0 -> lb, activity 1 -> ub.
        fluxmapped_activities[r.id] = fluxmapped_activities[r.id] * (ub - lb) + lb
        pos += 1
print("Negatives: {}, positives: {}, bidirectionals: {}".format(negs, pos, bis))

In [None]:
# Root directory under which every CSV produced by this notebook is written.
export_path = "real_data_experiment_files/outputs/recon1"

def export(df, path):
    """Write `df` as CSV to `path`, taken relative to `export_path`.

    Missing parent directories of the destination file are created first.
    """
    destination = os.path.join(export_path, path)
    parent_dir, _file_name = os.path.split(destination)
    os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(destination)
    
# export activities, fluxmapped activities and flux_rates
export(fluxmapped_activities, "fluxes/fluxmapped_activities.csv")
export(exchange_flux_data, "fluxes/NCI60_exchanges.csv")
# One file per cell line, containing only the reactions that have a value for that cell line.
for cell_line in intracellular_flux_data.index:
    filtered_flux_df = intracellular_flux_data.loc[[cell_line]].dropna(axis=1)
    export(filtered_flux_df, "fluxes/intracellular_{}.csv".format(cell_line))

## Prediction prep

In [None]:
S = cobra.util.array.create_stoichiometric_matrix(model).astype(float)
# projection = projection_methods.FbaProjectionLowMidConfidence(stoichiometric_matrix=S, unknown_indices=unknown_indices)

In [None]:
def full_apply(model, S, known_indices, filtered_flux_df, l_bounds, u_bounds):
    """Run every class in the global `methods` list on the full set of known fluxes.

    Parameters
    ----------
    model : object whose `reactions` can be iterated (items expose `.id`) and measured with `len`.
    S : stoichiometric matrix, forwarded unchanged to each method constructor.
    known_indices : list of positional reaction indices whose values in `filtered_flux_df` are used.
    filtered_flux_df : DataFrame (samples x reactions) of reference fluxes; not modified.
    l_bounds, u_bounds : per-reaction bounds. 1-D inputs (e.g. a Series) are repeated once per sample.

    Returns
    -------
    dict mapping each method's name (`method.__repr__(None)`) to a DataFrame of predictions
    indexed like `filtered_flux_df`, with one column per model reaction.

    Also reads the module-level names `methods` and `model_objective`.
    """
    # One bound per reaction -> one row of bounds per sample.
    if len(l_bounds.shape) == 1:
        n_samples = len(filtered_flux_df)
        l_bounds = np.tile(l_bounds.values, (n_samples, 1))
        u_bounds = np.tile(u_bounds.values, (n_samples, 1))

    unknown_indices = [idx for idx in range(len(model.reactions)) if idx not in known_indices]

    # Work on a copy: known columns keep their values, all others start at the bound midpoint.
    filtered_flux_df = filtered_flux_df.copy()
    is_known = np.zeros(shape=filtered_flux_df.shape, dtype=float)
    is_known[:, known_indices] = 1
    midpoints = (l_bounds + u_bounds) / 2
    filtered_flux_df.loc[:, :] = np.where(is_known, filtered_flux_df, midpoints)

    # Known reactions get bounds of value -/+ 100% of |value|.
    cur_l_bounds = l_bounds.copy()
    cur_u_bounds = u_bounds.copy()
    known_values = filtered_flux_df.iloc[:, known_indices]
    gap = abs(known_values) * 1
    cur_l_bounds[:, known_indices] = known_values - gap
    cur_u_bounds[:, known_indices] = known_values + gap

    cpu = torch.device('cpu')
    reaction_ids = [r.id for r in model.reactions]
    method_to_predictions = dict()
    for method in methods:
        name = method.__repr__(None)
        print(name)
        start_time = time.time()
        projection = method(
            model=model,
            stoichiometric_matrix=S,
            unknown_indices=unknown_indices,
            measured_indices=known_indices,
            l_bounds=cur_l_bounds,
            u_bounds=cur_u_bounds,
            n_iters=10,
            objective_id=model_objective,
            acond=1e-5,
            rcond=1e-5,
            device=cpu,
            dtype=torch.float,
        )
        # Best effort: convert to double precision when the object supports `.to`; otherwise keep it as built.
        try:
            projection = projection.to(dtype=float)
        except Exception:
            pass

        inputs = torch.tensor(filtered_flux_df.values, dtype=float, device=cpu)
        raw_predictions = projection.forward(inputs, l_bounds=cur_l_bounds, u_bounds=cur_u_bounds)
        predictions = pd.DataFrame(
            index=filtered_flux_df.index,
            columns=reaction_ids,
            data=raw_predictions.cpu().numpy(),
        )

        end_time = time.time()

        print(f"Time taken for {name}: {end_time - start_time:.2f} seconds")
        method_to_predictions[name] = predictions
    return method_to_predictions

In [None]:
def leave_one_out_apply(model, S, known_indices, filtered_flux_df, l_bounds, u_bounds):
    """Leave-one-out evaluation of every class in the global `methods` list.

    For each known reaction in turn, its value is replaced by the midpoint of its bounds, the
    reaction is treated as unknown, and the method predicts it from the remaining known reactions.

    Parameters
    ----------
    model : object whose `reactions` supports `len`, positional indexing and `list_attr('id')`.
    S : stoichiometric matrix, forwarded unchanged to each method constructor.
    known_indices : list of positional reaction indices with known values in `filtered_flux_df`.
    filtered_flux_df : DataFrame (samples x reactions) of reference fluxes; not modified.
    l_bounds, u_bounds : per-reaction bounds. 1-D inputs (e.g. a Series) are repeated once per sample.

    Returns
    -------
    dict mapping each method's name (`method.__repr__(None)`) to a DataFrame indexed like
    `filtered_flux_df`, with one column per known reaction holding its held-out prediction.

    Notes
    -----
    Reads the module-level names `methods` and `model_objective`.
    A method named "FBApro" is built once and reused for all held-out reactions. Every other
    method is rebuilt for each held-out reaction, because its constructor receives the
    hold-out-specific unknown/measured indices and bounds.
    """
    if len(l_bounds.shape) == 1:
        # duplicate per sample
        l_bounds = np.tile(l_bounds.values, (len(filtered_flux_df), 1))
        u_bounds = np.tile(u_bounds.values, (len(filtered_flux_df), 1))

    known_set = set(known_indices)  # O(1) membership tests
    reaction_ids = model.reactions.list_attr('id')
    unknown_indices = [i for i in range(len(model.reactions)) if i not in known_set]

    # Work on a copy: known columns keep their values, all others start at the bound midpoint.
    filtered_flux_df = filtered_flux_df.copy()
    known_mat = np.zeros(shape=filtered_flux_df.shape, dtype=float)
    known_mat[:, known_indices] = 1
    mid_bounds = (l_bounds + u_bounds) / 2
    filtered_flux_df.loc[:, :] = np.where(known_mat, filtered_flux_df, mid_bounds)

    cpu = torch.device('cpu')
    method_to_predictions = dict()
    for method in methods:
        name = method.__repr__(None)
        print(name)
        start_time = time.time()
        predictions = pd.DataFrame(index=filtered_flux_df.index,
                                   columns=[col for i, col in enumerate(reaction_ids) if i in known_set])
        if name == "FBApro": # can reuse the same projection matrix
            shared_projection = method(model=model, stoichiometric_matrix=S, device=cpu, dtype=torch.float,
                                       l_bounds=l_bounds, u_bounds=u_bounds,
                                       acond=1e-5, rcond=1e-5).to(dtype=float)
        else:
            shared_projection = None

        for cur in known_indices:
            col = reaction_ids[cur]
            # Hide the held-out reaction's value behind the midpoint of its bounds.
            held_out_reference_fluxes = filtered_flux_df.copy()
            shape = held_out_reference_fluxes.shape
            held_out_reference_fluxes.loc[:, col] = mid_bounds[:, cur]
            assert held_out_reference_fluxes.shape == shape
            held_out_unknown_indices = unknown_indices + [cur]
            held_out_known_indices = [k for k in known_indices if k != cur]

            # Context-specific bounds: value -/+ 100% of |value| for every reaction that is still known.
            cur_l_bounds = l_bounds.copy()
            cur_u_bounds = u_bounds.copy()
            held_values = held_out_reference_fluxes.iloc[:, held_out_known_indices].to_numpy()
            gap = np.abs(held_values) * 1
            cur_l_bounds[:, held_out_known_indices] = held_values - gap
            cur_u_bounds[:, held_out_known_indices] = held_values + gap

            if shared_projection is not None:
                projection = shared_projection
            else:
                # Build a fresh projection for THIS hold-out. Reusing the one built for the first
                # held-out reaction would keep that iteration's unknown/measured indices and bounds.
                projection = method(model=model, stoichiometric_matrix=S,
                                    unknown_indices=held_out_unknown_indices,
                                    measured_indices=held_out_known_indices,
                                    l_bounds=cur_l_bounds, u_bounds=cur_u_bounds,
                                    n_iters=10, objective_id=model_objective,
                                    acond=1e-3, rcond=1e-3,
                                    device=cpu, dtype=torch.float)
                # Best effort: convert to double precision when the object supports `.to`;
                # otherwise keep it as built.
                try:
                    projection = projection.to(dtype=float)
                except Exception:
                    pass
            res = projection.forward(torch.tensor(held_out_reference_fluxes.values, device=cpu, dtype=float),
                                     l_bounds=cur_l_bounds, u_bounds=cur_u_bounds)
            predictions[col] = res.cpu().numpy()[:, cur]
        end_time = time.time()
        print(f"Time taken for {name}: {end_time - start_time:.2f} seconds")
        method_to_predictions[name] = predictions
    return method_to_predictions

# Exchange fluxes

## Leave-one-out flux predictions

### Compute predictions

In [None]:
# Method classes to evaluate in the leave-one-out runs on exchange fluxes.
# `leave_one_out_apply` reads this module-level list directly (it is not passed as an argument),
# so this cell must be executed before the function is called.
methods = [
    projection_methods.FBAWrapper,
    projection_methods.FbaProjection, 
    projection_methods.FbaProjectionLowMidConfidence,
    projection_methods.FbaProjectionHighMidConfidence,
    projection_methods.IMATWrapper,
    projection_methods.MoMAWrapper
]

In [None]:
filtered_flux_df = exchange_flux_data
reaction_ids = model.reactions.list_attr('id')
# A reaction is "known" when its id contains "EX_" and it has a column in the measurement table.
known_indices = [
    i for i, rid in enumerate(reaction_ids)
    if "EX_" in rid and rid in filtered_flux_df.columns
]
# Dense (samples x reactions) float32 matrix: measured values in the known columns, zeros elsewhere.
reference_values = np.zeros((len(filtered_flux_df.index), len(reaction_ids)), dtype=np.float32)
for i in known_indices:
    reference_values[:, i] = filtered_flux_df[reaction_ids[i]].values
# make reference_fluxes into a df
reference_fluxes = pd.DataFrame(index=filtered_flux_df.index, columns=reaction_ids, data=reference_values)
all_reference_fluxes = reference_fluxes

In [None]:
method_to_predictions = leave_one_out_apply(model, S, known_indices, reference_fluxes, l_bounds_series, u_bounds_series)

### Export predictions

In [None]:
for method_name, predictions in method_to_predictions.items():
    export(predictions, "predictions/leave_one_out/NCI60_exchanges/{}.csv".format(method_name))

# Intracellular fluxes

## Leave-one-out flux predictions

In [None]:
# Method classes to evaluate in the leave-one-out runs on intracellular fluxes
# (same entries as the list defined in the exchange-flux section).
# `leave_one_out_apply` reads this module-level list directly, so this cell must run before the loop below.
methods = [
    projection_methods.FBAWrapper,
    projection_methods.FbaProjection, 
    projection_methods.FbaProjectionLowMidConfidence,
    projection_methods.FbaProjectionHighMidConfidence,
    projection_methods.IMATWrapper,
    projection_methods.MoMAWrapper
]

In [None]:
reaction_ids = model.reactions.list_attr('id')
for cell_line in intracellular_flux_data.index:
    # Single-row frame holding only the reactions that have a value for this cell line.
    filtered_flux_df = intracellular_flux_data.loc[[cell_line]].dropna(axis=1)
    # A reaction is "known" when it has a column in this cell line's measurement table.
    known_indices = [i for i, rid in enumerate(reaction_ids) if rid in filtered_flux_df.columns]
    # Dense (samples x reactions) float32 matrix: measured values in the known columns, zeros elsewhere.
    reference_values = np.zeros((len(filtered_flux_df.index), len(reaction_ids)), dtype=np.float32)
    for i in known_indices:
        reference_values[:, i] = filtered_flux_df[reaction_ids[i]].values
    # make reference_fluxes into a df
    reference_fluxes = pd.DataFrame(index=filtered_flux_df.index, columns=reaction_ids, data=reference_values)

    method_to_predictions = leave_one_out_apply(
        model, S, known_indices, reference_fluxes.fillna(0), l_bounds_series, u_bounds_series
    )

    for method_name, predictions in method_to_predictions.items():
        export(predictions, "predictions/leave_one_out/intracellular_{}/{}.csv".format(cell_line, method_name))
    

# Intracellular fluxes + NCI60

## Leave-one-out flux predictions

In [None]:
# Method classes to evaluate in the leave-one-out runs on the joint intracellular + exchange table
# (same entries as the lists defined in the two previous sections).
# `leave_one_out_apply` reads this module-level list directly, so this cell must run before the loop below.
methods = [
    projection_methods.FBAWrapper,
    projection_methods.FbaProjection, 
    projection_methods.FbaProjectionLowMidConfidence,
    projection_methods.FbaProjectionHighMidConfidence,
    projection_methods.IMATWrapper,
    projection_methods.MoMAWrapper
]

In [None]:
reaction_ids = model.reactions.list_attr('id')
for cell_line in joint_intracellular_and_exchanges.index:
    # Single-row frame holding only the reactions that have a value for this cell line.
    filtered_flux_df = joint_intracellular_and_exchanges.loc[[cell_line]].dropna(axis=1)
    # A reaction is "known" when it has a column in this cell line's joint measurement table.
    known_indices = [i for i, rid in enumerate(reaction_ids) if rid in filtered_flux_df.columns]
    # Dense (samples x reactions) float32 matrix: measured values in the known columns, zeros elsewhere.
    reference_values = np.zeros((len(filtered_flux_df.index), len(reaction_ids)), dtype=np.float32)
    for i in known_indices:
        reference_values[:, i] = filtered_flux_df[reaction_ids[i]].values
    # make reference_fluxes into a df
    reference_fluxes = pd.DataFrame(index=filtered_flux_df.index, columns=reaction_ids, data=reference_values)

    method_to_predictions = leave_one_out_apply(
        model, S, known_indices, reference_fluxes.fillna(0), l_bounds_series, u_bounds_series
    )

    for method_name, predictions in method_to_predictions.items():
        export(predictions, "predictions/leave_one_out/joint_intracellular_and_exchanges_{}/{}.csv".format(cell_line, method_name))
    

# Fluxmapped-GE based predictions

## Full

### Compute predictions

In [None]:
# Method classes to evaluate on the flux-mapped gene-expression activities.
# Compared with the leave-one-out lists above, this one additionally contains RawInputWrapper.
# `full_apply` reads this module-level list directly (it is not passed as an argument),
# so this cell must be executed before `full_apply` is called.
methods = [
    projection_methods.FBAWrapper,
    projection_methods.FbaProjection, 
    projection_methods.FbaProjectionLowMidConfidence,
    projection_methods.FbaProjectionHighMidConfidence,
    projection_methods.RawInputWrapper,
    projection_methods.IMATWrapper,
    projection_methods.MoMAWrapper
]

In [None]:
reference_fluxes = fluxmapped_activities

method_to_predictions = full_apply(model, S, activities_confident_indices, reference_fluxes, l_bounds_series, u_bounds_series)

### Export predictions

In [None]:
## Export predictions
for method_name, predictions in method_to_predictions.items():
    export(predictions, "predictions/full/fluxmapped_activities/{}.csv".format(method_name))