In [1]:
import sys, os
sys.path.append('./boolODE/')

import ast
import yaml
import time
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm 
from pathlib import Path
import multiprocessing as mp
from optparse import OptionParser

from scipy.integrate import odeint
from sklearn.cluster import KMeans
from typing import Dict, List
from importlib.machinery import SourceFileLoader
from scipy.stats import rankdata
import copy
import re

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import parsing_fnc as fnc
import gen_graph_util as util

In [2]:
# Initial graph: the long-linear rule file (18 nodes, 18 edges).
# Its topology is replaced by 1:n regulation further down; here it is only read in.
path = "./boolODE/"
grn_init = path + "graph_LL_18"

# Tab-separated rule table
df = pd.read_csv(
    grn_init + ".txt",
    sep="\t",
    engine="python",
)

In [3]:
# Randomly sample transcription factors and build the initial topology.
# `ntfs` genes are drawn without replacement from the first column of `df`;
# rows are then assigned to those regulators round-robin (row i -> tfs[i % ntfs]),
# and each row's rule is randomly either the plain or the negated regulator.
ntfs = 6
tfs = np.random.choice(df.values[:,0], ntfs, replace = False)

for i in range(len(df)):
    tf = tfs[i % ntfs]

    # Write through .iloc (column 1 holds the rule) so the assignment is
    # guaranteed to land in `df`. Assigning into `df.values` only works when
    # pandas happens to return a writable view of the underlying data.
    if np.random.choice(2) == 0:
        df.iloc[i, 1] = "( " + tf + " )"
    else:
        df.iloc[i, 1] = "( not ( " + tf + " ) )"

genes, all_regul, activate_dict, inactivate_dict = util.load_init_graph(df)

In [4]:
# Set kinetic parameters (use same parameters used in BEELINE)
withRules = list(df['Gene'].values)
allnodes = set()

# Collect every node name that appears on the right-hand side of a rule:
# strip the parentheses, split on spaces, and drop the boolean keywords.
for _, row in df.iterrows():
    rhs = row['Rule'].replace('(', ' ').replace(')', ' ')
    tokens = rhs.split(' ')
    allnodes.update(t for t in tokens if t not in ('not', 'and', 'or', ''))

# Nodes that are used as regulators but have no rule of their own
withoutRules = list(allnodes.difference(set(withRules)))

# If a node has no rule, assign self-activation.
# Iterate over a snapshot: removing items from the list that is being iterated
# makes the loop skip every other node.
for n in list(withoutRules):
    print(n, "has no rule, adding self-activation.")
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    # that works on both old and new versions.
    df = pd.concat([df, pd.DataFrame([{'Gene': n, 'Rule': n}])], ignore_index=True)
    withRules.append(n)
    withoutRules.remove(n)

# Assume everything is a gene, so make the corresponding protein:
# one mRNA variable ('x_') and one protein variable ('p_') per node.
varspecs, genelist, inputs = dict(), list(), list()
for node in withRules:
    varspecs['x_' + node] = ''
    varspecs['p_' + node] = ''
    genelist.append(node)

kineticParameterDefaults = {
    'mRNATranscription': 20.,
    'mRNADegradation': 10.,
    'proteinTranslation': 10.,
    'proteinDegradation': 1.0,
    'heavisideSigma': 10.,
    'signalingTimescale': 5.0,
    'hillCoefficient': 10.,
    'interactionStrength': 1.0,
}

# Max level checks: transcription/degradation ratio for mRNA, scaled by the
# translation/degradation ratio for protein.
x_max = kineticParameterDefaults['mRNATranscription']/kineticParameterDefaults['mRNADegradation']
y_max = x_max*(kineticParameterDefaults['proteinTranslation']/kineticParameterDefaults['proteinDegradation'])

hillThreshold = y_max/2
heavisideOmega = 2./y_max

kineticParameterDefaults['x_max'] = x_max
kineticParameterDefaults['y_max'] = y_max
kineticParameterDefaults['hillThreshold'] = hillThreshold
kineticParameterDefaults['heavisideOmega'] = heavisideOmega

# Parameter-name prefixes mapped to their default values
parameterNamePrefixAndDefaultsAll = {
    'n_': kineticParameterDefaults['hillCoefficient'],
    'k_': hillThreshold,
    'sigmaH_': kineticParameterDefaults['heavisideSigma'],
}
parameterNamePrefixAndDefaultsGenes = {
    'm_': kineticParameterDefaults['mRNATranscription'],
    'l_x_': kineticParameterDefaults['mRNADegradation'],
    'r_': kineticParameterDefaults['proteinTranslation'],
    'l_p_': kineticParameterDefaults['proteinDegradation'],
}

proteinlist, interactionStrengths = list(), dict()
parameterSetDF, parameterInputsDF = pd.DataFrame(), pd.DataFrame()

In [5]:
# Experiment settings: activation function type, number of simulated cells,
# total simulation time and integration step size.
settings = {
    'modeltype': 'hill',
    'num_cells': 3,
    'simulation_time': 75,
    'integration_step_size': 0.01,
    'doParallel': False,
}

# Number of integration steps implied by the settings above
print("Simulation Time Length: ", int(settings['simulation_time']/settings['integration_step_size']))

par = fnc.assignDefaultParameterValues(
    parameterNamePrefixAndDefaultsAll,
    parameterNamePrefixAndDefaultsGenes,
    withRules,
    genelist,
)

Simulation Time Length:  7500
Fixing rate parameters to defaults


In [6]:
# Per-graph containers, filled in by the graph-generation loop (keyed by graph number)
ModelSpecs = {}
varmappers = {}
parmappers = {}
model_path = {}

# Total number of integration steps
time_length = int(settings['simulation_time'] / settings['integration_step_size'])

In [7]:
# Perturbation rules for the ground-truth graphs
pert_method = "swap"

# One graph per `btw_graphs` time steps
btw_graphs = 1500
num_graphs = time_length//btw_graphs
num_pert = 5

print("Num Graphs: {}, Num Pert: {}".format(num_graphs,num_pert))
sub_path = path + pert_method + "_time_" + str(time_length) + "_graphs_" + str(num_graphs) + "(" + str(num_pert) + ")_1to2/"

# Make sure the output folders for graph files and model files exist
simgraphpath = Path(sub_path + "./graphs/")
simmodelpath = Path(sub_path + "./models/")
for out_dir in (simgraphpath, simmodelpath):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

Num Graphs: 5, Num Pert: 5


In [8]:
# Display the run-specific output directory (graphs/, models/ and simulations/ live under it)
sub_path

'./boolODE/swap_time_7500_graphs_5(5)_1to2/'

In [9]:
# Generate the ground-truth graphs.
# graph_0 is the initial topology; every later graph is derived from the
# previous one by permuting the rules of `num_pert*2` randomly chosen rows.
for grn_num in range(num_graphs):

    if grn_num > 0:
        # NOTE(review): the "- 1" presumably turns 1-based gene ids from
        # util.load_init_graph into 0-based row positions — confirm.
        rows = np.random.choice(genes, num_pert*2, replace = False) - 1
        # Reverse the rules among the selected rows (first <-> last, ...).
        # The right-hand side is materialised with to_numpy() so values are
        # assigned by position (no index alignment), and the write goes through
        # .iloc so it is guaranteed to reach `df` rather than a copy of its values.
        df.iloc[rows, 1] = df.iloc[rows[::-1], 1].to_numpy()

    # Every graph, including the initial one, is written out
    df.to_csv(sub_path + "graphs/graph_"+str(grn_num)+".txt",header=True, index=False, sep="\t")

    ModelSpec, varmapper, parmapper = util.model_generate(df, settings, withRules, inputs, par, genelist, proteinlist, varspecs)

    # Keep independent copies per graph
    ModelSpecs[grn_num]=copy.deepcopy(ModelSpec)
    varmappers[grn_num]=copy.deepcopy(varmapper)
    parmappers[grn_num]=copy.deepcopy(parmapper)

    model_path[grn_num] = fnc.writeModelToFile(grn_num, ModelSpec, varmapper, path = sub_path + "models/")
    fnc.generateInputFiles(df, withoutRules, grn_num, path = sub_path)

In [10]:
# Initialize Expression data - using first Graph (graph_0)
init_Model = ModelSpecs[0]
init_varmap = varmappers[0]
init_parmap = parmappers[0]

####################
# Positions of the mRNA ('x_') and protein ('p_') variables in the state vector,
# plus the reverse lookup from variable name to position.
rnaIndex = [i for i in range(len(init_varmap.keys())) if 'x_' in init_varmap[i]]
revvarmapper = {v:k for k,v in init_varmap.items()}
proteinIndex = [i for i in range(len(init_varmap.keys())) if 'p_' in init_varmap[i]]

# Initial conditions are read from graph_0's model (init_Model), not from the
# `ModelSpec` variable left over from the last iteration of the generation loop.
y0 = [init_Model['ics'][init_varmap[i]] for i in range(len(init_varmap.keys()))]
ss = np.zeros(len(init_varmap.keys()))

# Default starting levels: 1.0 for every mRNA, 20.0 for proteins listed in `proteinlist`
for i,k in init_varmap.items():
    if 'x_' in k:
        ss[i] = 1.0
    elif 'p_' in k and k.replace('p_','') in proteinlist:
        ss[i] = 20.

In [11]:
# Time grid for the integration: evenly spaced points from 0 to tmax
tmax = settings['simulation_time']
integration_step_size = settings['integration_step_size']
tspan = np.linspace(
    0,
    tmax,
    int(tmax/integration_step_size),
)

In [12]:
# Load initial expression values
init_exp = {"Genes":["['g1']"], "Values":"[1]"}
icsDF = pd.DataFrame(data=init_exp)

if not icsDF.empty:
    icsspec = icsDF.loc[0]
    # Parsed into `ics_genes` rather than `genes`, so the gene list returned by
    # util.load_init_graph (which the graph-generation loop samples from) is
    # not overwritten when this cell runs.
    ics_genes = ast.literal_eval(icsspec['Genes'])
    values = ast.literal_eval(icsspec['Values'])
    icsmap = dict(zip(ics_genes, values))
    # Genes with an explicit initial value get it; all others start at 0.01
    for g in genelist:
        ss[revvarmapper['x_'+g]] = icsmap.get(g, 0.01)

# Empty frame indexed by the mRNA variable names of graph_0
result = pd.DataFrame(index=pd.Index([init_varmap[i] for i in rnaIndex]))

In [13]:
# All time-point indices that can be sampled from: startat .. len(tspan) - 1
startat = 0
timeIndex = list(range(startat, len(tspan)))

groupedDict = dict()

# Folder for the simulation output files
simfilepath = Path(sub_path + "./simulations/")
if not simfilepath.exists():
    os.makedirs(simfilepath)

In [14]:
# Initial-condition vector, laid out like graph_0's variable map
# (`revvarmapper`, used for every lookup below, is built from the same map).
new_ics = [0 for _ in range(len(init_varmap))]

# Set the mRNA ics; negative values in `ss` are clamped to zero first
for ind in rnaIndex:
    if ss[ind] < 0:
        ss[ind] = 0.0
    new_ics[ind] = ss[ind]

# Same for any explicitly listed proteins
for p in proteinlist:
    ind = revvarmapper['p_'+p]
    if ss[ind] < 0:
        ss[ind] = 0.0
    new_ics[ind] = ss[ind]

# Calculate the Protein ics based on mRNA levels:
# (translation rate / protein degradation rate) * initial mRNA level,
# using the parameters of graph_0's model.
for genename in genelist:
    pss = ((init_Model['pars']['r_' + genename])/(init_Model['pars']['l_p_' + genename]))*new_ics[revvarmapper['x_' + genename]]
    # NOTE(review): underscores are stripped from the gene name for the protein
    # key but not for the mRNA key — confirm this matches how the variable map
    # names proteins.
    new_ics[revvarmapper['p_' + genename.replace('_','')]] = pss

# Arguments shared by every simulated cell
argdict = {
    'Model': model_path,
    'tspan': tspan,
    'varmapper': init_varmap,
    'timeIndex': timeIndex,
    'genelist': genelist,
    'proteinlist': proteinlist,
    'ss': ss,
    'ModelSpecs': ModelSpecs,
    'parmappers': parmappers,
    'rnaIndex': rnaIndex,
    'proteinIndex': proteinIndex,
    'revvarmapper': revvarmapper,
    'x_max': kineticParameterDefaults['x_max'],
}

if settings['doParallel']:
    with mp.Pool() as pool:
        # One job per cell, each with its own copy of the arguments and its own
        # seed. `path` is passed explicitly so the parallel run uses the same
        # `path` argument as the sequential branch below.
        jobs = [
            pool.apply_async(
                fnc.simulateAndSample,
                args=(dict(argdict, seed=cellid, cellid=cellid), new_ics),
                kwds={'path': sub_path},
            )
            for cellid in range(settings['num_cells'])
        ]

        # get() (unlike wait()) re-raises any exception from the worker, so a
        # failed simulation is reported instead of silently ignored.
        for job in jobs:
            job.get()
# not doing it in a parallel manner
else:
    # for each cellid, give a new experiment with new seed.
    for cellid in tqdm(range(settings['num_cells'])):
        argdict['seed'] = cellid
        argdict['cellid'] = cellid
        fnc.simulateAndSample(argdict, new_ics, path = sub_path)

  0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
# # after running the simulation, sample cells, print expression and pseudotime
# frames = []
# print('starting to concat files')
# for cellid in tqdm(range(settings['num_cells'])):
#     # cellid correspond to the cellid-th experiment, read in the gene expression simulation for the cellid-th experiment
#     df = pd.read_csv(sub_path + './simulations/E'+str(cellid) + '.csv',index_col=0)
#     # df of the form genes by times
#     df = df.sort_index()
#     # times by genes
#     frames.append(df.T)
    
# # experiment * times by genes, 
# result = pd.concat(frames,axis=0)
# result = result.T

# # indices are gene names
# indices = result.index
# newindices = [i.replace('x_','') for i in indices]
# result.index = pd.Index(newindices)

In [16]:
# Re-organizing generated simulation data: 1. cell expression data 2. ground-truth adjacent matrix
# consider the first single cell trajectory

exp_data = pd.read_csv(sub_path + "simulations/E0.csv", index_col = 0)

_, ntime_1 = exp_data.shape
time_len = ntime_1 + 1

# NOTE(review): time_len is defined as ntime_1 + 1, so this comparison is always
# true and the last column is always duplicated as one extra time step — confirm
# that padding by exactly one step is what is wanted.
if exp_data.shape[1] != time_len:
    # Column names are split into two parts on '_'. Local names are chosen so the
    # imported `time` module is not overwritten.
    cell_id, last_step = exp_data.columns[-1].split('_')
    exp_data[cell_id+'_'+str(int(last_step)+1)] = exp_data[cell_id+'_'+last_step]

# Sort based on time - order (numeric suffix of each column name)
sorted_exp = np.array(exp_data)[:,np.argsort([int(col.split("_")[1]) for col in exp_data.columns])]

# Splits a name such as "g12" into its letters and its number
temp = re.compile("([a-zA-Z]+)([0-9]+)")

# Map each gene number to its rank, so matrix positions are contiguous even if
# the gene numbers are not.
gene_list = [int(gene.split("g")[1]) for gene in exp_data.index]
ngenes = len(gene_list)
gene_rank = rankdata(gene_list, method="min")
gene_dict = {gene: int(rank) for gene, rank in zip(gene_list, gene_rank)}
start_idx = int(min(gene_rank))

# Count only the reference-network files, so unrelated entries in the folder
# (e.g. checkpoint directories) cannot inflate the number of graphs.
n_graphs = len(list(Path(sub_path + "ground_truth").glob("refNetwork_*.csv")))

# gt_adj[g, a, b] counts the edges of graph g whose index-column gene ranks a and
# whose first-data-column gene ranks b (ranks shifted to start at 0).
gt_adj = np.zeros((n_graphs,ngenes,ngenes))
for graph_idx in range(n_graphs):
    ref_net = pd.read_csv(sub_path + "ground_truth/refNetwork_" +str(graph_idx) + ".csv", index_col = 0)

    target = [gene_dict[int(temp.match(name).groups()[1])] for name in ref_net.index]
    regula = [gene_dict[int(temp.match(name).groups()[1])] for name in ref_net.values[:,0]]

    for node0, node1 in zip(target, regula):
        gt_adj[graph_idx, node0 - start_idx, node1 - start_idx] += 1

# Repeat each graph for the `btw_graphs` time steps it covers
gt_adj_time = np.repeat(gt_adj,btw_graphs,axis = 0)

if btw_graphs > 10:
    freq = "discrete_"
else:
    freq = "continue_"

# NOTE(review): `outpath` is never used — the arrays below are written to the
# current working directory. Confirm whether they were meant to go under outpath.
outpath = "../data/boolODE_Sep13/"

np.save(freq+"sorted_exp_1to2.npy",sorted_exp) #(ngenes,ntimes)
np.save(freq+"gt_adj_1to2.npy",gt_adj_time) #(ntimes,ngenes,ngenes)