In [2]:
# import statements
import os
import pandas as pd
import sys
sys.path.insert(0, '../functions/')
import mRNA_ratios

# Griff's stuff
import numpy as np
import dill as pickle
import matplotlib.pyplot as plt

import promoter_solving_core as ps
import GA_core as ga

from deap import algorithms, base, creator, tools
from sympy import *

In [3]:
# Run settings: every value that may change between runs lives in this dict.
flags = {
    # --- general ---
    'force_rerun': False,
    'central_gene': 'b1101',
    'basal_conditions': ['control__wt_glc__1', 'control__wt_glc__2'],
    # Index of the generated constants grid to use; -1 is the highest KdRNAP
    'grid_use': -1,

    # --- basal model ---
    # If True, use imported values for basal expression
    'basal_bool': False,

    # --- mRNA ratios ---
    # Use the basal conditions for the ratio, or a hard-coded value
    'basal_or_hard_val': 'hard',
    # The basal log tpm value
    'hard_val': 10,
    # Recalculate the ICA matrices after zeroing out gene values in
    # iModulons that are not of interest
    'use_zerod_A_matrix': True,

    # --- cAct & cInh grid (both given as log10 ranges) ---
    'cActivator': [0, 2],
    'cInhibitor': [-1, 4],

    # --- GA: best cActivator / cInhibitor ---
    # What to do with negative grid values; options are 'toss' or 'zero'
    'neg_grid_toss_OR_zero': 'zero',
    'seed': 42,
    'n_ind': 100,  # Starting population size
    'mu': 100,  # Number of individuals to select for the next generation
    'lambda_': 100,  # Number of offspring to produce
    'cxpb': 0.6,  # Chance of crossover
    'cx_prob': 0.6,  # Chance that a condition is crossed
    'mutpb': 0.4,  # Chance an individual undergoes mutation
    'mt_prob': 0.1,  # Chance that a condition in an individual is mutated
    'n_gen': 1000,  # Number of generations
    'verbose': False,  # Show the logbook as the GA runs
    # NOTE: To change the selection algorithm, edit the toolbox in the GA
    # section of the notebook

    # --- greedy ---
    'n_iter': 10,  # Number of greedy individuals to produce (~1 min each)
    'max_steps': 10,  # Maximum number of steps before moving on to the next condition
    # Number of loops over the shuffled conditions (with max_steps each)
    # for every greedy individual
    'n_rounds': 300,

    # --- GAMs ---
    # Run on all genes that are in the saved output folder
    'run_on_all': False,
    # If run_on_all is False, limit to these samples (or whichever are available)
    'limit_samples': ['b1101', 'b2143', 'b1817'],
}

# Legacy settings, kept for reference only.
# The triple-quoted block below is a bare string literal: nothing inside it is
# executed and `input_parameters` is never actually defined. The live settings
# are the `flags` dict; note that some values here differ from it
# (e.g. gene of interest, n_iter, max_steps, n_rounds).
'''
input_parameters = {
    # base condition for mRNA ratio generation parameters
    'gene_of_interest' : 'b2143', 
    'basal_conditions' : ['control__wt_glc__1', 'control__wt_glc__2'],
    'use_zerod_A_matrix' : True, # recalculate the ICA matrices based on zeroing out gene values in iModulons not of interest
    'control_value_log_tpm' : 10, # basal log tpm value hard set for now
    'grid_use' : -1, # which of the generated constants grids do you want to use? -1 is the highest KdRNAP

    # cActivator/cInhibitor GA parameters
    'seed' : 42, # Random seed to use with the np.random.Generator object
    'n_ind' : 100, # Starting population size
    'mu' : 100, # Number of individuals to select for the next generation
    'lambda_' : 100, # Number of offspring to produce
    'cxpb' : 0.6, # Chance of crossover
    'cx_prob' : 0.6, # Chance that a condition is crossed
    'mutpb' : 0.4, # Chance an individual undergoes mutation
    'mt_prob' : 0.1, # Chance that a condition in individual is mutated
    'n_gen' : 1000, # Number of generations
    'verbose' : False, # Show the logbook as GA runs
    # NOTE: To modify the selection algorithm, modify the toolbox in the GA section of the notebook

    'n_iter' : 1, # Number of greedy individuals to produce, takes ~1 min for each
    'max_steps' : 30, # Maximum number of steps before moving on to next condition
    'n_rounds' : 100, # Number of loops of the shuffled conditions with the max steps for each greedy individual
}
'''

# Maps each gene (b-number) to a pair of iModulon names.
# Element 0 and element 1 are handed, in that order, to
# mRNA_ratios.calculate_mRNA_ratios_and_MA_values -- presumably
# [activator iModulon, inhibitor iModulon]; confirm against that function.
gene_to_act_inh_iMs = {
    'b1101': ['Crp-2', 'DhaR/Mlc'],
    'b1817': ['Crp-2', 'DhaR/Mlc'],
    'b1818': ['Crp-2', 'DhaR/Mlc'],
    'b1819': ['Crp-2', 'DhaR/Mlc'],
    'b2151': ['Crp-1', 'Crp-2'],
    'b3601': ['Crp-1', 'Crp-2'],
    'b2239': ['Crp-2', 'GlpR'],
    'b2240': ['Crp-2', 'GlpR'],
    'b0723': ['Crp-2', 'ArcA-1'],
    'b1415': ['Crp-2', 'Crp-1'],
    'b2597': ['Crp-2', 'CpxR'],
    'b3403': ['Crp-2', 'crp-KO'],
    'b4267': ['Crp-2', 'GntR/TyrR'],
    'b2143': ['Crp-2', 'YieP'],
}

# Constants substituted into the promoter equation when basal_bool is False.
# NOTE: each key must be spelled exactly like the matching symbol in the
# sympy equation.
grid_constants = {
    'KdRNAP': 10 ** -5,
    'KdRNAPCrp': 2.5118864315095796e-07 * 1.4,
    'KeqOpening': 10 ** -0.34444956947383365,
    'RNAP': 10 ** -6,
}

# Seeded NumPy Generator handed to the GA so that repeated runs are reproducible
rng = np.random.default_rng(flags['seed'])

# This notebook generates the cActivator and cInhibitor values for every gene listed in gene_to_act_inh_iMs and saves them for the GAMs step.

In [None]:
# overall setup
# Symbolic model equation: mRNARatio expressed in terms of cActivator,
# cInhibitor and the constants KdRNAP, KdRNAPCrp, KeqOpening and RNAP (the
# same names used as keys of grid_constants). It is later passed to
# ps.create_lambdas together with grid_constants.
# NOTE: the backslash line continuations sit *inside* the string literal, so
# the leading spaces of the continued lines are part of the text handed to
# sympify -- do not re-indent these lines without checking the parse.
# `sympify` is presumably provided by the `from sympy import *` star import.
equation = sympify('Eq(mRNARatio,((cActivator*KdRNAP + KdRNAPCrp)*(KdRNAP + RNAP + \
    KeqOpening*RNAP))/((1 + cActivator + cInhibitor)*KdRNAP*KdRNAPCrp + \
    cActivator*KdRNAP*(1 + KeqOpening)*RNAP + KdRNAPCrp*(1 + \
    KeqOpening)*RNAP))')
    
def _clean_negative_pairs(pairs, mode):
    """Deal with (cActivator, cInhibitor) pairs that contain negative values.

    Parameters
    ----------
    pairs : list of (cAct, cInh) tuples
    mode : str
        'toss' drops every pair in which either value is negative;
        'zero' keeps every pair but replaces negative values with 0.

    Returns
    -------
    list of (cAct, cInh) tuples, in the same order as ``pairs``.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'toss' nor 'zero'. Previously an unknown mode
        silently left the negative values in the grid.
    """
    if mode == 'toss':
        return [(cAct, cInh) for (cAct, cInh) in pairs if cAct >= 0 and cInh >= 0]
    if mode == 'zero':
        cleaned = []
        for cAct, cInh in pairs:
            if cAct >= 0 and cInh >= 0:
                cleaned.append((cAct, cInh))
            elif cAct < 0 and cInh < 0:
                cleaned.append((0, 0))
            elif cAct < 0:
                cleaned.append((0, cInh))
            else: # cInh is negative and cAct is not
                cleaned.append((cAct, 0))
        return cleaned
    raise ValueError("flags['neg_grid_toss_OR_zero'] must be 'toss' or 'zero', got "
                     + repr(mode))


def _build_condition_grid(mRNA_ratio, cAct_values, cInh_values, lambda_df, neg_mode):
    """Build the sorted list of (cAct, cInh) pairs for one condition.

    For every cActivator value the matching cInhibitor is obtained from the
    'cInhibitor' lambda of ``lambda_df``, and for every cInhibitor value the
    matching cActivator is obtained from the 'cActivator' lambda, each given
    the condition's mRNA ratio. Both sets are merged, sorted, and passed
    through ``_clean_negative_pairs``.

    Parameters
    ----------
    mRNA_ratio : the condition's mRNA ratio (substituted for 'mRNARatio')
    cAct_values, cInh_values : iterables of candidate values (linear scale)
    lambda_df : DataFrame with rows 'cActivator'/'cInhibitor' and columns
        'order' (argument names in call order) and 'lambda' (callable whose
        result is indexed with [0])
    neg_mode : 'toss' or 'zero', forwarded to ``_clean_negative_pairs``
    """
    cInh_order = lambda_df.loc['cInhibitor', 'order']
    cInh_solver = lambda_df.loc['cInhibitor', 'lambda']
    cAct_order = lambda_df.loc['cActivator', 'order']
    cAct_solver = lambda_df.loc['cActivator', 'lambda']

    pairs = []

    # Use a dict so the argument order demanded by each lambda can change freely
    values = {'mRNARatio': mRNA_ratio}
    for cAct in cAct_values:
        values['cActivator'] = cAct
        args = tuple([values[p] for p in cInh_order])
        pairs.append((cAct, cInh_solver(args)[0]))

    values = {'mRNARatio': mRNA_ratio}
    for cInh in cInh_values:
        values['cInhibitor'] = cInh
        args = tuple([values[p] for p in cAct_order])
        # Solved value goes first to keep the (cAct, cInh) order
        pairs.append((cAct_solver(args)[0], cInh))

    return _clean_negative_pairs(sorted(pairs), neg_mode)


# Folders for cached inputs and final outputs
ratios_dir = '../data/saved_mRNA_ratios_MA_vals/'
gams_dir = '../data/save_for_GAMs/'

# For every gene: load/compute mRNA ratios, build the per-condition grid of
# feasible (cAct, cInh) pairs, run the GA, refine greedily, save the result.
# Progress is printed as "working on <gene>..." with one dot per finished stage.
for gene in gene_to_act_inh_iMs.keys():
    flags.update({'central_gene' : gene})
    print('working on '+gene, end = '.')

    # Output file for the GAMs step; skip genes that were already processed
    df_name = flags['central_gene']+'_zerod'+str(flags['use_zerod_A_matrix'])+'_cAct_cInh_vals.csv'
    file_out = gams_dir+df_name
    if os.path.exists(file_out) and not flags['force_rerun']:
        print(' - already done, skipping')
        continue # skip if already run

    # Re-seed per gene so each gene's result does not depend on which other
    # genes were run before it
    rng = np.random.default_rng(seed = flags['seed'])

    # Read the cached ratio dataframe, or create (and cache) it
    ratios_name = flags['central_gene']+'_zerod'+str(flags['use_zerod_A_matrix'])+'_mRNA_ratios_and_MA_vals.csv'
    ratios_path = ratios_dir+ratios_name
    if not flags['force_rerun'] and os.path.exists(ratios_path):
        ratios_df = pd.read_csv(ratios_path, index_col = 0)
    else:
        iMs = gene_to_act_inh_iMs[flags['central_gene']]
        ratios_df = mRNA_ratios.calculate_mRNA_ratios_and_MA_values(iMs[0], iMs[1], flags)
        os.makedirs(ratios_dir, exist_ok = True) # to_csv does not create folders
        ratios_df.to_csv(ratios_path)

    # DataFrame to hold the grid: one row per condition
    grid = pd.DataFrame(columns = ['mRNA_ratio','grid'], index = ratios_df.index)
    grid.loc[:,'mRNA_ratio'] = ratios_df.loc[:,'actual_mRNA_ratio']

    # Lambdas that solve the equation for cActivator / cInhibitor
    lambda_df = ps.create_lambdas(equation, grid_constants)

    # The candidate values do not depend on the condition, so build them once
    # per gene. Ranges are log10 and converted back to linear scale here.
    # (assumes ps.create_parameter_grid is deterministic for fixed arguments)
    cAct_range = {'cActivator': flags['cActivator']}
    cInh_range = {'cInhibitor': flags['cInhibitor']}
    cAct_values = [10**x[0] for x in ps.create_parameter_grid(num_steps = 101, **cAct_range)]
    cInh_values = [10**x[0] for x in ps.create_parameter_grid(num_steps = 101, **cInh_range)]

    for condition in grid.index:
        working_grid = _build_condition_grid(grid.loc[condition,'mRNA_ratio'],
                                             cAct_values,
                                             cInh_values,
                                             lambda_df,
                                             flags['neg_grid_toss_OR_zero'])
        grid.at[condition, 'grid'] = working_grid
    print('.', end = '')

    # ---- GA ----
    # creator.create warns and overwrites when a class of the same name
    # already exists; drop the classes left over from the previous gene first
    for stale_class in ('fitness', 'individual'):
        if hasattr(creator, stale_class):
            delattr(creator, stale_class)

    creator.create(name = 'fitness',
                   base = base.Fitness,
                   weights = (1.0, -1.0,)) # Set to maximize Spearman correlation of MA_activator and cActivator, and minimize MA_inhibitor and cInhibitor

    creator.create(name = 'individual',
                   base = np.ndarray,
                   shape = (len(grid),), # Number of conditions
                   dtype = np.dtype([('act', float), ('inh', float)]), # Custom dtype
                   fitness = creator.fitness)

    toolbox = base.Toolbox()

    # Register the individual and population functions
    toolbox.register(alias = 'individual',
                     function = ga.generate_individual,
                     individual_class = creator.individual,
                     grid = grid.grid,
                     rng = rng)

    toolbox.register('population',
                     tools.initRepeat,
                     list,
                     toolbox.individual)

    # Register the evaluation function
    toolbox.register(alias = 'evaluate',
                     function = ga.spearman_objective,
                     MA_df = ratios_df.loc[:,['MA_activator','MA_inhibitor']])

    # Register the selection algorithm
    # selNSGA2 has been used since it seems to run faster
    toolbox.register(alias = "select",
                     function = tools.selNSGA2,
                     nd = 'log')
    #toolbox.register("select", tools.selSPEA2)

    # Register the mutation function
    toolbox.register(alias = 'mutate',
                     function = ga.mutate,
                     prob = flags['mt_prob'],
                     grid = grid.grid,
                     rng = rng)

    # Register the crossover function
    # NOTE: the cx_prob value in flags was chosen based on brute_force.ipynb
    toolbox.register(alias = "mate",
                     function = ga.crossover,
                     prob = flags['cx_prob'],
                     rng = rng)

    # Record the best individual score of each generation in the logbook
    stats = tools.Statistics(key=lambda ind: np.subtract(ind.fitness.values[0],
                                                         ind.fitness.values[1]))

    # Run the GA
    pop, logbook = ga.mu_plus_lambda(pop = toolbox.population(n = flags['n_ind']),
                                     toolbox = toolbox,
                                     rng = rng,
                                     mu = flags['mu'],
                                     lambda_ = flags['lambda_'],
                                     cxpb = flags['cxpb'],
                                     mutpb = flags['mutpb'],
                                     n_gen = flags['n_gen'],
                                     stats = stats,
                                     verbose = flags['verbose'])
    print('.', end = '')

    # ---- Greedy refinement, starting from the last individual in the GA sort order ----
    total_score, total_sort = ga.best_individual(pop)
    greedy_pop = ga.greedy_algorithm(base_individual = pop[total_sort[-1]],
                                     n_iterations = flags['n_iter'],
                                     grid = grid.grid,
                                     toolbox = toolbox,
                                     max_steps = flags['max_steps'],
                                     n_rounds = flags['n_rounds'])

    greedy_score, greedy_sort = ga.best_individual(greedy_pop)

    greedy_voting = ga.voting(population = greedy_pop,
                              grid = grid.grid)

    # Convert from condition integer index to grid tuple to create mean_ind.
    # grid.grid is indexed by condition label, so positions must go through
    # .iloc (plain [i] relies on deprecated positional fallback).
    # NOTE: mean_ind is scored here but not saved; only GAMs_individual is.
    mean_ind = creator.individual(greedy_pop[greedy_sort[-1]])
    for i, _ in enumerate(mean_ind):
        mean_ind[i] = grid.grid.iloc[i][int(greedy_voting._mean[i])]

    mean_ind.fitness.values = toolbox.evaluate(mean_ind)

    GAMs_individual = greedy_pop[greedy_sort[-1]] # TODO: EDIT THIS USING FLAGS?

    vals_for_GAMs = pd.DataFrame(index = ratios_df.index,
                                 columns = ['cAct', 'cInh'],)

    vals_for_GAMs.cAct = list(GAMs_individual['act'])
    vals_for_GAMs.cInh = list(GAMs_individual['inh'])
    print('.', end = '')

    # save for GAMs
    os.makedirs(gams_dir, exist_ok = True) # to_csv does not create folders
    vals_for_GAMs.to_csv(file_out)
    print(' done!')

working on b1101. - already done, skipping
working on b1817. - already done, skipping
working on b1818. - already done, skipping
working on b1819. - already done, skipping
working on b2151. - already done, skipping
working on b3601. - already done, skipping
working on b2239. - already done, skipping
working on b2240. - already done, skipping
working on b0723..