In [1]:
# imports and loadings
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../functions/')
import interface_GAMS as iG
import conversion_equations as ce
sys.path.append('/home/chris/github/Strainware-s/')
sys.path.append('/home/chris/github/Strainware-s/bitome2/')
import bitome
from bitome.core import Bitome
from workflows_CAD.feature_functions import *
import pickle
import os
import ast

# load in settings flags
settings_df = pd.read_csv('../options/settings.csv', index_col = 0)
flags_filepath = settings_df.loc['gene_flags_filepath']['Setting']
TF_flags_filepath = settings_df.loc['TF_flags_filepath']['Setting']

flags_df = pd.read_csv(flags_filepath, index_col = 0)

# RNAP Concentration per sample

In [2]:
# settings
bott_pct = 0.01
top_pct = 0.99

# setup
# merge together log_tpm_df files
log_tpm_df = pd.read_csv('../data/external/imodulon_info/log_tpm.csv', index_col = 0)
starve_log_tpm = pd.read_csv('../data/external/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)
to_blank_inds = list(set(log_tpm_df.index) - set(starve_log_tpm.index))
# need to create zero rows for missing values
zeros_data = {col : 0 for col in starve_log_tpm.columns}
zeros_df = pd.DataFrame(zeros_data, index = to_blank_inds)
starve_log_tpm = pd.concat([starve_log_tpm, zeros_df])
starve_log_tpm = starve_log_tpm.loc[log_tpm_df.index]
log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)
tpm_df = 2**log_tpm_df

# let's find the fold change in the middle 95% percentile of this distribution, make the current RNAP the center, and scale thigns to both sides
# do this for both genes, each sample gets the RNAP value that's the average of both
RNAP_default = 10**-6
fold_modify = 2 # modify the fold amount by this factor (divided by if below, multiplied by if above)

genes = ['b3987', 'b3988']
RNAP_df = pd.DataFrame(index = tpm_df.columns)
for gene in genes:
    # calculations
    bott = tpm_df.loc[gene].sort_values().iloc[int(bott_pct*len(tpm_df.columns))]
    top = tpm_df.loc[gene].sort_values().iloc[int(top_pct*len(tpm_df.columns))]
    fold_change = top/bott
    middle = np.median(tpm_df.loc[gene])

    # scale everything
    new_col = []
    for sample in tpm_df.columns:
        # set outleirs to the min/max
        if tpm_df.loc[gene][sample] > top:
            tpm = top
        elif tpm_df.loc[gene][sample] < bott:
            tpm = bott
        else:
            tpm = tpm_df.loc[gene][sample]
        
        # scale to fold change, then by RNAP value
        fold_change = tpm / middle
        if tpm < middle:
            fold_change /= fold_modify
        else:
            fold_change *= fold_modify
        new_val = RNAP_default * fold_change
        
        new_col.append(new_val)
    RNAP_df[gene] = new_col

In [3]:
# save it off
RNAP_mean_df = pd.DataFrame(RNAP_df.mean(axis = 1))
RNAP_mean_df.columns = ['RNAP']
RNAP_mean_df.to_csv('../data/interim/sample_constants/RNAP_conc.csv')

# Sequence based motif scoring

In [4]:
# setup
bitome_genbank = Bitome('../data/external/sequence_info/NC_000913.3.gb')
bitome_fasta = Bitome('../data/external/sequence_info/NC_000913.3.fasta')
tu_table = pd.read_csv('../data/external/sequence_info/tu.csv')
tf_pwm_db = rpwm('../data/external/sequence_info/motif_pwm_db.txt')
tfbs = pd.read_csv('../data/external/sequence_info/TFBS_regulondb.csv')
TF_flags_df = pd.read_csv(TF_flags_filepath, index_col = 0)

Alias location.position is deprecated and will be removed in a future release. Use location directly, or int(location). However, that will fail for UnknownPosition, and for OneOfPosition and WithinPosition will give the default rather than left-most value.


In [5]:
# runs pipeline for all of our relevant genes
gene_to_operon_to_motif_scores_dist_to_TSS = {}
    
genes_test = TF_flags_df.TF
for gene in genes_test:
    gene = gene[0].upper()+gene[1:]
    if gene not in tf_pwm_db:
        continue
    gene_to_operon_to_motif_scores_dist_to_TSS.update({gene : {}})
    gene_pwm_old = tf_pwm_db[gene]
    gene_pssm = {base: [pos_dict[base] for pos_dict in gene_pwm_old] for base in 'ATCG'}

    # pull out data
    actual_val = []
    gene_rows = tfbs[[val.upper() == gene.upper() for val in tfbs['TF_name']]]
    left_end_positions = gene_rows['TF_bs_left_end_position'].tolist()
    right_end_positions = gene_rows['TF_bs_right_end_position'].tolist()
    for index, row in gene_rows.iterrows():
        name = row['Transcription_unit_name_regulated_by_TF']
        left = row['TF_bs_left_end_position']
        right = row['TF_bs_right_end_position']
        try:
            actual_val.append([name, int(left), int(right)])
        except:
            continue
    actual_df = pd.DataFrame(actual_val, columns=['name', 'left', 'right'])
    
    # tu_dict?
    tu = list(actual_df['name'])
    tu_dict = {item: tu.count(item) for item in tu}
    
    tu_table = pd.read_csv('../data/external/sequence_info/tu.csv')
    tu_name=[]
    locus_tags = []
    score_gene=[]
    location_left =[]
    location_right = []
    matches = []
    for index, row in tu_table.iterrows():
        i = row['strand']
        j = row['tss']
        k = row['name']
        #i,j,k in zip(tu_table['strand'],tu_table['tss'],tu_table['name']):
        for m,n in (tu_dict.items()):
            if i == 1.0 and j.is_integer():
                if m==k:
                    #print(m)
                    tu_name.append([m] * n)  
                    locus_tags.append([row['locus_tag']] * n)
                    score_gene.append(bitome_fasta.motif_search(j-100, j+50, 1, gene_pssm, n_best_matches=n)['log_odds'].tolist())
                    location_left.append(bitome_fasta.motif_search(j-100, j+50, 1, gene_pssm, n_best_matches=n)['left'].tolist())
                    location_right.append(bitome_fasta.motif_search(j-100, j+50, 1, gene_pssm, n_best_matches=n)['right'].tolist())
                    matches.append(bitome_fasta.motif_search(j-100, j+50, 1, gene_pssm, n_best_matches=n)['match_sequence'].tolist())
            if i == -1.0 and j.is_integer():
                if m==k:
                    tu_name.append([m] * n)
                    locus_tags.append([row['locus_tag']] * n)
                    score_gene.append(bitome_fasta.motif_search(j-50, j+100, -1, gene_pssm, n_best_matches=n)['log_odds'].tolist())
                    location_left.append(bitome_fasta.motif_search(j-50, j+100, -1, gene_pssm, n_best_matches=n)['left'].tolist())
                    location_right.append(bitome_fasta.motif_search(j-50, j+100, -1, gene_pssm, n_best_matches=n)['right'].tolist())
                    matches.append(bitome_fasta.motif_search(j-100, j+50, 1, gene_pssm, n_best_matches=n)['match_sequence'].tolist())
            if not j.is_integer():
                if m==k:
                    tu_name.append([m] * 1)
                    locus_tags.append([row['locus_tag']] * 1)
                    score_gene.append([0])
                    location_left.append([0])
                    location_right.append([0])
                    matches.append([''])
                    
    tu_name_flat_list = [item for sublist in tu_name for item in sublist]
    score_gene_flat_list = [item for sublist in score_gene for item in sublist]
    location_left_flat_list = [item for sublist in location_left for item in sublist]
    location_right_flat_list = [item for sublist in location_right for item in sublist]
    matches_list = [item for sublist in matches for item in sublist]
    locus_tags = [item for sublist in locus_tags for item in sublist]

    data = {
        'TU': tu_name_flat_list,
        'locus_tag' : locus_tags,
        'Score': score_gene_flat_list,
        'Location_left': location_left_flat_list,
        'Location_right': location_right_flat_list,
        'Match_sequence' : matches_list,
    }
    predicted_df = pd.DataFrame(data)
    #predicted_df = predicted_df.drop([2, 3])
    predicted_df = predicted_df.reset_index(drop=True)
    predicted_df = predicted_df.drop_duplicates()
    predicted_df.to_pickle('../data/interim/bitome_results/'+gene+'.pkl')
    
    for index, row in predicted_df.iterrows():
        if row['TU'] not in gene_to_operon_to_motif_scores_dist_to_TSS[gene]:
            gene_to_operon_to_motif_scores_dist_to_TSS[gene].update({row['TU'] : []})
        bby_TU = tu_table[tu_table['locus_tag'] == row['locus_tag']]
        if bby_TU['strand'].values[0] == -1:
            # aimed to the left, so TSS - motif
            dist_to_TSS = bby_TU['tss'].values[0] - ((row['Location_left'] + row['Location_right']) / 2)
        else:
            # aimed to the right, so motif - TSS
            dist_to_TSS = ((row['Location_left'] + row['Location_right']) / 2) - bby_TU['tss'].values[0]
        #dist_to_TSS = min([abs(bby_TU['tss'] - row['Location_left']).values[0], abs(bby_TU['tss'] - row['Location_right']).values[0]])
        next_val = (row['Score'], dist_to_TSS)
        gene_to_operon_to_motif_scores_dist_to_TSS[gene][row['TU']].append(next_val)

pickle_out = open('../data/interim/misc_dictionaries/gene_to_operon_to_motif_scores_dist_to_TSS.pkl', 'wb')
pickle.dump(gene_to_operon_to_motif_scores_dist_to_TSS, pickle_out)
pickle_out.close()

# select Kd for metabolites

In [6]:
# setting for KdMetabolite
min_max_fold_change = 100 # min will be min metab divided by this, max will be max metab times this

# loading
starve_metabs = pd.read_csv('../data/external/validation_data_sets/stationary_phase/cleaned_metabolites_data.csv', index_col = 0)
starve_metabs[starve_metabs.columns[1:]] *= 1e-6

# KdMetabolite needs to change more varied condition by condition, maybe let it be a variable and just set its upper and lower limits here?
# it shouldn't change between samples though, just one hard number
TF_flags_df = pd.read_csv(TF_flags_filepath, index_col = 0)


# first let's find min and max to set for those that don't have anything
min_kd = 9e99
max_kd = 0
for index, row in TF_flags_df.iterrows():
    act_ct = 0
    inh_ct = 0
    for col in TF_flags_df.columns:
        if 'cAct' in col:
            act_ct += TF_flags_df.at[index, col]
        elif 'cInh' in col:
            inh_ct += TF_flags_df.at[index, col]
    if act_ct > 0:
        change_col = 'kd_act_metab'
        other = 'kd_inh_metab'
    if inh_ct > 0:
        change_col = 'kd_inh_metab'
        other = 'kd_act_metab'
    try:
        metabolites = ast.literal_eval(row['effectors'])
    except:
        metabolites = []
    metabolites = list(set(metabolites).intersection(starve_metabs.index))
    if len(metabolites) == 0:
        continue
    
    KdArg_max = max(starve_metabs.loc[metabolites][starve_metabs.columns[1:]].sum().values.flatten())
    KdArg_min = min(starve_metabs.loc[metabolites][starve_metabs.columns[1:]].sum().values.flatten())
    if KdArg_min < min_kd:
        min_kd = KdArg_min
    if KdArg_max > max_kd:
        max_kd = KdArg_max

for index, row in TF_flags_df.iterrows():
    act_ct = 0
    inh_ct = 0
    for col in TF_flags_df.columns:
        if 'cAct' in col:
            act_ct += TF_flags_df.at[index, col]
        elif 'cInh' in col:
            inh_ct += TF_flags_df.at[index, col]
    if act_ct > 0:
        change_col = 'kd_act_metab'
        other = 'kd_inh_metab'
    if inh_ct > 0:
        change_col = 'kd_inh_metab'
        other = 'kd_act_metab'
    try:
        metabolites = ast.literal_eval(row['effectors'])
    except:
        metabolites = []
    metabolites = list(set(metabolites).intersection(starve_metabs.index))
    if len(metabolites) == 0:
        # set dummies and continue
        TF_flags_df.at[index, 'kd_act_metab_up'] = max_kd * min_max_fold_change
        TF_flags_df.at[index, 'kd_act_metab_lo'] = min_kd / min_max_fold_change
        TF_flags_df.at[index, 'kd_inh_metab_up'] = max_kd * min_max_fold_change
        TF_flags_df.at[index, 'kd_inh_metab_lo'] = min_kd / min_max_fold_change
        continue
    
    KdArg_max = max(starve_metabs.loc[metabolites][starve_metabs.columns[1:]].sum().values.flatten())
    KdArg_min = min(starve_metabs.loc[metabolites][starve_metabs.columns[1:]].sum().values.flatten())
    TF_flags_df.at[index, change_col+'_up'] = KdArg_max * min_max_fold_change
    TF_flags_df.at[index, change_col+'_lo'] = KdArg_min / min_max_fold_change
    
    # need to set dummy values to the other (won't be used, but will error out GAMS if I don't)
    TF_flags_df.at[index, other+'_up'] = KdArg_max * min_max_fold_change
    TF_flags_df.at[index, other+'_lo'] = KdArg_min / min_max_fold_change
    
TF_flags_df.fillna(0).to_csv(TF_flags_filepath)

# proteomics conversions for constraints

In [7]:
# loading
metab_df = pd.read_excel('../data/external/validation_data_sets/heinemann_protein_conc.xlsx', index_col = 0)
metab_cols = ['Glucose', 'Acetate', 'Glycerol', 'Pyruvate', 'Galactose ', 'Succinate', 'Fructose']

# load in un-logged tpm
# merge together log_tpm_df files
log_tpm_df = pd.read_csv('../data/external/imodulon_info/log_tpm.csv', index_col = 0)
starve_log_tpm = pd.read_csv('../data/external/validation_data_sets/stationary_phase/cleaned_log_tpm_qc.csv', index_col = 0)
to_blank_inds = list(set(log_tpm_df.index) - set(starve_log_tpm.index))
# need to create zero rows for missing values
zeros_data = {col : 0 for col in starve_log_tpm.columns}
zeros_df = pd.DataFrame(zeros_data, index = to_blank_inds)
starve_log_tpm = pd.concat([starve_log_tpm, zeros_df])
starve_log_tpm = starve_log_tpm.loc[log_tpm_df.index]
log_tpm_df = pd.concat([starve_log_tpm, log_tpm_df], axis = 1)


tpm_df = 2**log_tpm_df
overlap = list(set(tpm_df.index).intersection(set(metab_df.index)))
metab_df = metab_df.loc[overlap]
tpm_df = tpm_df.loc[overlap]




# scale tpm_df by the minimum and maximum
new_df = pd.DataFrame(index = tpm_df.index, columns = tpm_df.columns)

vals = metab_df[metab_cols].values.flatten()
vals = vals[vals != 0]
global_min = min(vals)
global_max = max(vals)

to_rem = []
for gene in overlap:
    min_val = min(metab_df[metab_cols].loc[gene])
    max_val = max(metab_df[metab_cols].loc[gene])
    if min_val == max_val: # no good data, just pull from overall max and min
        min_val = global_min
        max_val = global_max
        to_rem.append(gene) # used to use this to remove samples

    tpm_min_val = min(tpm_df.loc[gene])
    tpm_max_val = max(tpm_df.loc[gene])
    
    for sample in tpm_df.columns:
        scaled_tpm = (tpm_df.loc[gene][sample] - tpm_min_val) / (tpm_max_val - tpm_min_val)
        new_val = scaled_tpm*(max_val - min_val) + min_val
        new_df.at[gene, sample] = new_val
new_df.to_csv('../data/external/validation_data_sets/converted_log_tpm_in_M.csv')

invalid value encountered in scalar divide
invalid value encountered in scalar divide
invalid value encountered in scalar divide
invalid value encountered in scalar divide
invalid value encountered in scalar divide
invalid value encountered in scalar divide
invalid value encountered in scalar divide
