In [1]:
#[gene for gene, estimates in fluxes_mles['nonsmoking'].items() if len(estimates) == 4]

Note: need to regraph results with the additional genes

For genes whose gamma from 110 (TP53 & KRAS) is significantly greater than the gamma from WT:

    find the genes whose gammas from 100 and from 010 are significantly different
        and
    identify which of the two is greater
For genes whose gammas from 110 and from WT are not significantly different:

    find the genes whose gamma from 100 or from 010 is significantly different from the rest of the gammas

In [2]:
import os
import numpy as np
import pandas as pd

from locations import location_output
from locations import pts_by_mutation_file
from locations import results_keys
from locations import samples_per_combination_files

## * Load mutation rates

def _read_mutation_rates(key):
    """Return the mutation rates stored in ``<key>_mutation_rates.txt``.

    The file is read from `location_output`; its first column becomes
    the dictionary key and its 'rate' column the value.
    """
    rates_file = os.path.join(location_output,
                              f'{key}_mutation_rates.txt')
    return pd.read_csv(rates_file, index_col=0)['rate'].to_dict()


# Every result key without 'plus' in its name has its own rates file.
mutation_rates = {key: _read_mutation_rates(key)
                  for key in results_keys if 'plus' not in key}

# The '*_plus' entries are filled from the rates files of 'smoking' and
# 'nonsmoking' respectively.
mutation_rates['smoking_plus'] = _read_mutation_rates('smoking')
mutation_rates['nonsmoking_plus'] = _read_mutation_rates('nonsmoking')

## * Compute fluxes, without epistasis

def compute_lambdas(gammas, mu):
    """Convert selection coefficients into fluxes: lambda = gamma * mu.

    Parameters
    ----------
    gammas : float, list or dict
        - float: a single selection coefficient.
        - list: its first two items are converted (e.g. the two ends of
          a confidence interval); any further items are ignored.
        - dict: every value is converted recursively, keys are kept.
    mu : float
        Mutation rate that multiplies every selection coefficient.

    Returns
    -------
    Same container type as `gammas`, holding the fluxes. Any other
    input type yields None.
    """
    if isinstance(gammas, float):
        return gammas*mu
    elif isinstance(gammas, list):
        return [gammas[0]*mu, gammas[1]*mu]
    elif isinstance(gammas, dict):
        # Recurse with compute_lambdas: the values are still gammas and
        # have to be multiplied by mu, not divided by it.
        return {x_y:compute_lambdas(the_gamma, mu)
                for x_y, the_gamma in gammas.items()}

## * Load fluxes with epistasis

def _load_fluxes(key, what):
    """Load the dictionary saved in ``<key>_fluxes_<what>.npy``.

    The file holds a pickled Python object wrapped in a 0-d array, hence
    ``allow_pickle=True`` followed by ``.item()``. Unpickling can run
    arbitrary code, so only files produced by this project should be
    placed in `location_output`.
    """
    fluxes_file = os.path.join(location_output,
                               f'{key}_fluxes_{what}.npy')
    return np.load(fluxes_file, allow_pickle=True).item()


# Maximum likelihood estimates of the fluxes, per result key.
fluxes_mles = {key: _load_fluxes(key, 'mles') for key in results_keys}


# Confidence intervals of the fluxes, per result key.
fluxes_cis = {key: _load_fluxes(key, 'cis') for key in results_keys}


## * Compute selection coefficients with epistasis

def compute_gammas(lambdas, mu):
    """Convert fluxes into selection coefficients: gamma = lambda / mu.

    `lambdas` may be a float (one flux), a list (only its first two
    items are converted) or a dict (values are converted recursively and
    keys are kept). Any other type yields None.
    """
    if isinstance(lambdas, dict):
        converted = {}
        for x_y, the_lambda in lambdas.items():
            converted[x_y] = compute_gammas(the_lambda, mu)
        return converted

    if isinstance(lambdas, list):
        return [
            lambdas[0]/mu,
            lambdas[1]/mu,
        ]

    if isinstance(lambdas, float):
        return lambdas/mu

    return None


def _selection_from_fluxes(fluxes, rates):
    """Divide every gene's fluxes by that gene's mutation rate.

    Genes are matched on the upper-cased name of the mutation-rate
    entry, and the rate is looked up through the same upper-cased name.
    Looking it up with the un-normalised key raises KeyError whenever
    the rates are not stored in upper case. Genes without a mutation
    rate are left out.

    Parameters
    ----------
    fluxes : dict
        gene -> flux estimates (anything `compute_gammas` accepts).
    rates : dict
        gene -> mutation rate.

    Returns
    -------
    dict
        gene -> selection estimates, in the order of `fluxes`.
    """
    rates_by_upper_name = {gene.upper(): rate for gene, rate in rates.items()}
    return {gene: compute_gammas(gene_fluxes, rates_by_upper_name[gene])
            for gene, gene_fluxes in fluxes.items()
            if gene in rates_by_upper_name}


# Maximum likelihood estimates of the selection coefficients.
selection_mles = {
    key: _selection_from_fluxes(fluxes_mles[key], mutation_rates[key])
    for key in results_keys}


# Confidence intervals of the selection coefficients.
selection_cis = {
    key: _selection_from_fluxes(fluxes_cis[key], mutation_rates[key])
    for key in results_keys}


## * Helper function to filter the results with epistasis

def filter_estimates(all_estimates, from_x_to_y, genes=None):
    """Pick one transition's estimate for each selected gene.

    Parameters
    ----------
    all_estimates : dict
        gene -> dict of estimates keyed by transition.
    from_x_to_y : hashable
        Key of the transition to extract from every gene's estimates.
    genes : container, optional
        Genes to keep. None (the default) keeps every gene.

    Returns
    -------
    dict
        gene -> estimate of `from_x_to_y`, in the order of `all_estimates`.
    """
    selected = {}
    for gene, estimates in all_estimates.items():
        if genes is None or gene in genes:
            selected[gene] = estimates[from_x_to_y]
    return selected


def filter_110_to_111(all_estimates, genes=None):
    """Per gene, keep only the (1, 1, 0) -> (1, 1, 1) estimate.

    Thin wrapper around `filter_estimates`; `genes` optionally
    restricts the genes that are returned.
    """
    origin, destination = (1, 1, 0), (1, 1, 1)
    return filter_estimates(all_estimates, (origin, destination), genes)

def filter_000_to_001(all_estimates, genes=None):
    """Per gene, keep only the (0, 0, 0) -> (0, 0, 1) estimate.

    Thin wrapper around `filter_estimates`; `genes` optionally
    restricts the genes that are returned.
    """
    origin, destination = (0, 0, 0), (0, 0, 1)
    return filter_estimates(all_estimates, (origin, destination), genes)

def filter_100_to_101(all_estimates, genes=None):
    """Per gene, keep only the (1, 0, 0) -> (1, 0, 1) estimate.

    Thin wrapper around `filter_estimates`; `genes` optionally
    restricts the genes that are returned.
    """
    origin, destination = (1, 0, 0), (1, 0, 1)
    return filter_estimates(all_estimates, (origin, destination), genes)

def filter_010_to_011(all_estimates, genes=None):
    """Per gene, keep only the (0, 1, 0) -> (0, 1, 1) estimate.

    Thin wrapper around `filter_estimates`; `genes` optionally
    restricts the genes that are returned.
    """
    origin, destination = (0, 1, 0), (0, 1, 1)
    return filter_estimates(all_estimates, (origin, destination), genes)



def provide_all_relevant_lambdas_and_gammas(results_keys=results_keys):
    """Collect the fluxes and selection coefficients of the third gene
    for every result key, origin genotype and kind of estimate.

    Parameters
    ----------
    results_keys : iterable
        Datasets to include (e.g. 'pan_data', 'smoking', 'nonsmoking').
        Each one must be a key of `fluxes_mles`, `fluxes_cis`,
        `selection_mles` and `selection_cis`.

    Returns
    -------
    dict
        ``{'lambdas': lambdas, 'gammas': gammas}``. Both values are
        dictionaries keyed by tuples of the form

             (result_key, origin, what)

        origin is the genotype in which the third gene is acquired:
            - 'from_000': (0, 0, 0) -> (0, 0, 1)
            - 'from_100': (1, 0, 0) -> (1, 0, 1)
            - 'from_010': (0, 1, 0) -> (0, 1, 1)
            - 'from_110': (1, 1, 0) -> (1, 1, 1)

        what refers to the estimation:
            - 'mles': for the maximum likelihood estimator
            - 'cis': for the confidence interval (given as a two item
              list)

        Each value is another dictionary with the third gene as key and
        the respective estimate as value.
    """
    # Order matters: it fixes the insertion order of the returned keys
    # (origin first, then 'mles' before 'cis', then the result keys).
    transition_filters = (('from_110', filter_110_to_111),
                          ('from_000', filter_000_to_001),
                          ('from_100', filter_100_to_101),
                          ('from_010', filter_010_to_011))

    estimate_sources = {
        'lambdas': (('mles', fluxes_mles), ('cis', fluxes_cis)),
        'gammas': (('mles', selection_mles), ('cis', selection_cis)),
    }

    collected = {}
    for quantity, sources in estimate_sources.items():
        collected[quantity] = {
            (key, origin, what): transition_filter(estimates[key])
            for origin, transition_filter in transition_filters
            for what, estimates in sources
            for key in results_keys}

    return collected

def find_significant_differences(ci_list_1, ci_list_2):
    """Find the genes whose two confidence intervals do not overlap.

    Parameters
    ----------
    ci_list_1, ci_list_2 : dict
        gene -> interval, where item 0 is the lower and item 1 the upper
        bound. Every gene of `ci_list_1` must also be in `ci_list_2`.

    Returns
    -------
    dict
        'first_greater': genes whose interval in `ci_list_1` lies
        entirely above the one in `ci_list_2`; 'second_greater': the
        opposite case. Both lists follow the order of `ci_list_1`.
    """
    first_above, second_above = [], []
    for gene, cis in ci_list_1.items():
        other_cis = ci_list_2[gene]
        if other_cis[1] < cis[0]:
            first_above.append(gene)
        if cis[1] < other_cis[0]:
            second_above.append(gene)

    return {'first_greater': first_above, 'second_greater': second_above}

## * Number of patients with mutation per gene

# Table read from `pts_by_mutation_file`, indexed by its first column.
pts_per_mutation = pd.read_csv(pts_by_mutation_file, index_col=0)


## * Patients per mutation combination for all TP53, KRAS, and third gene models

# One table per result key, indexed by its 'third gene' column.
samples_per_combination = dict(
    (key, pd.read_csv(samples_per_combination_files[key],
                      index_col='third gene'))
    for key in results_keys)

In [259]:
# Gather every flux and selection estimate in one dictionary with the
# keys 'lambdas' and 'gammas'.
results = provide_all_relevant_lambdas_and_gammas()

In [4]:
# Compare, gene by gene, the pan_data confidence intervals of gamma from
# 110 (first argument) and from 000 (second argument).
signif_differences = find_significant_differences(
    results['gammas'][('pan_data', 'from_110', 'cis')],
    results['gammas'][('pan_data', 'from_000', 'cis')])

# Relabel the outcome after the background with the greater gamma.
signif_differences = {'TP53_KRAS': signif_differences['first_greater'],
                      'WT': signif_differences['second_greater']}

In [5]:
# Genes with a significantly greater gamma from TP53 & KRAS, and genes
# with a significantly greater gamma from WT.
TP53_KRAS_greater, WT_greater = (signif_differences['TP53_KRAS'],
                                 signif_differences['WT'])

In [6]:
# Shorthand for the selection coefficients, keyed by (result_key, origin, what).
gammas = results['gammas']

In [7]:
# Confidence intervals from 100 (TP53) and from 010 (KRAS), restricted to
# the genes whose gamma is greater from TP53 & KRAS than from WT.
TP53_cis, KRAS_cis = (
    {gene: cis
     for gene, cis in gammas['pan_data', origin, 'cis'].items()
     if gene in TP53_KRAS_greater}
    for origin in ('from_100', 'from_010'))

signif_differences = find_significant_differences(TP53_cis, KRAS_cis)
# Popping leaves `signif_differences` empty afterwards.
TP53_greater, KRAS_greater = (signif_differences.pop(outcome)
                              for outcome in ('first_greater', 'second_greater'))
# Genes with no significant difference between the two backgrounds.
similar_gammas = list(set(TP53_KRAS_greater).difference(TP53_greater, KRAS_greater))

In [8]:
# Print the three groups computed in the previous cell, one per heading.
print(f'TP53: \n\t{TP53_greater}\nKRAS: \n\t{KRAS_greater}\nSimilar: \n\t{similar_gammas}')

TP53: 
	['FAT1', 'PDGFRA', 'RB1', 'NF1', 'PAPPA2', 'CDC27', 'EML4', 'MET', 'KDR', 'LYST', 'PTEN', 'BRAF', 'ROS1', 'APC', 'RAD17', 'CCND1', 'FGFR1', 'NOTCH2', 'PRKCG', 'ERBB4', 'EPHA7']
KRAS: 
	['RBM10', 'U2AF1', 'STK11', 'KEAP1', 'ATM', 'NKX2-1']
Similar: 
	['PIK3CA', 'CBL', 'PTPRD', 'MSH2', 'KMT2D', 'ERBB2', 'SMARCA4', 'PPP2R1A', 'RET', 'ARID1A', 'FBXW7', 'FGFR4', 'NFE2L2', 'FLT4', 'PXDNL', 'SETD2', 'MGA', 'MYC', 'POLD1', 'MTOR', 'AXL', 'PRKDC', 'BRCA1', 'INHBA', 'MAP2K1', 'BRCA2', 'MECOM', 'SMAD4', 'GRM1', 'PBRM1', 'TLR4', 'NCOR2', 'POLR2A', 'TSC2', 'PARP4', 'AKT1', 'EPHA3', 'RHPN2', 'POLE', 'FGFR3', 'LRP1B', 'TERT', 'ZMYM2', 'TSC1', 'ALK', 'NAV3', 'SETBP1', 'SMG1', 'RYR1', 'CCNE1', 'ATF7IP', 'BRD3', 'SCN8A', 'NTRK2', 'FGFR2', 'SLIT3']


In [290]:
# NOTE(review): TP53_KRAS_cis is not assigned in any cell above this one; it
# is first assigned in the In [10] and In [13] cells further down. This cell
# therefore depends on the order in which the cells were run, and the gene
# subset held by TP53_KRAS_cis, KRAS_cis and TP53_cis at that moment cannot
# be determined from here. TODO: build the three dictionaries explicitly.
tmp1 = find_significant_differences(TP53_KRAS_cis, KRAS_cis)
tmp2 = find_significant_differences(TP53_KRAS_cis, TP53_cis)

# Only the last expression of the cell is displayed; the two bare
# `second_greater` expressions below produce no output.
# STK11 has higher gamma from KRAS than from both
tmp1['second_greater']
# No genes have higher gamma from TP53 than from both
tmp2['second_greater']
#29 genes have higher gamma from both than from TP53 or KRAS
len(set(tmp1['first_greater']).intersection(set(tmp2['first_greater'])))

29

In [291]:
# Genes listed as 'first_greater' in both comparisons.
both_greater_than_single = set(tmp1['first_greater']) & set(tmp2['first_greater'])

In [308]:
from math import log10

# Maximum likelihood gammas from 100 (TP53), 010 (KRAS) and 110 (both).
TP53_mles, KRAS_mles, TP53_KRAS_mles = (
    gammas['pan_data', origin, 'mles']
    for origin in ('from_100', 'from_010', 'from_110'))

# How much greater is the TP53 & KRAS gamma than the from TP53 or KRAS gamma alone
# Each row: (gene, log10 of the gamma from both rounded to one decimal,
#            ratio to the gamma from TP53, ratio to the gamma from KRAS).
[(gene,
  round(log10(TP53_KRAS_mles[gene]), 1),
  TP53_KRAS_mles[gene]/TP53_mles[gene],
  TP53_KRAS_mles[gene]/KRAS_mles[gene])
 for gene in TP53_KRAS_mles
 if gene in both_greater_than_single]

[('PIK3CA', 5.2, 2.3459766631340817, 2.677015236275196),
 ('KMT2D', 5.1, 1.6828664566369838, 1.776685472954977),
 ('FLT4', 4.8, 2.1734642194703064, 2.259821819475277),
 ('SETD2', 5.3, 2.0963003427850695, 2.1726127511155466),
 ('PRKDC', 5.2, 1.7947264719986917, 2.6342291392243395),
 ('FAT1', 5.2, 1.7616672379905869, 3.140072581902895),
 ('TSC2', 4.8, 1.9814905232289415, 2.707849780660968),
 ('ALK', 4.4, 1.9950272107377085, 2.6274122592162343),
 ('CDC27', 4.9, 6.267807707596889, 1.6889414090552113e+40),
 ('U2AF1', 5.1, 7.48647590231456, 2.6966977199365703),
 ('PTPRD', 4.6, 2.0129869198310337, 2.500734140949388),
 ('RET', 4.6, 2.761015727799633, 3.6604666188516877),
 ('FBXW7', 4.6, 2.4594400860576537, 4.101134996377549),
 ('MGA', 5.3, 2.914934832325393, 2.5152054163170092),
 ('MTOR', 4.9, 2.524824206243106, 3.2678798099419892),
 ('KDR', 4.5, 1.5992473687689355, 2.7299730678820717),
 ('PARP4', 4.8, 3.9840876827836493, 5.606860709029201),
 ('FGFR2', 4.6, 2.981992923397988, 3.499229583907838

In [331]:

# Show the pan_data MLE gamma of ARID1A for every key of `gammas`, as one
# single-entry dictionary per key.
[[{key:gamma} for gene, gamma in gamma_dict.items() if gene == 'ARID1A'] for key, gamma_dict in gammas.items() if key[0] == 'pan_data' and key[2] == 'mles']

[[{('pan_data', 'from_110', 'mles'): 214328.67626909877}],
 [{('pan_data', 'from_000', 'mles'): 24380.955076490493}],
 [{('pan_data', 'from_100', 'mles'): 111089.03769690602}],
 [{('pan_data', 'from_010', 'mles'): 76523.04997499476}]]

In [294]:
# Confidence intervals from 100 (TP53) and from 010 (KRAS), restricted to
# the genes whose gamma is greater from WT than from TP53 & KRAS.
TP53_cis, KRAS_cis = (
    {gene: cis
     for gene, cis in gammas['pan_data', origin, 'cis'].items()
     if gene in WT_greater}
    for origin in ('from_100', 'from_010'))

find_significant_differences(TP53_cis, KRAS_cis)

{'first_greater': ['EGFR', 'KIF5B'], 'second_greater': []}

In [10]:
# Confidence intervals from each of the four origins, restricted to the
# genes whose gamma is greater from WT than from TP53 & KRAS.
WT_cis, TP53_cis, KRAS_cis, TP53_KRAS_cis = (
    {gene: cis
     for gene, cis in gammas['pan_data', origin, 'cis'].items()
     if gene in WT_greater}
    for origin in ('from_000', 'from_100', 'from_010', 'from_110'))

# WT against TP53, then WT against KRAS.
print(find_significant_differences(WT_cis, TP53_cis))
print(find_significant_differences(WT_cis, KRAS_cis))

{'first_greater': [], 'second_greater': ['EGFR']}
{'first_greater': ['EGFR', 'KIF5B'], 'second_greater': []}


The majority of genes (84/93) show a significant difference between the gamma from WT and the gamma from TP53 & KRAS. For 82 of these 84 genes the gamma from TP53 & KRAS is the greater one; for only 2 it is the gamma from WT. Not accounting for changes in mutation rate might contribute to this gap.

For the genes where the gamma is significantly greater after TP53 & KRAS than from WT, most (55/82) have no significant difference in gamma between TP53 and KRAS

21 genes have a greater gamma after TP53 than after KRAS. 
6 genes have a greater gamma after KRAS than after TP53.

For the 2 genes where the gamma from WT is significantly greater than from TP53 & KRAS, both (EGFR & KIF5B) have a higher gamma after TP53 than after KRAS. The gamma from WT is greater than the gamma from KRAS for both genes but when compared to the gamma from TP53, there is an insignificant difference for KIF5B, and for EGFR, the gamma from TP53 is greater.

In [11]:
genes = gammas['pan_data','from_110','cis'].keys()

# Gamma from 110 (first argument) against gamma from 000 (second argument).
signif_differences = find_significant_differences(
    gammas['pan_data', 'from_110', 'cis'],
    gammas['pan_data', 'from_000', 'cis'])

# Genes that appear in neither of the two significant groups.
similar_gammas = list(set(genes).difference(*signif_differences.values()))

In [12]:
# Confidence intervals from 100 (TP53) and from 010 (KRAS), restricted to
# the genes in `similar_gammas`.
TP53_cis, KRAS_cis = (
    {gene: cis
     for gene, cis in gammas['pan_data', origin, 'cis'].items()
     if gene in similar_gammas}
    for origin in ('from_100', 'from_010'))
find_significant_differences(TP53_cis, KRAS_cis)

{'first_greater': ['NRAS', 'RASA1'], 'second_greater': []}

Among the few genes (9/93) that show no significant difference between the gamma from WT and the gamma from TP53 & KRAS, 2 (NRAS & RASA1) have a greater gamma from TP53 than from KRAS. This may again be due to mutation rate bias, but NRAS is an equivalent of KRAS and would likely be negatively epistatic with it.

In [13]:
# Confidence intervals from each of the four origins, restricted to the
# genes whose gamma is greater from TP53 & KRAS than from WT.
WT_cis, TP53_cis, KRAS_cis, TP53_KRAS_cis = (
    {gene: cis
     for gene, cis in gammas['pan_data', origin, 'cis'].items()
     if gene in TP53_KRAS_greater}
    for origin in ('from_000', 'from_100', 'from_010', 'from_110'))

# WT against TP53, then WT against KRAS.
print(find_significant_differences(WT_cis, TP53_cis))
print(find_significant_differences(WT_cis, KRAS_cis))

{'first_greater': [], 'second_greater': ['PIK3CA', 'MSH2', 'KMT2D', 'FLT4', 'PXDNL', 'SETD2', 'MYC', 'PRKDC', 'FAT1', 'RBM10', 'PBRM1', 'TLR4', 'TSC2', 'PDGFRA', 'AKT1', 'RB1', 'FGFR3', 'TERT', 'NF1', 'TSC1', 'ALK', 'PAPPA2', 'SLIT3', 'EML4', 'STK11', 'MET', 'PTPRD', 'SMARCA4', 'RET', 'FBXW7', 'MGA', 'MTOR', 'AXL', 'KDR', 'PARP4', 'LRP1B', 'ZMYM2', 'LYST', 'KEAP1', 'ATM', 'NAV3', 'PTEN', 'SMG1', 'RYR1', 'CCNE1', 'FGFR2', 'BRAF', 'CBL', 'ROS1', 'ERBB2', 'PPP2R1A', 'ARID1A', 'NFE2L2', 'APC', 'POLD1', 'BRCA1', 'INHBA', 'BRCA2', 'CCND1', 'EPHA3', 'POLE', 'FGFR1', 'NOTCH2', 'SETBP1', 'ATF7IP', 'SCN8A', 'NKX2-1', 'PRKCG', 'ERBB4', 'FGFR4', 'MAP2K1', 'MECOM', 'SMAD4', 'GRM1', 'NCOR2', 'POLR2A', 'EPHA7', 'BRD3', 'NTRK2']}
{'first_greater': ['CDC27', 'EML4', 'RAD17'], 'second_greater': ['PIK3CA', 'KMT2D', 'FLT4', 'PXDNL', 'SETD2', 'MYC', 'PRKDC', 'FAT1', 'RBM10', 'PBRM1', 'TLR4', 'TSC2', 'PDGFRA', 'TERT', 'NF1', 'TSC1', 'ALK', 'PAPPA2', 'U2AF1', 'SLIT3', 'STK11', 'PTPRD', 'SMARCA4', 'RET', 'FBX

In [14]:
# WT against the unfiltered intervals from 100 (tmp1) and from 010 (tmp2);
# only the genes present in WT_cis are compared.
tmp1, tmp2 = (
    find_significant_differences(WT_cis, gammas['pan_data', origin, 'cis'])
    for origin in ('from_100', 'from_010'))

# Genes in 'second_greater' for only one of the two comparisons.
print(set(tmp1['second_greater']).difference(tmp2['second_greater']))
print(set(tmp2['second_greater']).difference(tmp1['second_greater']))

{'EML4', 'BRAF', 'ROS1', 'MET', 'MSH2', 'PRKCG', 'MAP2K1', 'MECOM', 'CCND1', 'NCOR2', 'PARP4', 'AKT1', 'RB1', 'FGFR3', 'ZMYM2', 'LYST', 'PTEN', 'ATF7IP', 'BRD3', 'FGFR2'}
{'U2AF1'}


Of the 82 genes, the majority have a higher gamma from TP53 or from KRAS than from WT. Most of these genes are significant for both backgrounds, but a few are significant only for the gamma from TP53 and not for the gamma from KRAS.

## Reason for EGFR epistatic difference between smokers and nonsmokers

In [28]:
#EGFR_pathway_genes = ['MUC','CTNNB1','CDH1','STAT','JAK','SRC','VAV', 'RHO']
# Manually curated starting list; some entries are gene families rather
# than gene symbols.
EGFR_pathway_genes = ['BRAF','MEK','ERK','MAPK','MAP2K1','PIK3CA','PTEN','AKT1','NFKB','RAS','RAF','RAC','TSC1','TSC2','NF1','JAK1','JAK2','SHC','SOS','GRB2','STAT','MYC','FOXO3A','IRS1','IRS2','PDK','AMPK1','STK11']


def _read_pathway_genes(gene_set_file):
    """Return the genes listed in a tab-separated gene-set file.

    NOTE(review): presumably the cell at row 18, column 1 holds the
    comma-separated member list of the export - confirm against the files.
    """
    return pd.read_csv(gene_set_file, sep = '\t').iloc[18,1].split(',')


RAS_pathway_genes = _read_pathway_genes('~/Downloads/PID_RAS_PATHWAY.v7.5.1.tsv')
PI3K_AKT_pathway_genes = _read_pathway_genes('~/Downloads/PID_PI3KCI_AKT_PATHWAY.v7.5.1.tsv')
MTOR_pathway_genes = _read_pathway_genes('~/Downloads/PID_MTOR_4PATHWAY.v7.5.1.tsv')

# Merge the four lists and drop duplicates (the resulting order is arbitrary).
EGFR_pathway_genes = list(set([*EGFR_pathway_genes,
                               *RAS_pathway_genes,
                               *PI3K_AKT_pathway_genes,
                               *MTOR_pathway_genes]))
#set(EGFR_pathway_genes) - set(RAS_pathway_genes + PI3K_AKT_pathway_genes + MTOR_pathway_genes)
# Remove KRAS; the popped value is the cell's displayed output.
EGFR_pathway_genes.pop(EGFR_pathway_genes.index('KRAS'))

'KRAS'

The cell below is an incomplete way of finding the genes in our gene list that belong to the EGFR pathway. Some of the manually defined entries are gene families; members of those families that are in our gene list are not detected.

In [31]:
from locations import gene_list_file
# First column of the gene list file, upper-cased, then cut to its first
# 103 entries.
# NOTE(review): the reason for the 103 cut-off is not visible here - confirm.
gene_list = [gene.upper()
             for gene in pd.read_csv(gene_list_file, header=None)[0]][:103]

# Keep only the pathway genes that are part of the gene list.
EGFR_pathway_genes = [gene
                      for gene in EGFR_pathway_genes
                      if gene in gene_list]


In [47]:
from filter_data import key_filtered_dbs, filter_samples_for_genes
from count_combinations import updated_compute_samples

keys = ['pan_data','smoking_plus','nonsmoking_plus']


def _count_kras_egfr_combinations(gene, key):
    """Count the samples per mutation combination of KRAS, EGFR and `gene`
    in the database stored under `key` in `key_filtered_dbs`."""
    model_genes = ['KRAS','EGFR',gene]
    return updated_compute_samples(
        filter_samples_for_genes(model_genes, key_filtered_dbs[key]),
        mutations = model_genes)


# key -> pathway gene -> sample counts of the KRAS/EGFR/gene model.
pts_per_combination = {
    key: {gene: _count_kras_egfr_combinations(gene, key)
          for gene in EGFR_pathway_genes}
    for key in keys}

In [48]:
from main import are_all_fluxes_computable

def at_least_000_to_010_and_001_to_011(samples):
    """Return whether the first two entries of `samples` are both positive.

    `samples` must support indexing with a list of positions (e.g. a
    numpy array). In the tables printed by `updated_compute_samples`,
    positions 0 and 1 are the (0, 0, 0) and (0, 0, 1) combinations.
    """
    first_two_counts = samples[[0, 1]]
    return np.all(first_two_counts > 0)

# key -> gene -> whether `are_all_fluxes_computable` accepts the counts.
fluxes_computable = {key:{gene:are_all_fluxes_computable(samples)
                          for gene, samples in patient_counts.items()}
                     for key, patient_counts in pts_per_combination.items()}

# key -> gene -> whether the first two sample counts are both positive.
some_fluxes_computable = {key:{gene:at_least_000_to_010_and_001_to_011(samples)
                               for gene, samples in patient_counts.items()}
                          for key, patient_counts in pts_per_combination.items()}

for key in keys:
    print(key)
    print('all fluxes computable: ', [gene for gene, value in fluxes_computable[key].items() if value == True])
    # The check behind `some_fluxes_computable` requires BOTH conditions
    # (np.all), so the label says "and" rather than "or".
    print('at least 000 to 010 and 001 to 011 computable: ', [gene for gene, value in some_fluxes_computable[key].items() if value == True])

pan_data
all fluxes computable:  ['PIK3CA', 'MYC', 'PRKDC', 'TSC2', 'AKT1', 'NF1', 'TSC1', 'STK11', 'NRAS', 'HRAS', 'MTOR', 'RASA1', 'PTEN', 'CCNE1', 'BRAF']
at least 000 to 010 or 001 to 011 computable:  ['PIK3CA', 'MYC', 'PRKDC', 'TSC2', 'AKT1', 'NF1', 'TSC1', 'STK11', 'NRAS', 'HRAS', 'MTOR', 'RASA1', 'PTEN', 'CCNE1', 'BRAF', 'MAP2K1']
smoking_plus
all fluxes computable:  ['PIK3CA', 'PRKDC', 'TSC2', 'NF1', 'TSC1', 'STK11', 'MTOR', 'RASA1', 'PTEN', 'BRAF']
at least 000 to 010 or 001 to 011 computable:  ['PIK3CA', 'MYC', 'PRKDC', 'TSC2', 'AKT1', 'NF1', 'TSC1', 'STK11', 'NRAS', 'HRAS', 'MTOR', 'RASA1', 'PTEN', 'CCNE1', 'BRAF', 'MAP2K1']
nonsmoking_plus
all fluxes computable:  []
at least 000 to 010 or 001 to 011 computable:  ['PIK3CA', 'MYC', 'PRKDC', 'TSC2', 'AKT1', 'NF1', 'TSC1', 'STK11', 'NRAS', 'HRAS', 'MTOR', 'RASA1', 'PTEN', 'BRAF', 'MAP2K1']


In [19]:
#no samples with EGFR + MAP2K1 in pan_data which is somewhat expected
gene = 'BRAF'
genes = ['KRAS','EGFR',gene]

# Print the sample table of the KRAS/EGFR/BRAF model for each smoking status.
for key in ('smoking', 'nonsmoking'):
    print(
        updated_compute_samples(filter_samples_for_genes(genes, key_filtered_dbs[key]),
                                mutations = genes,
                                print_info = True)
    )

   KRAS  EGFR  BRAF  Sample Count
0     0     0     0           377
1     0     0     1            35
2     0     1     0            40
3     0     1     1             6
4     1     0     0           220
5     1     0     1             6
6     1     1     0             1
7     1     1     1             1
[377  35  40   6 220   6   1   1]
   KRAS  EGFR  BRAF  Sample Count
0     0     0     0           182
1     0     0     1             9
2     0     1     0            78
3     0     1     1             2
4     1     0     0            22
5     1     0     1             0
6     1     1     0             0
7     1     1     1             0
[182   9  78   2  22   0   0   0]


TODO: 

There is a compute_all_lambdas function that could calculate the lambdas for the EGFR pathway genes, but it has not been integrated into produce_results, which we need for plotting
 - it may be better to just group the genes into pathways first

We also need to try the gene-compounding approach to calculating the samples

In [40]:
# Display the current contents of the pathway gene list.
EGFR_pathway_genes

['PIK3CA',
 'MYC',
 'PRKDC',
 'TSC2',
 'AKT1',
 'NF1',
 'TSC1',
 'STK11',
 'NRAS',
 'HRAS',
 'MTOR',
 'RASA1',
 'PTEN',
 'CCNE1',
 'BRAF',
 'MAP2K1']

In [246]:
from itertools import chain

# Model component -> genes that make up that component.
mutations = {'KRAS':['KRAS'],'EGFR':['EGFR'],'EGFR_pathway':EGFR_pathway_genes}
key = 'pan_data'

# Filter the database for every gene of every component.
db = filter_samples_for_genes(list(chain.from_iterable(mutations.values())),
                              key_filtered_dbs[key])

In [247]:
#create new column that represents if any gene in the EGFR pathway is mutated
pathway_grouped_db = db.assign(EGFR_pathway=db[EGFR_pathway_genes].sum(axis='columns').apply(lambda x: 1 if x > 1 else x))

samples = updated_compute_samples(pathway_grouped_db, mutations = list(mutations.keys()), print_info=True)

   KRAS  EGFR  EGFR_pathway  Sample Count
0     0     0             0           627
1     0     0             1           414
2     0     1             0           181
3     0     1             1            55
4     1     0             0           315
5     1     0             1           196
6     1     1             0             4
7     1     1             1            11


In [248]:
# Positions (in insertion order) of the entries of `mutations` that consist
# of exactly one gene.
[i for i, included in enumerate(mutations.values()) if len(included) == 1]

[0, 1]

In [251]:
from results_and_plotting import convert_samples_to_dict

key = 'pan_data'
location_dataset = os.path.join('../to_delete', key)

# Save the sample counts as <location_dataset>/<model name>/samples.npy,
# where the model name joins the component names with underscores.
np.save(
    os.path.join(location_dataset, '_'.join(mutations), 'samples.npy'),
    convert_samples_to_dict(samples))

In [255]:
# Names of the result files of one analysis (one `<name>.npy` file each).
# Defined here so that the cell does not depend on a later cell.
results_to_save = ["samples", "lambdas", "lambdas_cis", "mus",
                   "gammas", "gammas_cis"]

# Load every saved result of the current model once. The previous
# `for key, value in results.items()` wrapper reloaded the same files on
# every iteration, never used its loop variables and overwrote the
# global `key`.
results = {name: np.load(os.path.join(location_dataset,
                                      '_'.join(mutations.keys()),
                                      f'{name}.npy'),
                         allow_pickle=True).item()
           for name in results_to_save}

In [256]:
location_results = '../to_delete'
dataset = 'pan_data'
results_to_save = ["samples", "lambdas", "lambdas_cis", "mus",
                   "gammas", "gammas_cis"]


def _load_analysis_results(analysis):
    """Load every file named in `results_to_save` from the folder of
    `analysis` inside the dataset directory."""
    return {key: np.load(os.path.join(location_results,
                                      dataset, analysis,
                                      f'{key}.npy'),
                         allow_pickle=True).item()
            for key in results_to_save}


# analysis folder -> result name -> loaded object; hidden entries (names
# starting with '.') are skipped.
results = {analysis: _load_analysis_results(analysis)
           for analysis in os.listdir(os.path.join(location_results, dataset))
           if not analysis.startswith('.')}