In [1]:
import pandas as pd 
import numpy as np
from Utils_MDR1 import cancertype_clusters, variant_info
from typing import Tuple

## Functions

In [5]:
''' Gets the expression of patients in a certain cancer type cluster. Calc p-value '''

def get_pval_per_cluster(cluster_name: str, expression_df: pd.DataFrame, patients_with_mut: list, num_randomizations: int) -> Tuple[float, float, float]:
    
    cancers = cancertype_clusters[cluster_name] #cancer types in the cluster

    expressions_cur_cluster = expression_df[expression_df["project"].isin(cancers)].copy() #all the expression data of patients in this cluster
    expression_with_mut = expressions_cur_cluster[expressions_cur_cluster["case_id"].isin(patients_with_mut)].copy() #out of those, keep the expresssion of mutated patients
    if expression_with_mut.shape[0] == 0: 
        print(f"No patients in the mutated group are in the {cluster_name} cluster")
    
    expression_without_mut = expressions_cur_cluster[~expressions_cur_cluster['case_id'].isin(patients_with_mut)].copy() #out of those, keep the expresssion of non-mutated patients
    num_with_mut = expression_with_mut.shape[0]

    #get the mean and std of the carriers group
    mean_expression_mutated =  expression_with_mut["MDR1_expression (FPKM)"].mean() #get the mean of all ptients with the mutation
    std_expression_mutated = expression_with_mut["MDR1_expression (FPKM)"].std() 

    #sample from the control group, "num_randomizations" groups of the same size as the control group
    mean_expressions = np.zeros((num_randomizations,1)) #initilize results vector
    for i in range(num_randomizations):
        mean_sampled_expression = expression_without_mut.sample(n = num_with_mut)["MDR1_expression (FPKM)"].mean() #get the mean expression of the randomly sampled patients
        mean_expressions[i,0] = mean_sampled_expression

    # Check p-value
    p_value_larger = 1 - (np.sum(mean_expression_mutated > mean_expressions) / num_randomizations)
    return(p_value_larger, mean_expression_mutated, np.mean(mean_expressions))



## Main - Table S1

In [15]:
#create a df that holds the MDR1 expression levels of all TCGA patients that have measuremets.
expression_df = pd.read_pickle(f"../Data/MDR1_expression_df.pickle")
#get a dictionary containing the lists of patients with each mut and with the haplotypes. 
patients_dict = pd.read_pickle(f"../Data/mutated_patients_dict.pickle")

num_randomizations = 1000


In [16]:
dict_for_table = {}

## T1236C

In [17]:
variant_name = variant_info[1]["variant_name"]
patients_with_mut = patients_dict[variant_name]

dict_for_table[variant_name] = {}

for cluster in cancertype_clusters.keys():
    dict_for_table[variant_name][cluster] = {}

    pval, carrier_expression, non_carrier_expression = get_pval_per_cluster(cluster, expression_df, patients_with_mut, num_randomizations)
    dict_for_table[variant_name][cluster]["pval"] = pval
    dict_for_table[variant_name][cluster]["effect_size"] = carrier_expression / non_carrier_expression


No patients in the mutated group are in the Metabolic Cancers cluster


## T2677G

In [18]:
variant_name = variant_info[2]["variant_name"]
patients_with_mut = patients_dict[variant_name]

dict_for_table[variant_name] = {}

for cluster in cancertype_clusters.keys():
    dict_for_table[variant_name][cluster] = {}

    pval, carrier_expression, non_carrier_expression = get_pval_per_cluster(cluster, expression_df, patients_with_mut, num_randomizations)
    dict_for_table[variant_name][cluster]["pval"] = pval
    dict_for_table[variant_name][cluster]["effect_size"] = carrier_expression / non_carrier_expression


## T3435C

In [19]:
variant_name = variant_info[3]["variant_name"]
patients_with_mut = patients_dict[variant_name]

dict_for_table[variant_name] = {}

for cluster in cancertype_clusters.keys():
    dict_for_table[variant_name][cluster] = {}

    pval, carrier_expression, non_carrier_expression = get_pval_per_cluster(cluster, expression_df, patients_with_mut, num_randomizations)
    dict_for_table[variant_name][cluster]["pval"] = pval
    dict_for_table[variant_name][cluster]["effect_size"] = carrier_expression / non_carrier_expression


No patients in the mutated group are in the Metabolic Cancers cluster


In [20]:
dict_for_table

{'T1236C': {'Metabolic Cancers': {'pval': 1.0, 'effect_size': nan},
  'Proliferative Cancers': {'pval': 0.15900000000000003,
   'effect_size': 1.608421042685652},
  'Inflammatory Cancers': {'pval': 0.471, 'effect_size': 0.7589521705718972}},
 'T2677G': {'Metabolic Cancers': {'pval': 0.10199999999999998,
   'effect_size': 2.767549401864334},
  'Proliferative Cancers': {'pval': 0.42100000000000004,
   'effect_size': 0.895082451478333},
  'Inflammatory Cancers': {'pval': 0.268, 'effect_size': 1.1971016229986857}},
 'T3435C': {'Metabolic Cancers': {'pval': 1.0, 'effect_size': nan},
  'Proliferative Cancers': {'pval': 0.42800000000000005,
   'effect_size': 0.9448640981559827},
  'Inflammatory Cancers': {'pval': 0.41800000000000004,
   'effect_size': 0.9411369441147944}}}