In [1]:
import cptac.pancan as pc
import pandas as pd
import cptac.utils as ut

In [2]:
def get_frequently_mutated(cancer_object, cutoff = 0.1):
    """Return per-gene mutation frequencies among the tumor samples of a dataset.

    Silent calls ('Silent', 'RNA', 'synonymous SNV') are discarded. Every
    remaining call is classified as missense ('M') or truncation ('T'); for
    GBM, non-coding labels are additionally grouped as 'NC'. A sample is
    counted at most once per gene in each column.

    Parameters
    ----------
    cancer_object : object
        Dataset object providing ``get_clinical()`` (with a
        'Sample_Tumor_Normal' column), ``get_somatic_mutation()`` (with 'Gene'
        and 'Mutation' columns), ``get_cancer_type()`` and ``version()``.
        The mutation table's index is assumed to be named 'Patient_ID'.
    cutoff : float, default 0.1
        Only genes whose fraction of mutated tumor samples is strictly
        greater than this value are returned.

    Returns
    -------
    pandas.DataFrame
        One row per retained gene (fresh RangeIndex) with columns 'Gene',
        'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut' and, for GBM
        only, 'Non-Coding'. All values are fractions of the tumor sample
        count; a gene with no call in a category gets 0 in that column.
    """
    # Get total tumor count
    clinical_df = cancer_object.get_clinical()
    total_tumor_count = float((clinical_df['Sample_Tumor_Normal'] == 'Tumor').sum())

    # Get mutations data frame and drop silent calls (the label differs between datasets)
    somatic_mutations = cancer_object.get_somatic_mutation()
    silent_labels = ['Silent', 'RNA', 'synonymous SNV']
    somatic_mutations = somatic_mutations.loc[~somatic_mutations['Mutation'].isin(silent_labels)]

    origin_df = somatic_mutations.reset_index() # Patient_ID becomes a column so unique samples can be counted

    # Create two categories in Mutation column - 'M': Missense, 'T': Truncation
    # Exact equality is required here: `x in ('hnscc')` is a substring test on a plain string.
    cancer_type = cancer_object.get_cancer_type()
    dif_mut_names = (
        cancer_type == 'colon'
        or (cancer_type == 'hnscc' and cancer_object.version() == '0.1')
    )

    if dif_mut_names:
        missense_truncation_groups = {'frameshift substitution': 'T',
            'frameshift deletion': 'T', 'frameshift insertion': 'T',
            'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M','nonframeshift deletion': 'M',
            'nonframeshift substitution': 'M'}
    else:
        missense_truncation_groups = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M',
            'Missense_Mutation': 'M', 'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T',
            'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}

    # Recode only the Mutation column; other columns must not be touched by the mapping
    mutations_replaced_M_T = origin_df.assign(
        Mutation=origin_df['Mutation'].replace(missense_truncation_groups))

    gbm = cancer_type == 'gbm'
    if gbm:
        # replace non_coding mutations for Gbm
        non_coding = {'Intron': 'NC', 'RNA': 'NC', "5'Flank": 'NC', "3'Flank": 'NC',
            "5'UTR": 'NC', "3'UTR": 'NC', 'Splice_Region' : 'NC'}
        mutations_replaced_M_T = mutations_replaced_M_T.assign(
            Mutation=mutations_replaced_M_T['Mutation'].replace(non_coding))
    else:
        # Check that all mutation names are categorized
        unclassified = set(mutations_replaced_M_T['Mutation'].unique()) - {'M', 'T'}
        if unclassified:
            print('Warning: New mutation name not classified. Counts will be affected.')
            print(mutations_replaced_M_T['Mutation'].unique())

    def fraction_by_gene(mutations, column_name):
        # Fraction of tumor samples with >= 1 call per gene, as a (Gene, column_name) frame
        unique_samples = mutations.groupby('Gene')['Patient_ID'].nunique()
        return (unique_samples / total_tumor_count).rename(column_name).reset_index()

    # Find frequently mutated genes (total fraction > cutoff)
    freq_mutated_df = fraction_by_gene(origin_df, 'Unique_Samples_Mut')
    above_cutoff = freq_mutated_df['Unique_Samples_Mut'] > cutoff
    freq_mutated_df = freq_mutated_df.loc[above_cutoff].reset_index(drop=True)

    # Join one column per mutation category, always keyed on Gene
    category_columns = [('M', 'Missense_Mut'), ('T', 'Truncation_Mut')]
    if gbm:
        category_columns.append(('NC', 'Non-Coding'))

    for category, column_name in category_columns:
        in_category = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == category]
        freq_mutated_df = freq_mutated_df.merge(
            fraction_by_gene(in_category, column_name), on='Gene', how='left')

    # Genes without any call in a category have no match in the left merge -> 0
    freq_mutated_df = freq_mutated_df.fillna({name: 0 for _, name in category_columns})

    return freq_mutated_df



In [3]:
# Instantiate one cptac.pancan dataset object per cancer type used in this notebook.
# Constructing each object prints loading progress (see the output of this cell).
luad = pc.PancanLuad()
lscc = pc.PancanLscc()
hnscc = pc.PancanHnscc()
en = pc.PancanUcec()  # the UCEC dataset; labelled 'Endometrial' in cancer_dict
ccrcc = pc.PancanCcrcc()

Loading broadluad v1.0...                     



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(


Loading washuluad v1.0......                     

  all_df = tumor.append(normal)
  all_df = tumor.append(normal)
  all_df = tumor.append(normal)


Loading washuluad v1.0..........



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(


Loading washuluad v1.0...........Loading washuluad v1.0............Loading washuluad v1.0.............Loading washuluad v1.0..............Loading washuluad v1.0...............Loading washuluad v1.0................Loading washuluad v1.0.................Loading washuluad v1.0..................                                        Formatting washuluad dataframes...

  rna_combined = rna_tumor.append(rna_normal)


Loading broadlscc v1.0...                      



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(


Loading washulscc v1.0......                     

  all_df = tumor.append(normal)
  all_df = tumor.append(normal)
  all_df = tumor.append(normal)


Loading washulscc v1.0..........



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(


Loading washulscc v1.0...........Loading washulscc v1.0............Loading washulscc v1.0.............Loading washulscc v1.0..............Loading washulscc v1.0...............Loading washulscc v1.0................Loading washulscc v1.0.................Loading washulscc v1.0..................                                        Formatting washulscc dataframes...

  rna_combined = rna_tumor.append(rna_normal)


Loading broadhnscc v1.0...                     



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(


Loading washuhnscc v1.0......                    

  all_df = tumor.append(normal)
  all_df = tumor.append(normal)
  all_df = tumor.append(normal)


Loading washuhnscc v1.0..........



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(


Formatting washuhnscc dataframes...      

  rna_combined = rna_tumor.append(rna_normal)


Loading broaducec v1.0...                      



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(


Loading washuucec v1.0..........                 



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(


Loading washuucec v1.0...........Loading washuucec v1.0............Loading washuucec v1.0.............Loading washuucec v1.0..............Loading washuucec v1.0...............Loading washuucec v1.0................Loading washuucec v1.0.................Loading washuucec v1.0..................                                        Formatting washuucec dataframes...

  rna_combined = rna_tumor.append(rna_normal)


Loading broadccrcc v1.0...                     



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(


Loading washuccrcc v1.0........                  

  all_df = tumor.append(normal)
  all_df = tumor.append(normal)
  all_df = tumor.append(normal)


Loading washuccrcc v1.0..........



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(


Loading washuccrcc v1.0...........Loading washuccrcc v1.0............Loading washuccrcc v1.0.............Loading washuccrcc v1.0..............Loading washuccrcc v1.0...............Loading washuccrcc v1.0................Loading washuccrcc v1.0.................Loading washuccrcc v1.0..................                                         Formatting washuccrcc dataframes...

  rna_combined = rna_tumor.append(rna_normal)


                                               

In [4]:
# Dataset objects, in the order in which the analysis loop visits them.
cancer_list = [
    luad,
    lscc,
    hnscc,
    en,
    ccrcc,
]

In [5]:
# Display label for each dataset object; the label ends up in the 'Cancer' column.
cancer_dict = {
    lscc: 'LSCC',
    luad: 'LUAD',
    hnscc: 'HNSCC',
    en: 'Endometrial',
    ccrcc: 'CCRCC',
}

In [6]:
# For every cancer type, pick the (at most) ten most frequently mutated genes among
# those that have enough samples with omics data in both the wild-type and the
# mutated group. The result is one table with columns Gene, Unique_Samples_Mut, Cancer.
silent_labels = ['Silent', 'RNA', 'synonymous SNV']
per_cancer_frames = []
for cancer in cancer_list:
    mutation_df = cancer.get_somatic_mutation()
    mutation_df = mutation_df[~mutation_df.Mutation.isin(silent_labels)]

    # look only at genes with both transcriptomic and proteomic data
    gene_df = cancer.get_transcriptomics('washu', 'tumor')
    if isinstance(gene_df.columns, pd.MultiIndex):
        gene_df = gene_df.droplevel('Database_ID', axis = 1)
    prot_df = cancer.get_proteomics('umich', 'tumor')
    if isinstance(prot_df.columns, pd.MultiIndex):
        prot_df = prot_df.droplevel('Database_ID', axis = 1)

    # assign() works on a copy, so the frames handed out by the dataset object are not modified
    gene_df = gene_df.assign(Patient_ID = gene_df.index).melt(
        id_vars = 'Patient_ID', var_name = 'Gene', value_name = 'Transcriptomics')
    prot_df = prot_df.assign(Patient_ID = prot_df.index).melt(
        id_vars = 'Patient_ID', var_name = 'Gene', value_name = 'Proteomics')
    omics_df = pd.merge(gene_df, prot_df, on = ['Patient_ID', 'Gene'], how = 'outer').dropna()

    # Attach mutation calls (merged on the columns both tables share) and
    # reduce the Mutation column to a mutated / not-mutated flag
    df = pd.merge(omics_df, mutation_df.reset_index(), how = 'outer')
    df = df.drop(columns = 'Location')
    df['Mutation'] = df.Mutation.notnull()

    # filter to only include genes with a minimum number of samples in each group
    cutoff = 10 if cancer == en else 15
    sample_count = df.groupby(['Mutation', 'Gene']).size()
    sample_count = sample_count[sample_count >= cutoff]
    is_mutated = sample_count.index.get_level_values('Mutation')
    wt_genes = set(sample_count[is_mutated == False].index.get_level_values('Gene'))
    mutated_genes = set(sample_count[is_mutated == True].index.get_level_values('Gene'))
    genes = wt_genes & mutated_genes

    # get mutation frequency (cutoff 0 keeps every mutated gene) and keep the top ten
    freq_df = get_frequently_mutated(cancer, 0)
    top_genes = (
        freq_df[freq_df.Gene.isin(genes)]
        .sort_values('Unique_Samples_Mut', ascending = False)
        .head(10)[['Gene', 'Unique_Samples_Mut']]
        .assign(Cancer = cancer_dict[cancer])
    )
    per_cancer_frames.append(top_genes)

mutation_frequency_df = pd.concat(per_cancer_frames)
mutation_frequency_df

['M' 'Intron' 'T' "3'UTR" "5'Flank" "5'UTR" 'START_CODON_SNP' 'IGR'
 'DE_NOVO_START_OUT_FRAME' 'DE_NOVO_START_IN_FRAME' 'COULD_NOT_DETERMINE'
 'Translation_Start_Site' 'START_CODON_INS']
['T' 'M' 'Intron' "3'UTR" "5'UTR" "5'Flank" 'IGR' 'START_CODON_SNP'
 'DE_NOVO_START_IN_FRAME' 'COULD_NOT_DETERMINE' 'DE_NOVO_START_OUT_FRAME'
 'Translation_Start_Site' 'START_CODON_INS']
['M' 'Intron' 'T' "5'UTR" 'START_CODON_SNP' "5'Flank" "3'UTR" 'IGR'
 'Translation_Start_Site' 'DE_NOVO_START_IN_FRAME'
 'DE_NOVO_START_OUT_FRAME' 'COULD_NOT_DETERMINE']
['M' 'T' 'Intron' "5'Flank" "3'UTR" "5'UTR" 'IGR' 'START_CODON_SNP'
 'Translation_Start_Site' 'DE_NOVO_START_IN_FRAME'
 'DE_NOVO_START_OUT_FRAME' 'START_CODON_INS' 'COULD_NOT_DETERMINE']
['M' "3'UTR" 'T' 'Intron' "5'Flank" "5'UTR" 'IGR' 'COULD_NOT_DETERMINE'
 'DE_NOVO_START_OUT_FRAME' 'START_CODON_SNP']


  mutation_frequency_df.drop(mutation_frequency_df.columns.difference(['Gene','Unique_Samples_Mut', 'Cancer']), 1, inplace=True)


Unnamed: 0,Gene,Unique_Samples_Mut,Cancer
14587,TP53,0.540541,LUAD
15045,TTN,0.378378,LUAD
4055,EGFR,0.36036,LUAD
8689,MUC16,0.351351,LUAD
7456,KRAS,0.297297,LUAD
4707,FAT3,0.234234,LUAD
2844,COL11A1,0.207207,LUAD
3198,CTNNA2,0.198198,LUAD
10148,PCLO,0.189189,LUAD
750,APOB,0.189189,LUAD


In [7]:
# Write one SLURM job script per gene/cancer pair (permutation p-value run), plus a
# master script that submits all of them with sbatch.
# NOTE(review): the notification e-mail address is hardcoded in the header below.
header = (
    '#!/bin/bash\n'
    '#SBATCH --time=168:00:00   # walltime\n'
    '#SBATCH --ntasks=2   # number of processor cores (i.e. tasks)\n'
    '#SBATCH --mem-per-cpu=8192M   # memory per CPU core\n'
    '#SBATCH --mail-user=humberto.giraldez@gmail.com   # email address\n'
    '#SBATCH --mail-type=BEGIN\n'
    '#SBATCH --mail-type=END\n'
    '# Set the max number of threads to use for programs using OpenMP. Should be <= ppn. Does nothing if the program doesn\'t use OpenMP.\n'
    'export OMP_NUM_THREADS=$SLURM_CPUS_ON_NODE\n'
)

# The master script is opened once in 'w' mode so that re-running this cell rewrites
# it instead of appending a second copy of every sbatch line.
with open('run_transmutation_scripts.sh', 'w') as submit_file:
    for _, row in mutation_frequency_df.iterrows():
        file_name = 'trans_' + row.Cancer + '_' + row.Gene + '.sh'
        with open(file_name, 'w') as job_file:
            job_file.write(header)
            job_file.write('python3 transmutation_effects.py ' + row.Cancer + ' ' + row.Gene + ' 10000\n')
        submit_file.write('sbatch ' + file_name + '\n')
