In [1]:
import cptac
import pandas as pd
import cptac.utils as ut

In [2]:
# Instantiate one cptac dataset object per cancer type analysed below.
# Each is kept in its own module-level name because later cells refer to
# them individually (cancer_list / cancer_dict).
# NOTE(review): the cell output suggests construction also checks a data
# index for updates, so this cell may need network access -- confirm.
luad = cptac.Luad()
lscc = cptac.Lscc()
hnscc = cptac.Hnscc()
en = cptac.Endometrial()
ccrcc = cptac.Ccrcc()

Checking that hnscc index is up-to-date...



                                                

In [3]:
# Dataset objects to process, in the order their results are concatenated.
cancer_list = [
    luad,
    lscc,
    hnscc,
    en,
    ccrcc,
]

In [4]:
# Maps each dataset object to the short label used in the 'cancer' column
# and in the generated command lines.
cancer_dict = {
    lscc: 'lscc',
    luad: 'luad',
    hnscc: 'hnscc',
    en: 'en',
    ccrcc: 'ccrcc',
}

In [6]:
# Build a table of the most frequently mutated genes per cancer.
#
# For every dataset in cancer_list:
#   1. take the somatic mutation table and drop 'Silent' and 'RNA' mutations,
#   2. count, per gene, the number of distinct row labels (index values) that
#      carry at least one remaining mutation,
#   3. divide by the number of distinct row labels in the tumor
#      transcriptomics table to get a mutation frequency,
#   4. keep the TOP_N_GENES genes with the highest frequency.
# The per-cancer tables are concatenated and finally restricted to genes
# mutated in at least MIN_MUTATED_SAMPLES samples.
# NOTE(review): assumes both tables are indexed by sample/patient ID -- confirm.

# Selection thresholds (tune here rather than editing the loop body).
TOP_N_GENES = 10            # keep at most this many genes per cancer
MIN_MUTATED_SAMPLES = 15    # final filter on the number of mutated samples
EXCLUDED_MUTATION_TYPES = ['Silent', 'RNA']

mutation_frequency_dfs = []
for cancer in cancer_list:
    mutation_df = cancer.get_somatic_mutation()
    mutation_df = mutation_df[~mutation_df.Mutation.isin(EXCLUDED_MUTATION_TYPES)]

    gene_df = cancer.get_transcriptomics('tumor')
    if isinstance(gene_df.columns, pd.MultiIndex):
        gene_df = gene_df.droplevel('Database_ID', axis=1)

    # Denominator is the same for every gene of this cancer, so compute it once.
    total = len(pd.unique(gene_df.index))

    # One row per distinct (sample, gene) pair; drop_duplicates keeps first
    # occurrences, so genes stay in order of first appearance.
    sample_gene_pairs = pd.DataFrame({
        'Sample': mutation_df.index.to_numpy(),
        'Gene': mutation_df.Gene.to_numpy(),
    }).drop_duplicates()
    # sort=False preserves that first-appearance order in the result.
    num_mutated = sample_gene_pairs.groupby('Gene', sort=False).size()

    if total == 0 and len(num_mutated) > 0:
        # Fail loudly instead of letting the vectorised division produce inf.
        raise ZeroDivisionError(
            f"no tumor transcriptomics samples for '{cancer_dict[cancer]}'; "
            "cannot compute mutation frequencies"
        )

    df = pd.DataFrame({
        'Gene': num_mutated.index.to_numpy(),
        'Mutation_Frequency': num_mutated.to_numpy() / total,
        'Num_mutated': num_mutated.to_numpy(),
    })
    # .assign returns a new frame, so the label is not written into a slice
    # of the sorted table (which can raise SettingWithCopyWarning).
    df = (
        df.sort_values(by='Mutation_Frequency', ascending=False)
        .head(TOP_N_GENES)
        .assign(cancer=cancer_dict[cancer])
    )
    mutation_frequency_dfs.append(df)

mutation_frequency_df = pd.concat(mutation_frequency_dfs)
mutation_frequency_df = mutation_frequency_df[
    mutation_frequency_df.Num_mutated >= MIN_MUTATED_SAMPLES
]

In [7]:
# Number of gene/cancer pairs that passed the filters.
mutation_frequency_df.shape[0]

45

In [8]:
# Write one shell command per gene/cancer pair for finding its permutation
# p-value. Each line invokes supercomputer_transmutation_effects.py with the
# cancer label, the gene name and a trailing count.
N_PERMUTATIONS = 10000  # presumably the number of permutations -- confirm against the script

# newline='\n' keeps Unix line endings even when this notebook runs on
# Windows, so the generated .sh file stays runnable under bash.
# The with-block closes the file on exit; no explicit close() is needed.
with open('transmutation_effects_permutations.sh', 'w', newline='\n') as script:
    for cancer_name, gene in zip(mutation_frequency_df['cancer'], mutation_frequency_df['Gene']):
        script.write(
            f'python supercomputer_transmutation_effects.py {cancer_name} {gene} {N_PERMUTATIONS}\n'
        )
    