In [1]:
import cptac
import pandas as pd
import cptac.utils as ut

In [2]:
# Instantiate one cptac dataset object per cancer type. Each name is reused
# by later cells, both as a data source and as a dictionary key.
luad  = cptac.Luad()
lscc  = cptac.Lscc()
hnscc = cptac.Hnscc()
en    = cptac.Endometrial()
ccrcc = cptac.Ccrcc()

Checking that hnscc index is up-to-date...



                                                

In [3]:
# Datasets to analyse, in the order the per-cancer loop visits them.
cancer_list = [
    luad,
    lscc,
    hnscc,
    en,
    ccrcc,
]

In [4]:
# Short text label for each dataset object; looked up with the dataset
# object itself as the key.
cancer_dict = {
    lscc: 'LSCC',
    luad: 'LUAD',
    hnscc: 'HNSCC',
    en: 'Endometrial',
    ccrcc: 'CCRCC',
}

In [5]:
# Mutation labels that are filtered out before counting.
EXCLUDED_MUTATIONS = ['Silent', 'RNA', 'synonymous SNV']
# Minimum number of rows a gene needs in BOTH the mutated and the wild-type
# group to be kept.
DEFAULT_MIN_SAMPLES = 15
# Lower cutoff applied to the `en` (Endometrial) dataset only.
ENDOMETRIAL_MIN_SAMPLES = 10
# Maximum number of genes reported per cancer.
TOP_N_GENES = 10


def _melt_tumor_omics(omics_df, value_name):
    """Reshape a wide patient-by-gene table into long format.

    If the columns are a MultiIndex, the 'Database_ID' level is dropped first.
    The input frame is not modified (the original cell inserted a column into
    it in place).

    Parameters
    ----------
    omics_df : pandas.DataFrame
        Wide table whose index identifies patients and whose columns identify
        genes.
    value_name : str
        Name of the value column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'Patient_ID', 'Gene' and `value_name`.
    """
    if isinstance(omics_df.columns, pd.MultiIndex):
        omics_df = omics_df.droplevel('Database_ID', axis=1)
    return (
        omics_df
        .assign(Patient_ID=omics_df.index)
        .melt(id_vars='Patient_ID', var_name='Gene', value_name=value_name)
    )


def _genes_with_enough_samples(cancer):
    """Return genes with enough mutated and wild-type rows in one dataset.

    Only rows with both a transcriptomics and a proteomics value are counted.

    Parameters
    ----------
    cancer : cptac dataset object
        One of the objects in `cancer_list`.

    Returns
    -------
    set
        Gene names that meet the minimum count in both groups.
    """
    mutations = cancer.get_somatic_mutation()
    mutations = mutations[~mutations.Mutation.isin(EXCLUDED_MUTATIONS)]

    # Look only at genes with both transcriptomic and proteomic data.
    transcript_long = _melt_tumor_omics(
        cancer.get_transcriptomics('tumor'), 'Transcriptomics')
    protein_long = _melt_tumor_omics(
        cancer.get_proteomics('tumor'), 'Proteomics')
    # The outer merge followed by dropna() keeps only patient/gene pairs that
    # have a value in both tables.
    omics_long = pd.merge(transcript_long, protein_long, how='outer').dropna()

    merged = pd.merge(omics_long, mutations.reset_index(), how='outer')
    merged = merged.drop(columns='Location')
    # True where a (non-excluded) mutation record matched the patient/gene pair.
    merged['Mutation'] = merged.Mutation.notnull()
    # Removes mutation records that have no matching omics measurements.
    merged = merged.dropna()

    # Filter to genes with a minimum number of rows in each group.
    # NOTE(review): size() counts merged rows, so a patient with several
    # mutation records in one gene would be counted more than once. Confirm
    # whether unique samples were intended.
    min_samples = ENDOMETRIAL_MIN_SAMPLES if cancer == en else DEFAULT_MIN_SAMPLES
    group_sizes = merged.groupby(['Mutation', 'Gene']).size()
    group_sizes = group_sizes[group_sizes >= min_samples]

    # The index entries are (Mutation, Gene) pairs.
    mutated_genes = {gene for mutated, gene in group_sizes.index if mutated}
    wildtype_genes = {gene for mutated, gene in group_sizes.index if not mutated}
    return mutated_genes & wildtype_genes


def _top_mutated_genes(cancer):
    """Return the most frequently mutated eligible genes for one dataset.

    Parameters
    ----------
    cancer : cptac dataset object
        One of the objects in `cancer_list`; must be a key of `cancer_dict`.

    Returns
    -------
    pandas.DataFrame
        At most TOP_N_GENES rows sorted by 'Unique_Samples_Mut' (descending),
        without the 'Missense_Mut' and 'Truncation_Mut' columns and with an
        added 'Cancer' label column.
    """
    eligible_genes = _genes_with_enough_samples(cancer)
    # Second argument 0: presumably a frequency cutoff, so no gene is removed
    # by frequency -- TODO confirm against cptac.utils.
    frequencies = ut.get_frequently_mutated(cancer, 0)
    return (
        frequencies[frequencies.Gene.isin(eligible_genes)]
        .sort_values('Unique_Samples_Mut', ascending=False)
        .drop(columns=['Missense_Mut', 'Truncation_Mut'])
        .head(TOP_N_GENES)
        # assign() returns a new frame, so the label is not written to a slice.
        .assign(Cancer=cancer_dict[cancer])
    )


frequency_frames = [_top_mutated_genes(cancer) for cancer in cancer_list]
mutation_frequency_df = pd.concat(frequency_frames)
mutation_frequency_df

Name,Gene,Unique_Samples_Mut,Cancer
9409,TP53,0.536364,LUAD
2622,EGFR,0.345455,LUAD
5580,MUC16,0.336364,LUAD
9680,TTN,0.318182,LUAD
7886,RYR2,0.318182,LUAD
5011,LRP1B,0.309091,LUAD
4711,KRAS,0.3,LUAD
10204,ZFHX4,0.236364,LUAD
3045,FAT3,0.209091,LUAD
1880,COL11A1,0.2,LUAD


In [6]:
# Write one SLURM batch script per gene/cancer pair. Each script runs the
# permutation p-value computation (transmutation_effects.py) for that pair.
# A launcher script that submits every generated script with `sbatch` is
# written as well.

# Number of permutations passed to transmutation_effects.py.
N_PERMUTATIONS = 10000
# Launcher file. It is rebuilt from scratch on every run of this cell, so
# re-running the cell does not append duplicate `sbatch` lines. Any existing
# file of that name is overwritten.
RUNNER_SCRIPT = 'run_transmutation_scripts.sh'

# NOTE(review): the notification e-mail address is hardcoded and is written
# into every generated script.
header = (
    '#!/bin/bash\n'
    '#SBATCH --time=168:00:00   # walltime\n'
    '#SBATCH --ntasks=2   # number of processor cores (i.e. tasks)\n'
    '#SBATCH --mem-per-cpu=8192M   # memory per CPU core\n'
    '#SBATCH --mail-user=nanelbarton@gmail.com   # email address\n'
    '#SBATCH --mail-type=BEGIN\n'
    '#SBATCH --mail-type=END\n'
    '# Set the max number of threads to use for programs using OpenMP. Should be <= ppn. Does nothing if the program doesn\'t use OpenMP.\n'
    'export OMP_NUM_THREADS=$SLURM_CPUS_ON_NODE\n'
)

with open(RUNNER_SCRIPT, 'w') as runner:
    for row in mutation_frequency_df.itertuples(index=False):
        file_name = 'trans_' + row.Cancer + '_' + row.Gene + '.sh'
        # The `with` block closes the file; no explicit close() is needed.
        with open(file_name, 'w') as script:
            script.write(header)
            script.write(
                'python3 transmutation_effects.py ' + row.Cancer + ' '
                + row.Gene + ' ' + str(N_PERMUTATIONS) + '\n'
            )
        runner.write('sbatch ' + file_name + '\n')
