## Step 1: Library Imports

Run this cell to import the necessary libraries

In [15]:
import pandas as pd
import numpy as np
import scipy.stats
import collections
import re
# NOTE(review): the two gseapy imports below are disabled, but the enrichment and
# plotting cells at the end of this notebook call gp.enrichr(...) and barplot(...).
# Re-enable them (with gseapy installed) before running those cells.
#import gseapy as gp
#from gseapy.plot import barplot, dotplot

import cptac
# Dataset object that every later cell queries; this notebook uses cptac.Colon().
# Presumably construction loads the data tables (see the loading message in the
# cell output) -- confirm against the cptac documentation.
cancer_object = cptac.Colon()
import cptac.algorithms as al

[Kding transcriptomics data.....ata....

## Step 2: Find the frequently mutated genes 

Set the cutoff for mutation frequency that you would like to use. (The cancer type is chosen in Step 1, where the dataset object is created.)

In [14]:
# Mutation-frequency cutoff handed to get_frequently_mutated below.
desired_cutoff = 0.2

# Table of frequently mutated genes for the loaded dataset.
fm = al.get_frequently_mutated(
    cancer_object,
    cutoff=desired_cutoff,
)
print(fm)

       Gene  Unique_Samples_Mut  Missence_Mut  Truncation_Mut
0    ACVR2A            0.227273      0.018182        0.227273
1       APC            0.745455      0.100000        0.727273
2    ARID1A            0.209091      0.063636        0.163636
3   CCDC168            0.263636      0.172727        0.172727
4     CSMD3            0.236364      0.200000        0.072727
5     DNAH5            0.236364      0.190909        0.090909
6      FAT3            0.254545      0.236364        0.027273
7      FAT4            0.236364      0.227273        0.027273
8     HMCN1            0.209091      0.200000        0.027273
9      KRAS            0.318182      0.318182        0.000000
10    MUC16            0.354545      0.318182        0.063636
11    MUC5B            0.245455      0.227273        0.045455
12    NCOR2            0.218182      0.172727        0.045455
13    OBSCN            0.263636      0.236364        0.063636
14     PCLO            0.218182      0.190909        0.090909
15   PIK

## Step 3: Select a gene from the list of frequently mutated genes above

Set `gene` to whichever of the frequently mutated genes listed above you want to examine. For example, to look at the gene PTEN, change the cell below to `gene = 'PTEN'`.

In [41]:
# Gene whose mutation status drives every comparison in the cells below.
gene = 'ACVR2A'

## Step 4: Select desired omics comparison

Select which type of omics you want to compare by leaving exactly one of the lines below uncommented. For example, to compare proteomics data, uncomment the `omics = cancer_object.get_proteomics()` line and comment out the others.

In [42]:
# Omics table used by the comparisons in Steps 5-7 (its .name attribute is also
# read there to pick the matching table). Keep exactly one assignment active.
omics = cancer_object.get_proteomics()
#omics = cancer_object.get_transcriptomics()
#omics = cancer_object.get_phosphoproteomics()
#omics = cancer_object.get_acetylproteomics()

## Step 5: cis comparison 

Test whether tumor samples in which the selected gene is mutated differ significantly from wildtype tumor samples in the mean of that same gene's omics measurement (a cis effect).

In [43]:
# Join the selected gene's mutation columns onto its own omics column(s),
# then keep only the tumor samples.
omics_and_mutations = cancer_object.append_mutations_to_omics(
    mutation_genes=[gene],
    omics_df_name=omics.name,
    omics_genes=gene,
)
tumors = omics_and_mutations[omics_and_mutations['Sample_Status'] == 'Tumor']

# Two-valued "Label" column: 'Wildtype_Tumor' is kept, anything else becomes 'Mutated'.
mutation_status_col = gene + '_Mutation_Status'
label = tumors[[mutation_status_col]].rename(columns={mutation_status_col: "Label"})
filter_mutations = label.where(label == 'Wildtype_Tumor')  # non-matching cells become NaN
binary_mutations = filter_mutations.fillna('Mutated')

# Strip the mutation/bookkeeping columns so only omics values remain, then attach the label.
columns_to_drop = [gene + suffix for suffix in ("_Mutation", "_Location", "_Mutation_Status")]
columns_to_drop.append("Sample_Status")
omics_df = tumors.drop(columns=columns_to_drop)
labeled_omics = omics_df.join(binary_mutations)

# Columns to test: every omics column (omics_df does not contain the Label column).
omics_col_list = omics_df.columns.tolist()

# t-test of mutated vs wildtype on the cis omics column(s).
print("Doing t-test comparison\n")
significant_pval_results = al.wrap_ttest(labeled_omics, 'Label', omics_col_list)
print(significant_pval_results)

ACVR2A did not match any columns in proteomics dataframe. ACVR2A_proteomics column inserted, but filled with NaN.
Doing t-test comparison

No significant comparisons.
None


## Step 6: Generate interacting proteins and test omics comparisons

Simply run this cell after following the instructions above to see the results of the comparisons

In [44]:
# Prevent the slice degrees-of-freedom warning raised during the t-tests.
# The filter is installed process-wide, so it also applies to later cells.
import warnings
warnings.filterwarnings("ignore")

# Reset first so a failed run cannot leave a stale result for the cells below.
wrap_results = None

try:
    print("\nGene: ", gene)

    # Proteins reported as interacting with the selected gene.
    interacting_proteins = al.get_interacting_proteins(gene)

    print("Generating interacting protein list")

    # Only compare proteins that actually have a column in the omics dataframe.
    # For the site-level tables a protein counts as present when some column starts
    # with "<protein>-". Plain string matching (rather than an unanchored regex)
    # keeps a short name from matching inside a longer gene name and avoids
    # treating characters of the protein name as regex syntax.
    omics_columns = [str(col) for col in omics.columns]
    omics_column_set = set(omics_columns)
    site_level = omics.name in ('phosphoproteomics', 'acetylproteomics')

    interacting_proteins_in_omics_df = []
    for ip in interacting_proteins:
        if site_level:
            found = any(col.startswith(ip + "-") for col in omics_columns)
        else:
            found = ip in omics_column_set
        if found:
            interacting_proteins_in_omics_df.append(ip)

    # Join the gene's mutation columns onto the interacting proteins' omics columns
    # and keep only the tumor samples.
    protdf = cancer_object.append_mutations_to_omics(
        mutation_genes=[gene],
        omics_df_name=omics.name,
        omics_genes=interacting_proteins_in_omics_df,
    )
    protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor']

    # Binary label for the t-test: anything that is not 'Wildtype_Tumor' is 'Mutated'.
    # Built in one vectorised step; assign() returns a new frame, so nothing is
    # written into a slice of the original table.
    status_col = gene + "_Mutation_Status"
    is_mutated = protdf[status_col].ne('Wildtype_Tumor')
    protdf = protdf.assign(Label=is_mutated.map({True: 'Mutated', False: 'Wildtype'}))

    # Drop the mutation/bookkeeping columns so only omics columns and Label remain.
    protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", status_col, "Sample_Status"])
    print(protdf.columns)

    # Columns to compare with t-tests: everything except the label.
    col_list = [col for col in protdf.columns if col != 'Label']

    print("Doing t-test comparisons\n")

    wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

    # wrap_ttest yields None when nothing is significant; only print real results.
    if wrap_results is not None:
        print(wrap_results)
        print("\n\n")

except Exception as e:
    # Report what went wrong instead of hiding it; KeyboardInterrupt is no longer swallowed.
    print("Error in Comparison")
    print(e)


Gene:  ACVR2A
Generating interacting protein list
Index(['INHBA_proteomics', 'SMAD2_proteomics', 'SMAD3_proteomics',
       'SMAD4_proteomics', 'ENG_proteomics', 'SMAD1_proteomics',
       'SMAD5_proteomics', 'Label'],
      dtype='object')
Doing t-test comparisons

         Comparison   P_Value
0  SMAD4_proteomics  0.000005
1  SMAD2_proteomics  0.000729





## Step 7: Look at effect on all proteins

This will look at the effect of the selected gene mutation on all proteins, and report any significant results.

In [29]:
# Reset first so a failed or interrupted run cannot leave a stale result for the cells below.
wrap_results = None

try:
    print("\nGene: ", gene)

    # Candidate list: every column of the proteomics table.
    proteomics = cancer_object.get_proteomics()
    proteins = proteomics.columns
    #proteins = proteins[:250]

    print("Generating protein list")

    # Only compare proteins that have a column in the selected omics dataframe.
    # The previous version ran one regex scan over all omics columns per protein,
    # which is quadratic in the number of columns; here the set of matchable names
    # is built once and each protein is a constant-time lookup.
    omics_columns = [str(col) for col in omics.columns]
    if omics.name in ('phosphoproteomics', 'acetylproteomics'):
        # A protein matches when some column starts with "<protein>-", so collect
        # every column prefix that ends right before a '-'.
        matchable_names = set()
        for col in omics_columns:
            cut = col.find('-')
            while cut != -1:
                matchable_names.add(col[:cut])
                cut = col.find('-', cut + 1)
    else:
        matchable_names = set(omics_columns)

    interacting_proteins_in_omics_df = [ip for ip in proteins if str(ip) in matchable_names]

    # Join the gene's mutation columns onto the selected omics columns; tumors only.
    protdf = cancer_object.append_mutations_to_omics(
        mutation_genes=[gene],
        omics_df_name=omics.name,
        omics_genes=interacting_proteins_in_omics_df,
    )
    protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor']
    print(protdf.columns)

    # Binary label for the t-test: anything that is not 'Wildtype_Tumor' is 'Mutated'.
    # Built in one vectorised step; assign() returns a new frame, so nothing is
    # written into a slice of the original table.
    status_col = gene + "_Mutation_Status"
    is_mutated = protdf[status_col].ne('Wildtype_Tumor')
    protdf = protdf.assign(Label=is_mutated.map({True: 'Mutated', False: 'Wildtype'}))

    # Drop the mutation/bookkeeping columns so only omics columns and Label remain.
    protdf = protdf.drop(columns=[gene + "_Mutation", gene + "_Location", status_col, "Sample_Status"])
    print(protdf.columns)

    # Columns to compare with t-tests: everything except the label.
    col_list = [col for col in protdf.columns if col != 'Label']

    print("Doing t-test comparisons\n")

    wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

    # wrap_ttest yields None when nothing is significant; only print real results.
    if wrap_results is not None:
        print(wrap_results)
        print("\n\n")

except Exception as e:
    print("Error in Comparison")
    print(e)


Gene:  MUC5B
Generating protein list


KeyboardInterrupt: 

## Gene Set Enrichment on Significant Omics Genes

In [None]:
# NOTE(review): this cell needs `gp` (import gseapy as gp), which is commented out in
# the import cell at the top of the notebook; re-enable it before running.
#
# wrap_ttest returns None when nothing is significant (as the Step 5 output shows),
# so check for None before calling len().
if wrap_results is not None and len(wrap_results) > 0:
    # Column names that wrap_ttest reported as significant.
    all_gene_list = list(wrap_results['Comparison'])
    # Gene name = text before the first underscore (e.g. 'SMAD4_proteomics' -> 'SMAD4').
    gene_name_list = [comparison.split("_")[0] for comparison in all_gene_list]

    # Run the gseapy enrichment on the significant genes; the description is derived
    # from the selected gene rather than a fixed gene name.
    enrichment = gp.enrichr(gene_list = gene_name_list, description=gene + '_Impacted', gene_sets='KEGG_2016', outdir='test/enrichr_kegg',cutoff=.5)
    print(enrichment.res2d)

else:
    print("No significant comparisons found.")

## Plot Gene Set Enrichment Results

In [None]:
# Plot the significant results of the gene set enrichment analysis.
# NOTE(review): `barplot` comes from `from gseapy.plot import barplot`, which is
# commented out in the import cell at the top of the notebook; re-enable it first.
#
# wrap_results is None when no comparison was significant, so check for None
# before calling len().
if wrap_results is not None and len(wrap_results) > 0:
    barplot(enrichment.res2d, title=gene+ " Impacted Protein Enrichment Analysis")
else:
    print("No significant comparisons found.")