## Step 1: Library Imports

Run this cell to import the necessary libraries

In [90]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import collections
import re
import gseapy as gp
from gseapy.plot import barplot, dotplot

import cptac
import cptac.algorithms as al

## Step 2: Find the frequently mutated genes 

Enter the type of cancer and the cutoff for mutation frequency that you would like to use.

In [91]:
# Fraction of tumor samples that must carry a mutation for a gene to be reported.
desired_cutoff = .2

# Cancer dataset used by every later cell in this notebook.
cancer_object = cptac.Endometrial()

# Table of genes whose mutation frequency meets the cutoff.
fm = al.get_frequently_mutated(
    cancer_object,
    cutoff=desired_cutoff,
)
print(fm)

CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but analysis may not be
published until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter cptac.embargo() to open the webpage for more details.
     Gene  Unique_Samples_Mut  Missence_Mut  Truncation_Mut
0  ARID1A            0.452632      0.136842        0.400000
1    CTCF            0.284211      0.094737        0.242105
2  CTNNB1            0.305263      0.305263        0.000000
3   KMT2B            0.242105      0.115789        0.126316
4    KRAS            0.326316      0.326316        0.000000
5  PIK3CA            0.494737      0.484211        0.010526
6  PIK3R1            0.389474      0.231579        0.189474
7    PTEN            0.789474      0.463158        0.568421
8    TP53            0.221053      0.157895        0.073684
9   ZFHX3            0.221053    

## Step 3: Select a gene from the list of frequently mutated genes above

Set `gene` to whichever of the frequently mutated genes listed above you want to examine. For example, if you want to look at the gene PTEN, change the cell below to say "gene = 'PTEN'".

In [84]:
# Gene whose mutations are compared against omics measurements in the cells below.
# NOTE(review): 'TTN' is not in the frequently-mutated table printed in Step 2, and the
# Step 5 output reports no matching proteomics column for it - confirm this is the intended gene.
gene = "TTN"

## Step 4: Select desired omics comparison

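Select which type of omics you want to compare. Exactly one "omics = ..." line in the cell below should be uncommented: for example, to compare proteomics data for the interacting proteins, leave the "omics = cancer_object.get_proteomics()" line uncommented; to use another omics type, comment that line out and uncomment the corresponding line instead.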

In [85]:
# Omics dataframe used for every comparison below. Later cells read `omics.name`
# to build column names and to pick the column-matching pattern.
# Keep exactly one of these assignments uncommented.
omics = cancer_object.get_proteomics()
#omics = cancer_object.get_transcriptomics()
#omics = cancer_object.get_phosphoproteomics()
#omics = cancer_object.get_acetylproteomics()

## Step 5: cis comparison 

Determine if the DNA mutation has an effect on the omics measurement. In order to do this, we have a few steps in code.
1. get a table with both the omics and mutation data for tumors
2. get a binary column from the mutation data to separate our samples
3. format data frame to be used in the T-test
4. send data to the T-test.
5. visualize comparison

In [87]:
# Step 1 - Create dataframe in order to do comparisons with wrap_ttest - drop nan values?
omics_and_mutations = cancer_object.append_mutations_to_omics(mutation_genes = gene, 
                            omics_df_name = omics.name, omics_genes = gene).dropna()

# Check if values in omics data
if omics_and_mutations[gene+"_"+omics.name].empty:
    print('Not possible to do T-test.')

else:
    # Step 2 - Create the binary column needed to do the comparison
    omics_and_mutations['binary_mutations'] = np.where(
        omics_and_mutations[gene+'_Mutation_Status'] == 'Wildtype_Tumor', 'Wildtype', 'Mutated')

    # Step 3 - Format the dataframe correctly for the T-test(just omics and binary columns for tumors)
    tumors = omics_and_mutations.loc[omics_and_mutations['Sample_Status'] == 'Tumor']
    columns_to_drop = [gene+"_Mutation", gene+"_Location", gene+"_Mutation_Status", "Sample_Status"]
    omics_binary_mutations = tumors.drop(columns_to_drop, axis = 1)

    # Make a list of the column label of omics to be used in the wrap_ttest function
    omics_col_list = [omics_binary_mutations.columns[0]] 

    # Step 4 - T-test comparing means of mutated vs wildtype effect on cis omics
    print("Doing t-test comparison\n")
    significant_pval_results = al.wrap_ttest(omics_binary_mutations, 'binary_mutations', omics_col_list)
    if not significant_pval_results.empty:
        print(significant_pval_results)
    
    # Step 5 - Visualize comparison
    cis_boxplot = sns.boxplot(data = omics_binary_mutations, x = "binary_mutations",
                              y = gene+"_proteomics", order = ["Wildtype","Mutated"])  
    cis_boxplot.set_title(gene + " effect on " + omics.name + " abundance")
    cis_boxplot = sns.stripplot(data=omics_binary_mutations, x = "binary_mutations",
                                y = gene+"_proteomics",jitter = True, color = ".3", order = ["Wildtype","Mutated"])
    cis_boxplot.set(xlabel = gene + " Mutation Status in Tumors", ylabel = "Proteomics")
    plt.show()



TTN did not match any columns in proteomics dataframe. TTN_proteomics column inserted, but filled with NaN.
Not possible to do T-test.


## Step 6: Generate interacting proteins and test omics comparisons

Simply run this cell after following the instructions above to see the results of the comparisons

In [51]:
# Reset first so that, if this cell fails, later cells do not silently reuse
# results left over from a previous run or a different gene.
wrap_results = None

try:
    # Prevent slice degrees of freedom warning
    import warnings
    warnings.filterwarnings("ignore")

    print("\nGene: ", gene)

    # Use get interacting proteins method to generate list of interacting proteins
    interacting_proteins = al.get_interacting_proteins(gene)

    print("Generating interacting protein list")
    interacting_proteins_in_omics_df = []

    # Only do comparisons on proteins in the omics dataframe.
    # Both patterns are anchored so that a protein name only matches a column
    # that starts with it (same patterns as the all-proteins cell in Step 7).
    site_level_omics = omics.name in ('phosphoproteomics', 'acetylproteomics')
    for ip in interacting_proteins:
        if site_level_omics:
            col_regex = "^{}-.*$".format(ip)  # all columns of the form "<protein>-<something>"
        else:
            col_regex = '^{}$'.format(ip)

        selected = omics.filter(regex=col_regex)

        if len(selected.columns) > 0:
            interacting_proteins_in_omics_df.append(ip)

    # Create dataframe in order to do comparisons with wrap_ttest
    protdf = cancer_object.append_mutations_to_omics(
        mutation_genes=[gene], omics_df_name=omics.name,
        omics_genes=interacting_proteins_in_omics_df)
    # .copy() so the Label column is added to an independent frame, not to a slice
    protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

    # Create the binary valued column needed to do the comparison:
    # anything other than 'Wildtype_Tumor' counts as mutated.
    protdf['Label'] = np.where(
        protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

    # Format the dataframe correctly: keep only the omics columns and the label
    protdf = protdf.drop(
        columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])
    print(protdf.columns)

    # Make list of columns to be compared using t-tests
    col_list = [col for col in protdf.columns if col != 'Label']

    print("Doing t-test comparisons\n")

    # Call wrap_ttest, pass in formatted dataframe
    wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

    # Print results, if anything significant was found
    if wrap_results is not None:
        print(wrap_results)
        print("\n\n")

except Exception as e:
    # Catch Exception rather than using a bare except, so that interrupting the
    # kernel (KeyboardInterrupt) still works, and show what went wrong.
    print("Error in Comparison")
    print(e)


Gene:  PTEN
Generating interacting protein list
Index(['CSNK2A1_proteomics', 'PDGFRB_proteomics', 'PIK3R2_proteomics',
       'PIK3C3_proteomics', 'CSNK2A2_proteomics', 'SLC9A3R1_proteomics',
       'USP13_proteomics', 'PIK3CA_proteomics', 'TP53_proteomics',
       'EGFR_proteomics', 'PIK3CB_proteomics', 'PTK2_proteomics',
       'USP7_proteomics', 'XIAP_proteomics', 'PTEN_proteomics',
       'PIK3CD_proteomics', 'ROCK1_proteomics', 'SHC1_proteomics',
       'NEDD4_proteomics', 'INPP4B_proteomics', 'PIK3R1_proteomics',
       'AKT1_proteomics', 'MVP_proteomics', 'Label'],
      dtype='object')
Doing t-test comparisons

No significant comparisons.


## Step 7: Look at effect on all proteins

This will look at the effect of the selected gene mutation on all proteins, and report any significant results.

In [29]:
# Reset first so that, if this cell fails or is interrupted, later cells do not
# silently reuse results left over from a previous run or a different gene.
wrap_results = None

try:
    print("\nGene: ", gene)

    # Candidate list: every column of the proteomics dataframe
    proteomics = cancer_object.get_proteomics()
    proteins = proteomics.columns
    #proteins = proteins[:250]

    print("Generating protein list")

    # Only do comparisons on proteins in the omics dataframe.
    # Build the set of matchable names once and test membership, instead of
    # running one regex filter over every omics column for every protein.
    # Protein names are compared literally here (no regex interpretation).
    omics_column_names = [str(col) for col in omics.columns]
    if omics.name == 'phosphoproteomics' or omics.name == 'acetylproteomics':
        # A protein is present when some column looks like "<protein>-<anything>",
        # so collect the text before each hyphen of every column name.
        names_in_omics = set()
        for col_name in omics_column_names:
            hyphen_at = col_name.find("-")
            while hyphen_at != -1:
                names_in_omics.add(col_name[:hyphen_at])
                hyphen_at = col_name.find("-", hyphen_at + 1)
    else:
        # A protein is present when a column has exactly its name
        names_in_omics = set(omics_column_names)

    interacting_proteins_in_omics_df = [ip for ip in proteins if ip in names_in_omics]

    # Create dataframe in order to do comparisons with wrap_ttest
    protdf = cancer_object.append_mutations_to_omics(
        mutation_genes=[gene], omics_df_name=omics.name,
        omics_genes=interacting_proteins_in_omics_df)
    # .copy() so the Label column is added to an independent frame, not to a slice
    protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()
    print(protdf.columns)

    # Create the binary valued column needed to do the comparison:
    # anything other than 'Wildtype_Tumor' counts as mutated.
    protdf['Label'] = np.where(
        protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

    # Format the dataframe correctly: keep only the omics columns and the label
    protdf = protdf.drop(
        columns=[gene + "_Mutation", gene + "_Location", gene + "_Mutation_Status", "Sample_Status"])
    print(protdf.columns)

    # Make list of columns to be compared using t-tests
    col_list = [col for col in protdf.columns if col != 'Label']

    print("Doing t-test comparisons\n")

    # Call wrap_ttest, pass in formatted dataframe
    wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

    # Print results, if anything significant was found
    if wrap_results is not None:
        print(wrap_results)
        print("\n\n")

except Exception as e:
    print("Error in Comparison")
    print(e)


Gene:  MUC5B
Generating protein list


KeyboardInterrupt: 

## Gene Set Enrichment on Significant Omics Genes

In [None]:
# wrap_results is checked against None by the comparison cells above, so guard for
# None here before calling len() on it.
if wrap_results is not None and len(wrap_results) > 0:
    # Get a list of the omics genes deemed significant by wrap_ttest.
    # Comparison labels look like "<gene>_<omics>", so keep the text before the first underscore.
    all_gene_list = list(wrap_results['Comparison'])
    gene_name_list = [comparison.split("_")[0] for comparison in all_gene_list]

    # Use the gseapy library to run a gene set enrichment analysis on the resulting list of genes.
    # The description is derived from the selected gene rather than a fixed gene name.
    enrichment = gp.enrichr(gene_list=gene_name_list, description=gene + '_Impacted',
                            gene_sets='KEGG_2016', outdir='test/enrichr_kegg', cutoff=.5)
    print(enrichment.res2d)

else:
    print("No significant comparisons found.")

## Plot Gene Set Enrichment Results

In [None]:
# Plot the significant results of the gene set enrichment analysis.
# wrap_results is checked against None by the comparison cells above, so guard for
# None here before calling len() on it.
if wrap_results is not None and len(wrap_results) > 0:
    barplot(enrichment.res2d, title=gene + " Impacted Protein Enrichment Analysis")
else:
    print("No significant comparisons found.")