## Library Imports

Run this cell to import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import collections
import re
import gseapy as gp
from gseapy.plot import barplot, dotplot

import cptac.endometrial as en
import cptac.algorithms as al

Welcome to the cptac data service package. Available datasets may be
viewed using cptac.list_data(). In order to access a specific data
set, import a cptac subfolder using either 'import cptac.dataset' or
'from cptac import dataset'.
******
Version: 0.4.1
******
You have loaded the cptac endometrial dataset. To view available
dataframes, use cptac.endometrial.list_data(). To view available
functions for accessing and manipulating the dataframes, use
cptac.endometrial.list_api().
endometrial data version: 2.1

Loading Dictionary...
Loading cptac endometrial data:
Loading acetylproteomics data...
Loading clinical data...
Loading CNA data...
Loading miRNA data...
Loading phosphoproteomics_gene data...
Loading phosphoproteomics_site data...
Loading proteomics data...
Loading somatic data...
Loading somatic_binary data...
Loading transcriptomics_circular data...
Loading transcriptomics_linear data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available

## Get frequently mutated genes list

Enter the type of cancer and the cutoff for mutation frequency that you would like to use.

In [None]:
# User-tunable inputs: the cancer type name and the mutation-frequency cutoff,
# both handed to al.get_frequently_mutated below.
cancer_type = "Endometrial"
desired_cutoff = .15

# Ask cptac for the frequently mutated genes at this cutoff and print the result
# so a gene can be picked from it in the next step.
fm = al.get_frequently_mutated(cancer_type, cutoff=desired_cutoff)
print(fm)

## Select a gene from the list of frequently mutated genes above

Set `gene` to whichever of the frequently mutated genes listed above you want to examine. For example, to look at PTEN, change the cell below to `gene = 'PTEN'`.

In [2]:
# Gene whose mutation status drives every comparison in the cells below.
gene = 'ARID1A'

## Select desired omics comparison

Select which type of omics data you want to compare by leaving exactly one of the lines below uncommented. For example, to compare proteomics data, keep the "omics = en.get_proteomics()" line active and leave the other lines commented out.

In [3]:
# Omics dataframe used by all comparisons below.
# Leave exactly one of these assignments uncommented.
omics = en.get_proteomics()
#omics = en.get_transcriptomics()
#omics = en.get_phosphoproteomics()
#omics = en.get_acetylproteomics()

## CIS Comparison 

Does the mutated gene have a significant effect on the chosen omics data compared to the wildtype?

In [4]:
# Build the frame handed to wrap_ttest: the selected gene's own omics
# column(s) joined with that gene's mutation columns.
protdf = en.append_mutations_to_omics(mutation_genes=[gene], omics_df=omics, omics_genes=gene)

# Keep tumor samples only. The explicit copy makes the frame independent of
# the joined frame, so adding the Label column below is a plain assignment
# and not a write into a slice.
protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

# Binary grouping column for the t-test: every status other than
# 'Wildtype_Tumor' is treated as mutated.
protdf['Label'] = np.where(
    protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor',
    'Mutated',
    'Wildtype',
)

# Remove the bookkeeping columns so only omics values and Label remain.
protdf = protdf.drop(
    columns=[
        gene + "_Mutation",
        gene + "_Location",
        gene + "_Mutation_Status",
        "Sample_Status",
    ]
)

# Every remaining column except the grouping label is tested.
col_list = list(protdf.columns)
col_list.remove('Label')

# t-test comparing mutated vs wildtype samples for each omics column.
print("Doing t-test comparison\n")
gene_pval_df = al.wrap_ttest(protdf, 'Label', col_list)

# Report the outcome. The None case is handled explicitly so the reader gets
# a message instead of a bare "None".
if gene_pval_df is None:
    print("No significant comparisons found.")
else:
    print('Mutation of', gene, 'has a significant effect on', omics.name, 'of', gene, '\n')
    print(gene_pval_df)

Doing t-test comparison

Mutation of ARID1A has a significant effect on proteomics of ARID1A 

          Comparison       P_Value
0  ARID1A_proteomics  1.025018e-10


## Generate interacting proteins and test omics comparisons

Simply run this cell after following the instructions above to see the results of the comparisons

In [None]:
import warnings

# Defined before the try so that, if anything below fails, later cells see
# None instead of a missing name or a stale result from an earlier run.
wrap_results = None

try:
    # Silence warnings (e.g. the slice degrees-of-freedom warning) for this
    # cell only; the previous warning filters are restored on exit.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        print("\nGene: ", gene)

        # Proteins that cptac reports for the selected gene.
        interacting_proteins = al.get_interacting_proteins(gene)

        print("Generating interacting protein list")

        # Only compare proteins that have at least one column in the omics
        # dataframe. Membership in a precomputed set replaces a regex scan per
        # protein, so gene names are never interpreted as regex patterns.
        omics_columns = [str(col) for col in omics.columns]
        if omics.name == 'phosphoproteomics' or omics.name == 'acetylproteomics':
            # Site-level data: a protein matches when some column starts with
            # '<protein>-'. Collect every prefix that precedes a hyphen.
            genes_with_columns = {
                col[:pos]
                for col in omics_columns
                for pos, char in enumerate(col)
                if char == '-'
            }
        else:
            # Gene-level data: the column name must equal the protein name.
            genes_with_columns = set(omics_columns)

        interacting_proteins_in_omics_df = [
            ip for ip in interacting_proteins if ip in genes_with_columns
        ]

        # Build the frame handed to wrap_ttest: omics columns of the
        # interacting proteins joined with the selected gene's mutation columns.
        protdf = en.append_mutations_to_omics(
            mutation_genes=[gene],
            omics_df=omics,
            omics_genes=interacting_proteins_in_omics_df,
        )
        # Tumor samples only; copy so the Label assignment is not a slice write.
        protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

        # Binary grouping column: any status other than 'Wildtype_Tumor'
        # counts as mutated.
        protdf['Label'] = np.where(
            protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor',
            'Mutated',
            'Wildtype',
        )

        # Remove the bookkeeping columns so only omics values and Label remain.
        protdf = protdf.drop(
            columns=[
                gene + "_Mutation",
                gene + "_Location",
                gene + "_Mutation_Status",
                "Sample_Status",
            ]
        )
        print(protdf.columns)

        # Every remaining column except the grouping label is tested.
        col_list = list(protdf.columns)
        col_list.remove('Label')

        print("Doing t-test comparisons\n")

        # wrap_ttest result; None is treated as "nothing significant".
        wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

        if wrap_results is not None:
            print(wrap_results)
            print("\n\n")
        else:
            print("No significant comparisons found.")

except Exception as e:
    # Report what went wrong instead of hiding it; KeyboardInterrupt and
    # SystemExit are deliberately not caught.
    print("Error in Comparison")
    print(e)

## Look at effect on all proteins

This will look at the effect of the selected gene mutation on all proteins, and report any significant results.

In [None]:
import warnings

# Defined before the try so that, if anything below fails, the enrichment
# cells see None instead of a missing name or a stale result.
wrap_results = None

try:
    # Silence warnings for this cell only; the previous warning filters are
    # restored on exit.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        print("\nGene: ", gene)

        # Candidate list: every column of the proteomics dataframe.
        proteomics = en.get_proteomics()
        proteins = proteomics.columns
        # Uncomment to restrict the run to the first 250 proteins.
        #proteins = proteins[:250]

        print("Generating protein list")

        # Only compare proteins that have at least one column in the selected
        # omics dataframe. One set is built up front, so each protein costs a
        # single lookup rather than a regex scan over every column.
        omics_columns = [str(col) for col in omics.columns]
        if omics.name == 'phosphoproteomics' or omics.name == 'acetylproteomics':
            # Site-level data: a protein matches when some column starts with
            # '<protein>-'. Collect every prefix that precedes a hyphen.
            genes_with_columns = {
                col[:pos]
                for col in omics_columns
                for pos, char in enumerate(col)
                if char == '-'
            }
        else:
            # Gene-level data: the column name must equal the protein name.
            genes_with_columns = set(omics_columns)

        interacting_proteins_in_omics_df = [
            ip for ip in proteins if ip in genes_with_columns
        ]

        # Build the frame handed to wrap_ttest: omics columns of the selected
        # proteins joined with the selected gene's mutation columns.
        protdf = en.append_mutations_to_omics(
            mutation_genes=[gene],
            omics_df=omics,
            omics_genes=interacting_proteins_in_omics_df,
        )
        # Tumor samples only; copy so the Label assignment is not a slice write.
        protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

        # Binary grouping column: any status other than 'Wildtype_Tumor'
        # counts as mutated.
        protdf['Label'] = np.where(
            protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor',
            'Mutated',
            'Wildtype',
        )

        # Remove the bookkeeping columns so only omics values and Label remain.
        protdf = protdf.drop(
            columns=[
                gene + "_Mutation",
                gene + "_Location",
                gene + "_Mutation_Status",
                "Sample_Status",
            ]
        )
        print(protdf.columns)

        # Every remaining column except the grouping label is tested.
        col_list = list(protdf.columns)
        col_list.remove('Label')

        print("Doing t-test comparisons\n")

        # wrap_ttest result; None is treated as "nothing significant".
        wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

        if wrap_results is not None:
            print(wrap_results)
            print("\n\n")
        else:
            print("No significant comparisons found.")

except Exception as e:
    print("Error in Comparison")
    print(e)

## Gene Set Enrichment on Significant Omics Genes

In [None]:
# wrap_results is None when the comparison cell found nothing significant
# (the earlier cells check for exactly that), so test for None before len();
# len(None) would raise TypeError and the message below would never print.
if wrap_results is not None and len(wrap_results) > 0:
    # Comparison labels look like '<name>_<omics>' (e.g. 'ARID1A_proteomics');
    # the part before the first underscore is kept as the gene name.
    all_gene_list = list(wrap_results['Comparison'])
    gene_name_list = [comparison.split("_")[0] for comparison in all_gene_list]

    # Run Enrichr on the significant genes against the KEGG_2016 library.
    # The description follows the selected gene instead of being hard-coded.
    # NOTE(review): per the gseapy docs, `cutoff` appears to filter only the
    # terms drawn in the generated figure - confirm for the installed version.
    enrichment = gp.enrichr(
        gene_list=gene_name_list,
        description=gene + '_Impacted',
        gene_sets='KEGG_2016',
        outdir='test/enrichr_kegg',
        cutoff=.5,
    )
    print(enrichment.res2d)

else:
    print("No significant comparisons found.")

## Plot Gene Set Enrichment Results

In [None]:
# Plot the results of the gene set enrichment analysis.
# wrap_results is None when nothing significant was found, so check for None
# before len(); len(None) would raise TypeError instead of printing the message.
if wrap_results is not None and len(wrap_results) > 0:
    barplot(enrichment.res2d, title=gene + " Impacted Protein Enrichment Analysis")
else:
    print("No significant comparisons found.")