# Omics Cookbook

## Library Imports

Run this cell to import the necessary libraries

In [1]:
import collections
import re
import warnings

import numpy as np
import pandas as pd
import scipy.stats

import cptac.endometrial as en
import cptac.algorithms as al

Welcome to the cptac data service package. Available datasets may be
viewed using cptac.list_data(). In order to access a specific data
set, import a cptac subfolder using either 'import cptac.dataset' or
'from cptac import dataset'.
******
Version: 0.3.2
******
You have loaded the cptac endometrial dataset. To view available
dataframes, use cptac.endometrial.list_data(). To view available
functions for accessing and manipulating the dataframes, use
cptac.endometrial.list_api().
endometrial data version: 2.1

Loading Dictionary...
Loading cptac endometrial data:
Loading proteomics data...
Loading clinical data...
Loading acetylproteomics data...
Loading phosphoproteomics_site data...
Loading somatic data...
Loading transcriptomics_circular data...
Loading phosphoproteomics_gene data...
Loading transcriptomics_linear data...
Loading miRNA data...
Loading somatic_binary data...
Loading CNA data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available

## Get frequently mutated genes list

Enter the type of cancer and the cutoff for mutation frequency that you would like to use.

In [6]:
# Cancer type passed to get_frequently_mutated.
cancer_type = "Endometrial"

# Mutation-frequency cutoff, expressed as a fraction (0.15 == 15%).
desired_cutoff = 0.15

# Table of genes and their mutation frequency for the chosen cancer type.
fm = al.get_frequently_mutated(cancer_type, cutoff=desired_cutoff)

print(fm)

       Gene  Percent Mutated
0     RPL22         0.168421
1    ARID1A         0.452632
2    PIK3CA         0.494737
3     FBXW7         0.189474
4     RBM27         0.168421
5      NSD1         0.168421
6     SYNE1         0.168421
7     ZFHX4         0.200000
8      PTEN         0.789474
9      CTCF         0.284211
10   ZNF469         0.157895
11     TP53         0.221053
12    KMT2B         0.242105
13     KRAS         0.326316
14    LMAN1         0.157895
15    SCAF4         0.168421
16   PIK3R1         0.389474
17   CTNNB1         0.305263
18     PCLO         0.168421
19    KMT2D         0.178947
20    HERC2         0.178947
21    ZFHX3         0.221053
22     JAK1         0.168421
23    OBSCN         0.157895
24    AHNAK         0.157895
25  CCDC168         0.168421
26    DOCK3         0.200000
27    MUC16         0.189474
28    HUWE1         0.157895
29   DNAH17         0.157895
30   INPPL1         0.200000


## Select a gene from the list of frequently mutated genes above

Set `gene` to whichever of the frequently mutated genes listed above you want to examine. For example, to look at the gene PTEN, change the cell below to say `gene = 'PTEN'`.

In [7]:
# Gene whose mutation status splits the tumor samples in the comparison below.
gene = "ARID1A"

## Select desired omics comparison

Select the type of omics data you want to compare. For example, to compare proteomics data for the interacting proteins, uncomment the `omics = en.get_proteomics()` line and leave the other lines commented out. Exactly one line should be active.

In [8]:
# Omics dataframe used for the comparisons. Keep exactly one assignment
# uncommented; the alternatives below swap in a different data type.
# NOTE(review): the comparison cell switches to site-level column matching
# only when omics.name is 'phosphoproteomics' or 'acetylproteomics' --
# confirm the dataframes returned by these getters carry those names.
omics = en.get_proteomics()
#omics = en.get_transcriptomics()
#omics = en.get_phosphoproteomics()
#omics = en.get_acetylproteomics()

## Generate interacting proteins and test omics comparisons

Simply run this cell after following the instructions above to see the results of the comparisons

In [9]:
# Compare the selected omics measurements of the proteins that interact with
# `gene` between tumors where `gene` is mutated and tumors where it is wildtype.
# Reads `gene` and `omics` from the cells above. Any failure is reported with
# its cause instead of aborting the notebook.
try:
    # Prevent the slice degrees-of-freedom warning, but only while this cell
    # runs, so warnings raised by later cells stay visible.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        print("\nGene: ", gene)

        # Use the get_interacting_proteins method to generate the list of interacting proteins
        interacting_proteins = al.get_interacting_proteins(gene)

        print("Generating interacting protein list")

        # Only do comparisons on proteins present in the omics dataframe.
        # Site-level dataframes are matched on a literal "<gene>-" prefix;
        # everything else needs an exact column name. Plain string matching
        # keeps one gene from matching inside a longer column name and is not
        # affected by regex metacharacters in a gene name.
        site_level = omics.name == 'phosphoproteomics' or omics.name == 'acetylproteomics'
        column_names = [str(col) for col in omics.columns]
        exact_names = set(column_names)

        interacting_proteins_in_omics_df = []
        for ip in interacting_proteins:
            if site_level:
                site_prefix = ip + "-"
                found = any(name.startswith(site_prefix) for name in column_names)
            else:
                found = ip in exact_names

            if found:
                interacting_proteins_in_omics_df.append(ip)

        # Create the dataframe needed to do comparisons with wrap_ttest
        protdf = en.append_mutations_to_omics(mutation_genes=[gene], omics_df=omics, omics_genes=interacting_proteins_in_omics_df)
        # .copy() so the Label column is written to an independent frame, not a slice
        protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor'].copy()

        # Create the binary valued column needed to do the comparison:
        # anything other than 'Wildtype_Tumor' counts as mutated.
        status_column = gene + "_Mutation_Status"
        protdf['Label'] = np.where(protdf[status_column] != 'Wildtype_Tumor', 'Mutated', 'Wildtype')

        # Format the dataframe: drop the bookkeeping columns so that only the
        # omics columns and the label remain.
        protdf = protdf.drop(
            columns=[gene + "_Mutation", gene + "_Location", status_column, "Sample_Status"]
        )

        # Make list of columns to be compared using t-tests
        col_list = [col for col in protdf.columns if col != 'Label']

        print("Doing t-test comparisons\n")

        # Call wrap_ttest, pass in formatted dataframe
        wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

        # Print results, if anything significant was found
        if wrap_results is not None:
            print(wrap_results)
            print("\n\n")
        else:
            print("No significant comparisons found")

except Exception as err:
    # Best effort: keep the notebook running, but say what actually went wrong.
    print("Error in Comparison: {}: {}".format(type(err).__name__, err))


Gene:  ARID1A
Generating interacting protein list
Doing t-test comparisons

           Comparison       P_Value
0   ARID1A_proteomics  1.025018e-10
1     DPF2_proteomics  2.166609e-07
2  SMARCB1_proteomics  6.885135e-05
3  SMARCC2_proteomics  8.270473e-05
4  SMARCE1_proteomics  1.498027e-04
5  SMARCD1_proteomics  1.744915e-04
6    BCL7C_proteomics  4.379537e-04
7    CCND1_proteomics  1.499832e-03



