# Omics Cookbook

## Library Imports

Run this cell to import the necessary libraries.

In [1]:
import collections
import re
import warnings

import numpy as np
import pandas as pd
import scipy.stats

import CPTAC.Endometrial as en
import CPTAC.Algorithms as al

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.1
******
You have loaded the CPTAC Endometrial dataset. To view available
dataframes, use CPTAC.Endometrial.list_data(). To view available
functions for accessing and manipulating the dataframes, use
CPTAC.Endometrial.list_api().
Endometrial Data Version: 2.1

Loading Dictionary...
Loading CPTAC Endometrial data:
Loading proteomics data...
Loading clinical data...
Loading acetylproteomics data...
Loading phosphoproteomics_site data...
Loading somatic data...
Loading definitions data...
Loading transcriptomics_circular data...
Loading phosphoproteomics_gene data...
Loading transcriptomics_linear data...
Loading somatic data...
Loading miRNA data...
Loading CNA data...

 ******PLEASE READ******
CPTAC is a community resource project and da

## Get frequently mutated genes list

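In a future version, this step will be simplified so that you only need to specify the cancer type and the mutation-frequency cutoff. For now, the cell below builds the required inputs by hand and uses a cutoff of 0.15.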

In [2]:
# Pull the endometrial tables this section works from.
somatic = en.get_mutations()
clinical = en.get_clinical()
prot = en.get_proteomics()

# Join PTEN mutation calls onto the PTEN proteomics column, then keep only
# the rows whose Sample_Status is 'Tumor'.
gene = 'PTEN'
omics_mutations = en.append_mutations_to_omics(
    mutation_genes=gene,
    omics_df=prot,
    omics_genes=gene,
)
tumor_rows = omics_mutations['Sample_Status'] == 'Tumor'
omics_mutations = omics_mutations.loc[tumor_rows]

# 0.15 is presumably the mutation-frequency cutoff mentioned in the section
# text above -- TODO confirm against al.get_frequently_mutated.
fm = al.get_frequently_mutated(somatic, omics_mutations, 0.15, show_percentage=True)

# Show one entry per line.
for fmg in fm:
    print(fmg)

ovarian
PTEN: %0.79
PIK3CA: %0.49
ARID1A: %0.45
PIK3R1: %0.39
KRAS: %0.33
CTNNB1: %0.31
CTCF: %0.28
KMT2B: %0.24
TP53: %0.22
ZFHX3: %0.22
ZFHX4: %0.20
DOCK3: %0.20
INPPL1: %0.20
FBXW7: %0.19
MUC16: %0.19
KMT2D: %0.18
HERC2: %0.18
RPL22: %0.17
RBM27: %0.17
NSD1: %0.17
SYNE1: %0.17
SCAF4: %0.17
PCLO: %0.17
JAK1: %0.17
CCDC168: %0.17
ZNF469: %0.16
LMAN1: %0.16
OBSCN: %0.16
AHNAK: %0.16
HUWE1: %0.16
DNAH17: %0.16


## Select a gene from the list of frequently mutated genes above

Set `gene` to whichever of the frequently mutated genes listed above you want to examine. For example, to look at PTEN, change the cell below to `gene = 'PTEN'`.

In [83]:
# Gene whose mutation status is used in the comparison cell further down.
# This replaces the 'PTEN' value assigned in the frequently-mutated-genes cell.
gene = 'ARID1A'

## Select desired omics comparison

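Select the type of omics data you want to compare by leaving exactly one of the lines in the cell below uncommented. For example, to compare proteomics data for the interacting proteins, keep the "omics = en.get_proteomics()" line active and leave the other lines commented out.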

In [84]:
# Omics dataframe read by the comparison cell (it uses both the columns and
# `omics.name`). Keep exactly one of these assignments active.
omics = en.get_proteomics()
#omics = en.get_transcriptomics()
#omics = en.get_phosphoproteomics()
#omics = en.get_acetylproteomics()

## Generate interacting proteins and test omics comparisons

After following the instructions above, run this cell to see the results of the comparisons.

In [85]:
# Compare omics measurements of the proteins that interact with `gene` between
# tumor samples where `gene` is mutated and tumor samples where it is wildtype.
#
# Reads `gene` and `omics` from the cells above. Prints the table returned by
# al.wrap_ttest when that result is not None. Any failure is reported together
# with the underlying exception instead of being silently swallowed.
try:
    # Suppress warnings (e.g. the slice degrees-of-freedom warning) only while
    # this comparison runs, rather than for the rest of the kernel session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        print("\nGene: ", gene)

        # Use the get-interacting-proteins method to generate the candidate list.
        interacting_proteins = al.get_interacting_proteins(gene)

        print("Generating interacting protein list")

        # For these two tables the original pattern ("<name>-.*") implies that
        # columns are named "<name>-<something>"; all other tables are matched
        # on the bare name.
        has_suffixed_columns = omics.name in ('phosphoproteomics', 'acetylproteomics')

        # Only do comparisons on proteins that have at least one column in the
        # omics dataframe.
        interacting_proteins_in_omics_df = []
        for ip in interacting_proteins:
            # DataFrame.filter applies the pattern with re.search, so anchor it
            # at the start to avoid matching inside a longer column name, and
            # escape the name so regex metacharacters are taken literally.
            if has_suffixed_columns:
                col_regex = '^{}-'.format(re.escape(ip))
            else:
                col_regex = '^{}$'.format(re.escape(ip))

            if len(omics.filter(regex=col_regex).columns) > 0:
                interacting_proteins_in_omics_df.append(ip)

        # Create the dataframe used for the wrap_ttest comparisons (tumor rows only).
        protdf = en.append_mutations_to_omics(
            mutation_genes=[gene],
            omics_df=omics,
            omics_genes=interacting_proteins_in_omics_df,
        )
        protdf = protdf.loc[protdf['Sample_Status'] == 'Tumor']

        # Create the binary-valued column needed for the comparison: every status
        # other than 'Wildtype_Tumor' counts as mutated. Computed for the whole
        # column at once and attached with assign(), which returns a new frame
        # instead of writing into the .loc slice row by row.
        is_mutated = protdf[gene + "_Mutation_Status"] != 'Wildtype_Tumor'
        protdf = protdf.assign(Label=np.where(is_mutated, 'Mutated', 'Wildtype'))

        # Drop the bookkeeping columns so only omics columns and 'Label' remain.
        protdf = protdf.drop(
            [
                gene + "_Mutation",
                gene + "_Location",
                gene + "_Mutation_Status",
                "Sample_Status",
            ],
            axis=1,
        )

        # Every remaining column except 'Label' is compared with a t-test.
        col_list = [col for col in protdf.columns if col != 'Label']

        print("Doing t-test comparisons\n")

        # Call wrap_ttest, passing in the formatted dataframe.
        wrap_results = al.wrap_ttest(protdf, 'Label', col_list)

        # Print results only when wrap_ttest returned something.
        if wrap_results is not None:
            print(wrap_results)
            print("\n\n")

except Exception as err:
    # Keep the notebook running, but say what actually went wrong.
    # (Exception, unlike a bare except, lets KeyboardInterrupt through.)
    print("Error in Comparison")
    print("{}: {}".format(type(err).__name__, err))


Gene:  ARID1A
Generating interacting protein list
Doing t-test comparisons

           Comparison       P_Value
0   ARID1A_proteomics  1.025018e-10
1     DPF2_proteomics  2.166609e-07
2  SMARCB1_proteomics  6.885135e-05
3  SMARCC2_proteomics  8.270473e-05
4  SMARCE1_proteomics  1.498027e-04
5  SMARCD1_proteomics  1.744915e-04
6    BCL7C_proteomics  4.379537e-04
7    CCND1_proteomics  1.499832e-03



