In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
#sys.path.append('C:\\Users\\brittany henderson\\GitHub\\GBM_for_CPTAC\\')
#import cis_functions as f

import cptac
import cptac.utils as u


In [2]:
def add_significance_col(results_df, num_comparisons):
    "bonferroni multiple hypothesis"""
    alpha = .05
    bonferroni_cutoff = alpha / num_comparisons
    
    pval = results_df['P_Value']
    if float(pval[0]) <= bonferroni_cutoff:
        results_df['Significant'] = True
    else: 
        results_df['Significant'] = False
    return results_df

def wrap_ttest_return_all(df, label_column, comparison_columns, total_tests, alpha=.05):
    try:
        #Verify precondition that label column exists and has exactly 2 unique values
        label_values = df[label_column].unique()
        if len(label_values) != 2:
            print("Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.")
            return None
        
        #Partition dataframe into two sets, one for each of the two unique values from the label column
        partition1 = df.loc[df[label_column] == label_values[0]]
        partition2 = df.loc[df[label_column] == label_values[1]]
        
        #Determine the number of real valued columns on which we will do t-tests
        #sites = len(comparison_columns.columns)
        number_of_comparisons = total_tests # ? phospho sites or num freq mut genes doing cis comp
        
        #Use a bonferroni correction to adjust for multiple testing by altering the p-value needed for acceptance
        bonferroni_cutoff = alpha/number_of_comparisons
        
        #Store all comparisons with their p-values in a dictionary
        all_comparisons = {}
        
        #Loop through each comparison column, perform the t-test, and determine whether it meets the significance cutoff'''
        for column in comparison_columns:
            stat, pval = scipy.stats.ttest_ind(partition1[column].dropna(axis=0), partition2[column].dropna(axis=0))
            all_comparisons[column] = pval
    
        #Sort dictionary to list smallest p-values first
        sorted_comparisons = sorted(all_comparisons.items(), key=lambda kv: kv[1])
        #Format as a dataframe and return to caller
        all_comparisons_df = pd.DataFrame.from_dict(sorted_comparisons)
        all_comparisons_df.columns = ['Comparison', 'P_Value']
        
                                               
        all_comparisons_sig_col = add_significance_col(all_comparisons_df, number_of_comparisons)
        return all_comparisons_sig_col
                                
    except:
        print("Incorrectly Formatted Dataframe!")
        return None


In [7]:
#cptac.download(dataset='ccrcc', version='0.0')
brain= cptac.Gbm()
desired_cutoff = 0.05
gene = 'RB1'

                                    

In [8]:
brain.list_data()

Below are the dataframes contained in this dataset:
	acetylproteomics
		Dimensions: (109, 18767)
	circular_RNA
		Dimensions: (99, 3670)
	clinical
		Dimensions: (115, 28)
	CNV
		Dimensions: (98, 19907)
	experimental_design
		Dimensions: (115, 8)
	gene_fusion
		Dimensions: (2090, 8)
	lipidomics
		Dimensions: (88, 582)
	metabolomics
		Dimensions: (87, 134)
	miRNA
		Dimensions: (87, 2883)
	phosphoproteomics
		Dimensions: (109, 101266)
	proteomics
		Dimensions: (109, 11141)
	somatic_mutation
		Dimensions: (5774, 3)
	transcriptomics
		Dimensions: (108, 60483)


In [17]:
clin_and_prot = brain.join_metadata_to_omics(metadata_df_name="clinical", omics_df_name="proteomics")
clin_and_prot



Name,Patient_ID,Sample_Tumor_Normal,age,gender,height,weight,bmi,country_of_origin,race,ethnicity,...,ZSCAN31_proteomics,ZSWIM8_proteomics,ZW10_proteomics,ZWILCH_proteomics,ZWINT_proteomics,ZXDC_proteomics,ZYG11B_proteomics,ZYX_proteomics,ZZEF1_proteomics,ZZZ3_proteomics
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S001,C3L-00104,Tumor,58.0,Male,188.00,115.00,32.54,United States,White,Not-Hispanic or Latino,...,,-0.047437,-0.105908,-0.347076,,0.459635,0.079452,-0.784983,-0.488441,0.167990
S002,C3L-00365,Tumor,59.0,Female,162.00,54.00,20.61,United States,White,Not-Hispanic or Latino,...,,0.161975,-0.213093,0.235571,,0.107421,0.048724,0.138403,-0.290141,0.405037
S003,C3L-00674,Tumor,45.0,Male,193.00,102.00,27.44,,White,Not-Hispanic or Latino,...,,-0.065534,-0.306717,0.879991,,0.883564,-0.172222,0.011876,-0.131889,-0.503581
S004,C3L-00677,Tumor,69.0,Female,164.00,52.00,19.32,,White,Not-Hispanic or Latino,...,-0.062127,-0.254535,0.463653,0.580230,0.503044,-0.604986,0.178077,-0.720059,-0.150197,-0.268715
S005,C3L-01040,Tumor,77.0,Female,170.00,70.00,24.22,Russia,,,...,,-0.092502,0.010639,-0.465079,,-0.500083,0.112651,1.004660,-0.230304,-0.102416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S111,PT-RN5K,Normal,56.0,Female,162.56,49.90,18.88,,White,Not Hispanic or Latino,...,,0.298664,-0.403930,-0.482597,0.240966,,0.609314,-1.293464,0.225910,-0.013637
S112,PT-RU72,Normal,59.0,Female,162.56,63.05,23.86,,Asian,,...,,0.155988,0.002551,-0.834434,,-0.073908,0.637394,-0.795886,0.326046,-0.217014
S113,PT-UTHO,Normal,68.0,Male,182.88,62.87,18.80,,White,,...,,0.016857,-0.171858,-0.462233,,1.278683,0.616285,-0.734952,0.339283,-0.419138
S114,PT-WVLH,Normal,58.0,Male,182.88,90.72,27.12,,White,,...,0.450926,0.114686,-0.245149,-1.202774,-0.709850,-0.233022,0.732273,-1.158899,0.354608,-0.468881


In [22]:
#Read in files with TCGA subtypes
subtypes = pd.read_csv("/Users/Lindsey/Downloads/gbm_all_subtype_collections.2019-11-13.tsv", sep= "\t")
case_subtype = subtypes[['case','rna_wang_cancer_cell_2017']] #only need subtype and case
case_subtype = case_subtype.rename(columns = {"rna_wang_cancer_cell_2017": "TCGA_subtype", "case":"Patient_ID"})
case_subtype.head()

Unnamed: 0,Patient_ID,TCGA_subtype
0,C3L-00104,Proneural
1,C3L-00365,Classical
2,C3L-00674,Mesenchymal
3,C3L-00677,Proneural
4,C3L-01040,Classical


In [26]:
# merge tgca subtypes with proteomics and clincal df
prot_subtype= clin_and_prot.merge(case_subtype, on='Patient_ID')
prot_subtype = prot_subtype.drop([,3:8])
prot_subtype.head()

SyntaxError: invalid syntax (<ipython-input-26-287568225538>, line 3)