In [1]:
# Import the data/plotting stack, the local blackSheepCPTACmodule (as `ol`) and cptac.
import importlib
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# blackSheepCPTACmodule is imported from a local directory that has to be put on
# sys.path first. Set the BLACKSHEEP_DIR environment variable to point at your
# own copy; the original author's location is kept as the fallback so existing
# setups keep working unchanged.
BLACKSHEEP_DIR = os.environ.get(
    "BLACKSHEEP_DIR",
    "/Users/lili/dropbox_lili/common_functions/outlier_analysis/",
)
# Guarded so that re-running this cell does not keep growing sys.path.
if BLACKSHEEP_DIR not in sys.path:
    sys.path.insert(0, BLACKSHEEP_DIR)

import blackSheepCPTACmodule as ol
# Reload so edits made to the module file are picked up when this cell is re-run.
importlib.reload(ol)
import cptac

Welcome to the cptac data service package. To view available datasets,
enter cptac.list_data(). To access a specific data set, load the
dataset and assign it to a variable using 'cptac.NameOfDataset()',
e.g. 'en = cptac.Endometrial()'
******
Version: 0.5.0
******


In [None]:
#Binarization Functions:

def binarizeCutOff(df, column, cut_off, replace_high, replace_low):
    """Binarize a numeric column of df around a cut-off value.

       Input parameters:
           df:
               The Clinical DataFrame. It is only read, never modified.

           column:
               The name of the column in df to binarize.

           cut_off:
               The benchmark against which the original values are
               compared. For example, you may set a cut_off of 65 for
               the 'Age' column to separate Retired and Working
               individuals.

           replace_high:
               The value that will replace original values greater
               than or equal to cut_off.

           replace_low:
               The value that will replace original values lower
               than cut_off.

       Method description:
           Every value >= cut_off becomes replace_high and every other
           value becomes replace_low. Missing values stay missing:
           np.where() would label them replace_low (NaN >= cut_off is
           False), so they are masked back to NaN afterwards.

       Return Value:
           A pandas Series named `column`, with the same index as df,
           holding replace_high / replace_low / NaN.
    """
    values = df[column]

    # Work on the single column instead of copying the whole DataFrame.
    labels = pd.Series(np.where(values >= cut_off, replace_high, replace_low),
                       index=values.index, name=column)

    # Restore the entries that were missing in the input.
    return labels.where(values.notnull(), np.nan)

def binarizeRange(df, column, lower_bound, upper_bound, in_range='In_Range', out_of_range='Out_Of_Range'):
    """Binarize a numeric column of df by membership in a range.

       Input parameters:
           df:
               The Clinical DataFrame. It is only read, never modified.

           column:
               The name of the column in df to binarize.

           lower_bound:
               The lower end of the range (inclusive). For example,
               a value of 18.5 for BMI.

           upper_bound:
               The upper end of the range (exclusive): a value equal
               to upper_bound counts as out of range. For example,
               a value of 25 for BMI.

           in_range (default value = 'In_Range'):
               The value that will replace original values with
               lower_bound <= value < upper_bound.

           out_of_range (default value = 'Out_Of_Range'):
               The value that will replace every other original value.

       Method description:
           Values inside the half-open interval
           [lower_bound, upper_bound) become in_range, all others
           become out_of_range. Missing values stay missing:
           np.where() would label them out_of_range (comparisons with
           NaN are False), so they are masked back to NaN afterwards.
           Intended for columns with continuous variables, such as
           integers or floats.

       Return Value:
           A pandas Series named `column`, with the same index as df,
           holding in_range / out_of_range / NaN.
    """
    values = df[column]
    inside = (values >= lower_bound) & (values < upper_bound)

    # Work on the single column instead of copying the whole DataFrame.
    labels = pd.Series(np.where(inside, in_range, out_of_range),
                       index=values.index, name=column)

    # Restore the entries that were missing in the input.
    return labels.where(values.notnull(), np.nan)


def binarizeCategorical(df, column, dictionary):
    """
    Re-label a categorical column of df using a mapping dictionary.

    Input Parameters:

        df:
            The Clinical DataFrame. It is only read, never modified.
            Example: df = en.get_clinical()

        column:
            The name of the column in df to binarize.
            Example: column = 'Race'

        dictionary:
            The dictionary used to map df[column]: keys are the
            original responses, values are what replaces them.

            Example:
            my_dictionary = {'White':'European',
                             'Black or African American':'Not_European',
                             'Not Reported':'Not_European',
                             'Asian':'Not_European'}

    Method Description:
        Each value of df[column] is looked up in dictionary and
        replaced by the corresponding value. Values that are not a
        key of dictionary (including missing values) become NaN.

    Return Value:
        A pandas Series named `column`, with the same index as df,
        holding the replaced values.
    """
    # Return the mapped column directly; nothing outside the function is
    # touched, so the result does not depend on any notebook-level variable.
    return df[column].map(dictionary)

In [2]:
# Get data: load the cptac Endometrial dataset once, then pull out the three
# tables used in this notebook (proteomics, transcriptomics and clinical).
en = cptac.Endometrial()
proteomics = en.get_proteomics()
transcriptomics = en.get_transcriptomics()
clinical = en.get_clinical()

You have loaded the cptac endometrial dataset. To view available
dataframes, call the dataset's list_data() method. To view available
functions for accessing and manipulating the dataframes, call its
list_api() method.
endometrial data version: 2.1

Loading acetylproteomics data...
Loading clinical data...
Loading CNA data...
Loading definitions data...
Loading miRNA data...
Loading phosphoproteomics_gene data...
Loading phosphoproteomics_site data...
Loading proteomics data...
Loading somatic data...
Loading somatic_binary data...
Loading transcriptomics_circular data...
Loading transcriptomics_linear data...

Loading definitions...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter cptac.embarg

In [None]:
# Reference cell: hand-written lists of which columns in the clinical dataset
# are already binary and which are not, so they don't have to be looked up
# again while testing.
# NOTE(review): these lists are maintained by hand and nothing in this
# notebook checks them against clinical.columns -- confirm before relying on them.

# Columns that are not binary.
non_binary_columns = ['Country', 'Histologic_Grade_FIGO', 
                      'Proteomics_Tumor_Normal', 'Myometrial_invasion_Specify', 
                      'Path_Stage_Primary_Tumor-pT', 'Path_Stage_Reg_Lymph_Nodes-pN', 
                      'Clin_Stage_Dist_Mets-cM', 'Path_Stage_Dist_Mets-pM', 
                      'tumor_Stage-Pathological', 'FIGO_stage', 'BMI', 'Age', 'Race', 
                      'Ethnicity', 'Tumor_Site', 'Tumor_Site_Other', 'Tumor_Size_cm', 
                      'Num_full_term_pregnancies']

# Columns that are already binary.
already_binary_columns = ['Treatment_naive', 'Diabetes', 'Gender', 
                          'Tumor_Focality', 'Histologic_type', 
                          'Tumor_purity', 'LVSI', ]

In [3]:
# Create subset of clinical dataset to test for trends.
# 'BMI' is passed through with its raw numeric values here.
annotations = clinical[['Ethnicity','Diabetes', 'BMI']]

In [4]:
# Test of the make_outliers_table function to create a table of outliers
# in the transcriptomics data.
# NOTE(review): iqrs / up_or_down / aggregate / frac_table are passed straight
# to blackSheepCPTACmodule; presumably iqrs is the outlier threshold in
# interquartile ranges and up_or_down='up' selects high outliers -- confirm
# against the module's documentation.
outliers = ol.make_outliers_table(transcriptomics, iqrs=2.0, 
                                  up_or_down='up', aggregate=False, 
                                  frac_table=False)

# Compare the outlier table against the clinical annotations and show the
# first rows of the result.
results = ol.compare_groups_outliers(outliers, annotations, frac_filter=.1)
results.head()

Testing 1462 rows for enrichment in Ethnicity Not-Hispanic or Latino samples
Testing 986 rows for enrichment in Ethnicity Not reported samples
Testing 1150 rows for enrichment in Diabetes No samples
Testing 1156 rows for enrichment in Diabetes Yes samples
Testing 1845 rows for enrichment in BMI 31.0 samples
Testing 2184 rows for enrichment in BMI 27.0 samples


Unnamed: 0,Ethnicity_Not-Hispanic or Latino_enrichment_FDR,Ethnicity_Not reported_enrichment_FDR,Diabetes_No_enrichment_FDR,Diabetes_Yes_enrichment_FDR,BMI_31.0_enrichment_FDR,BMI_27.0_enrichment_FDR
A1BG,,,,,,
A1BG-AS1,,,,,,
A1CF,,,,,,
A2M,,,,,,
A2M-AS1,,,,,,


In [57]:
# Mapping used to collapse the 'Race' categories into two groups.
my_dictionary = {'White':'European',
                 'Black or African American':'Not_European',
                 'Not Reported':'Not_European',
                 'Asian':'Not_European'}

# Start from the three clinical columns that get binarized below. Building
# the frame from `clinical` here (rather than from a variable defined in
# some other, possibly deleted, cell) keeps this cell runnable on a fresh
# kernel.
new_df = clinical[['BMI', 'Age', 'Race']].copy()

# Using the functions to binarize various different columns
BMI = binarizeRange(clinical, 'BMI', 18, 25, 'Healthy', 'Unhealthy')
age = binarizeCutOff(clinical, 'Age', 60.0, 'Old', 'Young')
race = binarizeCategorical(clinical, 'Race', my_dictionary)

# Overwrite the raw columns with their binarized versions.
new_df['Race'] = race
new_df['BMI'] = BMI
new_df['Age'] = age

# Show the first rows of the binarized table.
new_df.head(10)

Unnamed: 0_level_0,BMI,Age,Race
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S001,Unhealthy,Old,European
S002,Unhealthy,Young,European
S003,Unhealthy,Young,European
S005,Unhealthy,Old,European
S006,Healthy,Old,European
S007,Unhealthy,Young,European
S008,Unhealthy,Old,European
S009,Healthy,Old,European
S010,Unhealthy,Old,European
S011,Unhealthy,Young,European
