In [11]:
# Import blackSheepCPTACmodule and cptac data
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt, sys
sys.path.insert(0, "/Users/lili/dropbox_lili/common_functions/outlier_analysis/")
import blackSheepCPTACmodule as ol
import importlib
importlib.reload(ol)
import cptac

In [30]:
# Get data
en = cptac.Endometrial()
proteomics = en.get_proteomics()
transcriptomics = en.get_transcriptomics()
clinical = en.get_clinical()

You have loaded the cptac endometrial dataset. To view available
dataframes, call the dataset's list_data() method. To view available
functions for accessing and manipulating the dataframes, call its
list_api() method.
endometrial data version: 2.1

Loading acetylproteomics data...
Loading clinical data...
Loading CNA data...
Loading definitions data...
Loading miRNA data...
Loading phosphoproteomics_gene data...
Loading phosphoproteomics_site data...
Loading proteomics data...
Loading somatic data...
Loading somatic_binary data...
Loading transcriptomics_circular data...
Loading transcriptomics_linear data...

Loading definitions...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter cptac.embarg

In [None]:
# Check to see what options are in the clinical dataset
print(clinical.columns)

In [6]:
# Create subset of clinical dataset to test for trends
annotations = clinical[['Ethnicity','Diabetes', 'BMI']]

In [10]:
# Test of the make_outliers_table function to create a table of outliers
# in the transcriptomics data.
outliers = ol.make_outliers_table(transcriptomics, iqrs=2.0, 
                                  up_or_down='up', aggregate=False, 
                                  frac_table=False)

results = ol.compare_groups_outliers(outliers, annotations, frac_filter=.1)
results.head()

Testing 1462 rows for enrichment in Ethnicity Not-Hispanic or Latino samples
Testing 986 rows for enrichment in Ethnicity Not reported samples
Testing 1150 rows for enrichment in Diabetes No samples
Testing 1156 rows for enrichment in Diabetes Yes samples
Testing 1845 rows for enrichment in BMI 31.0 samples
Testing 2184 rows for enrichment in BMI 27.0 samples


Unnamed: 0,Ethnicity_Not-Hispanic or Latino_enrichment_FDR,Ethnicity_Not reported_enrichment_FDR,Diabetes_No_enrichment_FDR,Diabetes_Yes_enrichment_FDR,BMI_31.0_enrichment_FDR,BMI_27.0_enrichment_FDR
A1BG,,,,,,
A1BG-AS1,,,,,,
A1CF,,,,,,
A2M,,,,,,
A2M-AS1,,,,,,


In [None]:
#This cell is for reference so that I don't have to keep looking up which columns
#are binary and which aren't for testing.
non_binary_columns = ['Country', 'Histologic_Grade_FIGO', 
                      'Proteomics_Tumor_Normal', 'Myometrial_invasion_Specify', 
                      'Path_Stage_Primary_Tumor-pT', 'Path_Stage_Reg_Lymph_Nodes-pN', 
                      'Clin_Stage_Dist_Mets-cM', 'Path_Stage_Dist_Mets-pM', 
                      'tumor_Stage-Pathological', 'FIGO_stage', 'BMI', 'Age', 'Race', 
                      'Ethnicity', 'Tumor_Site', 'Tumor_Site_Other', 'Tumor_Size_cm', 
                      'Num_full_term_pregnancies']

already_binary_columns = ['Treatment_naive', 'Diabetes', 'Gender', 
                          'Tumor_Focality', 'Histologic_type', 
                          'Tumor_purity', 'LVSI', ]

In [24]:
#Binarization Functions:
#These are potentially overcomplicated, and don't return
#data frames. Try doing this in a more pandas-centric way
#and return some data frames.

def binarizeCutOff(column, cut_off, replace_low, replace_high):
    """Input parameters:
           
           column: 
               A column in the clinical dataframe
           
           cut_off:
               The bar at which you compare the original
               samples in the dataframe. For example, you 
               may set a cut_off of 65 for the clinical['Age'] 
               column to separate Retired and Working individuals. 
           
           replace_low: 
               The value that will replace original values
               lower than cut_off in the chosen column.
           
           replace_high: 
               The value that will replace original values
               greater than cut_off in the chosen column.
       
       Method description:
           This should replace the original value in the 
           clinical dataframe with replace_low if the original
           value is lower than cut_off. Otherwise, it will
           replace the original value with replace_high if the
           original value is greater than or equal to cut_off.
           Otherwise, it will not change the original value. 
           Theoretically, it should only return the unchanged,
           original value if that value is 'NaN'.
       
       Return Value:
           The return value should be a list of values with two
           options, either replace_high or replace_low. This
           function is for columns with continuous variables, 
           such as integers or floats
    """
    return [ replace_low if (x < cut_off) else replace_high if x >= cut_off else x for x in column ]

def binarizeRange(column, low_cut_off, high_cut_off, in_range='In_Range', out_of_range='Out_Of_Range'):
    """Input parameters:
           
           column: 
               A column in the clinical dataframe
           
           low_cut_off:
               The lowest value of a researchers's specified
               range. For example, a value of 18.5 for BMI
               could be the function's low_cut_off.
           
           high_cut_off: 
               The highest value of a researchers's specified
               range. For example, a value of 24.9 for BMI
               could be the function's high_cut_off.
                      
           in_range: 
               The value that will replace original values
               within the specified range between low_cut_off
               and high_cut_off.
               
           out_of_range:
               The value that will replace original values
               outside of the specified range between
               low_cut_off and high_cut_off.
       
       Method description:
           This should replace the original value in the 
           clinical dataframe with in_range if the original
           value is within the range from low_cut_off to 
           high_cut_off. Otherwise, it will replace the 
           original value with out_of_range if the original 
           value is not within that range. Otherwise, it 
           will not change the original value. It should only 
           return the unchanged, original value if it is 'NaN'.
       
       Return Value:
           The return value should be a list of values with
           two options, either in_range or out_of_range. This
           function is for columns with continuous variables, 
           such as integers or floats
    """
    return [ in_range if (x >= low_cut_off and x <= high_cut_off) else out_of_range  
            if (x < low_cut_off or x > high_cut_off) else x for x in column ]

def binarizeCategorical(column, option1, option1_list, option2, option2_list):
    """Input parameters:
           
           column: 
               A column in the clinical dataframe
           
           option1:
               This is the value that will replace original
               values that are within your option1_list.
           
           option1_list: 
               This is the list of values that the original
               value will be compared against. If that value
               is within option1_list, it will be replaced
               by option1.
           
           option2:
               This is the value that will replace original
               values that are within your option2_list.
           
           option2_list: 
               This is the list of values that the original
               value will be compared against. If that value
               is within option2_list, it will be replaced
               by option2.
       
       Method description:
           This function is used for separating columns with 
           multiple categorical values, such as 'Tumor_Site_Other',
           into two options, into one of two user-defined groups.
           
           It should replace the original value in the 
           clinical dataframe with option1 if the original
           value is found in option1_list. Otherwise, it will
           replace the original value with option2 if the
           original value in option2_list. Otherwise, it will 
           not change the original value. It should only return 
           the unchanged, original value if that value is 'NaN'.
       
       Return Value:
           The return value should be a list of values with two
           options, either option1 or option2. This function is 
           for columns with categorical variables, such as 
           clinical['Histologic_grade_FIGO'] or clinical['Race'].
    """
    return [ option1 if (x in option1_list) else option2 if (x in option2_list) else x for x in column ]

In [70]:
# Choose columns to test
subset = clinical[['BMI', 'Age', 'Race']]

#Using the functions to binarize various different columns
#This is really complicated. Maybe I should just do the dictionary we started with
new_df = subset.copy()
BMI = binarizeRange(subset['BMI'], 18.5, 25)
age = binarizeCutOff(subset['Age'], 50, 'Young', 'Old')
race = binarizeCategorical(subset['Race'], 'European', ['White'], 'Not_European', 
                           ['Black or African American', 'Not Reported', 'Asian'])
                     
new_df['BMI'] = BMI
new_df['Age'] = age
new_df['Race'] = race
new_df.head()

Unnamed: 0_level_0,BMI,Age,Race
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S001,Out_Of_Range,Old,European
S002,Out_Of_Range,Old,European
S003,Out_Of_Range,Old,European
S005,Out_Of_Range,Old,European
S006,In_Range,Old,European


In [75]:
#This is my attempt at a more pandas-centric method for the functions.
#So far, it writes over the NaNs, when it shouldn't be.
binary_columns = clinical[['Race', 'Age', 'BMI']]

new_df = binary_columns.copy()

#Cutoffs

'''
This mostly works, but struggles with np.nan
'''

#print(new_df['Age'].value_counts())
new_df['Age'] = np.where(new_df['Age'] >= 60.0, 'Old', 'Young')
#new_df.loc[new_df['Age'] < 60.0] = 1
#new_df.loc[new_df['Age'] == 0] = 'Old'
#new_df.loc[new_df['Age'] == 1] = 'Young'
print(new_df['Age'].value_counts())


#Range
#new_df.loc[df['First Season'] > 1990 & df['First Season'] < 2003, 'First Season'] = 1
#df[(df < 0) & (df >= -0.2)] = 0

'''
This mostly works, but struggles with np.nan
new_df = binary_columns['BMI'].copy()
new_df[(new_df >= 18) & (new_df < 25)] = 'Healthy_Range'
new_df[(new_df == np.nan)] = np.nan
new_df[(new_df != 'Healthy_Range') & (new_df != np.nan)] = 'Unhealthy_Range'
print(new_df)
'''
#Categorical
#allowed_vals = ['White']

#new_df.loc[~new_df['Race'].isin(allowed_vals), 'Race'] = 'Not_European'
#new_df.loc[new_df['Race'].isin(allowed_vals), 'Race'] = 'European'
#print(new_df['Race'].value_counts())

Young    80
Old      64
Name: Age, dtype: int64


"\nThis mostly works, but struggles with np.nan\nnew_df = binary_columns['BMI'].copy()\nnew_df[(new_df >= 18) & (new_df < 25)] = 'Healthy_Range'\nnew_df[(new_df == np.nan)] = np.nan\nnew_df[(new_df != 'Healthy_Range') & (new_df != np.nan)] = 'Unhealthy_Range'\nprint(new_df)\n"