In [11]:
# Import blackSheepCPTACmodule and cptac data
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt, sys
sys.path.insert(0, "/Users/lili/dropbox_lili/common_functions/outlier_analysis/")
import blackSheepCPTACmodule as ol
import importlib
importlib.reload(ol)
import cptac

In [12]:
# Get data
en = cptac.Endometrial()
transcriptomics = en.get_transcriptomics()
clinical = en.get_clinical()

You have loaded the cptac endometrial dataset. To view available
dataframes, call the dataset's list_data() method. To view available
functions for accessing and manipulating the dataframes, call its
list_api() method.
endometrial data version: 2.1

Loading acetylproteomics data...
Loading clinical data...
Loading CNA data...
Loading definitions data...
Loading miRNA data...
Loading phosphoproteomics_gene data...
Loading phosphoproteomics_site data...
Loading proteomics data...
Loading somatic data...
Loading somatic_binary data...
Loading transcriptomics_circular data...
Loading transcriptomics_linear data...

Loading definitions...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter cptac.embarg

In [None]:
# Check to see what options are in the clinical dataset
print(clinical.columns)

In [6]:
# Create subset of clinical dataset to test for trends
annotations = clinical[['Ethnicity','Diabetes', 'BMI']]

In [10]:
# Test of the make_outliers_table function to create a table of outliers
# in the transcriptomics data.
outliers = ol.make_outliers_table(transcriptomics, iqrs=2.0, 
                                  up_or_down='up', aggregate=False, 
                                  frac_table=False)

results = ol.compare_groups_outliers(outliers, annotations, frac_filter=.1)
results.head()

Testing 1462 rows for enrichment in Ethnicity Not-Hispanic or Latino samples
Testing 986 rows for enrichment in Ethnicity Not reported samples
Testing 1150 rows for enrichment in Diabetes No samples
Testing 1156 rows for enrichment in Diabetes Yes samples
Testing 1845 rows for enrichment in BMI 31.0 samples
Testing 2184 rows for enrichment in BMI 27.0 samples


Unnamed: 0,Ethnicity_Not-Hispanic or Latino_enrichment_FDR,Ethnicity_Not reported_enrichment_FDR,Diabetes_No_enrichment_FDR,Diabetes_Yes_enrichment_FDR,BMI_31.0_enrichment_FDR,BMI_27.0_enrichment_FDR
A1BG,,,,,,
A1BG-AS1,,,,,,
A1CF,,,,,,
A2M,,,,,,
A2M-AS1,,,,,,


In [None]:
non_binary_columns = ['Country', 'Histologic_Grade_FIGO', 
                      'Proteomics_Tumor_Normal', 'Myometrial_invasion_Specify', 
                      'Path_Stage_Primary_Tumor-pT', 'Path_Stage_Reg_Lymph_Nodes-pN', 
                      'Clin_Stage_Dist_Mets-cM', 'Path_Stage_Dist_Mets-pM', 
                      'tumor_Stage-Pathological', 'FIGO_stage', 'BMI', 'Age', 'Race', 
                      'Ethnicity', 'Tumor_Site', 'Tumor_Site_Other', 'Tumor_Size_cm', 
                      'Num_full_term_pregnancies']
left_to_test = non_binary_columns

In [6]:
already_binary_columns = ['Treatment_naive', 'Diabetes', 'Gender', 
                          'Tumor_Focality', 'Histologic_type', 
                          'Tumor_purity', 'LVSI', ]


In [8]:
#Binarization Functions:

def binarizeCutOff(column, cut_off, replace_low, replace_high):
    """Input parameters:
           
           column: 
               A column in the clinical dataframe
           
           cut_off:
               The bar at which you compare the original
               samples in the dataframe. For example, you 
               may set a cut_off of 65 for the clinical['Age'] 
               column to separate Retired and Working individuals. 
           
           replace_low: 
               The value that will replace original values
               lower than cut_off in the chosen column.
           
           replace_high: 
               The value that will replace original values
               greater than cut_off in the chosen column.
       
       Method description:
           This should replace the original value in the 
           clinical dataframe with replace_low if the original
           value is lower than cut_off. Otherwise, it will
           replace the original value with replace_high if the
           original value is greater than or equal to cut_off.
           Otherwise, it will not change the original value. 
           Theoretically, it should only return the unchanged,
           original value if that value is 'NaN'.
       
       Return Value:
           The return value should be a list of values with two
           options, either replace_high or replace_low. This
           function is for columns with continuous variables, 
           such as integers or floats
    """
    return [ replace_low if (x < cut_off) else replace_high if x >= cut_off else x for x in my_list ]

def binarizeRange(column, low_cut_off, high_cut_off, in_range='In_Range', out_of_range='Out_Of_Range'):
    """Input parameters:
           
           column: 
               A column in the clinical dataframe
           
           low_cut_off:
               The lowest value of a researchers's specified
               range. For example, a value of 18.5 for BMI
               could be the function's low_cut_off.
           
           high_cut_off: 
               The highest value of a researchers's specified
               range. For example, a value of 24.9 for BMI
               could be the function's high_cut_off.
                      
           in_range: 
               The value that will replace original values
               within the specified range between low_cut_off
               and high_cut_off.
               
           out_of_range:
               The value that will replace original values
               outside of the specified range between
               low_cut_off and high_cut_off.
       
       Method description:
           This should replace the original value in the 
           clinical dataframe with in_range if the original
           value is within the range from low_cut_off to 
           high_cut_off. Otherwise, it will replace the 
           original value with out_of_range if the original 
           value is not within that range. Otherwise, it 
           will not change the original value. It should only 
           return the unchanged, original value if it is 'NaN'.
       
       Return Value:
           The return value should be a list of values with
           two options, either in_range or out_of_range. This
           function is for columns with continuous variables, 
           such as integers or floats
    """
    return [ in_range if (x >= low_bar and x <= high_bar) else out_of_range  
            if (x < low_bar or x > high_bar) else x for x in column ]

def binarizeCategorical(column, option1, option1_list, option2, option2_list):
    """Input parameters:
           
           column: 
               A column in the clinical dataframe
           
           option1:
               This is the value that will replace original
               values that are within your option1_list.
           
           option1_list: 
               This is the list of values that the original
               value will be compared against. If that value
               is within option1_list, it will be replaced
               by option1.
           
           option2:
               This is the value that will replace original
               values that are within your option2_list.
           
           option2_list: 
               This is the list of values that the original
               value will be compared against. If that value
               is within option2_list, it will be replaced
               by option2.
       
       Method description:
           This function is used for separating columns with 
           multiple categorical values, such as 'Tumor_Site_Other',
           into two options, into one of two user-defined groups.
           
           It should replace the original value in the 
           clinical dataframe with option1 if the original
           value is found in option1_list. Otherwise, it will
           replace the original value with option2 if the
           original value in option2_list. Otherwise, it will 
           not change the original value. It should only return 
           the unchanged, original value if that value is 'NaN'.
       
       Return Value:
           The return value should be a list of values with two
           options, either option1 or option2. This function is 
           for columns with categorical variables, such as 
           clinical['Histologic_grade_FIGO'] or clinical['Race'].
    """
    return [ option1 if (x in option1_list) else option2 if (x in option2_list) else x for x in column ]

"\nBMI: new_df['BMI'] = binarizeRange(binary_columns['BMI'], 18.5, 2)\nAge: new_df['Age'] = binarizeCutOff(binary_columns['Age'], 50 , 'Young', 'Old')\nRace: new_df['Race'] = binarizeCategorical(binary_columns['Race'], 'European', ['White'], 'Not_European', ['Black or African American', 'Not Reported', 'Asian'])\nDiabetes: new_df['Diabetes'] = binarizeCategorical(binary_columns['Diabetes'], 'Diabetic', ['Yes'], 'Not_Diabetic', ['No'])\nHistologic_Grade_FIGO: new_df['Histologic_Grade_FIGO'] = binarizeCategoricalV2(binary_columns['Histologic_Grade_FIGO'], 'Grade1', ['FIGO Grade 1'], 'Not_Grade1', ['FIGO Grade 2', 'FIGO Grade 3]\n"

In [None]:
# Example Uses for the functions one at a time

BMI: new_df['BMI'] = binarizeRange(binary_columns['BMI'], 18.5, 24.9)
Age: new_df['Age'] = binarizeCutOff(binary_columns['Age'], 50 , 'Young', 'Old')
Race: new_df['Race'] = binarizeCategorical(binary_columns['Race'], 'European', ['White'], 'Not_European', ['Black or African American', 'Not Reported', 'Asian'])
Diabetes: new_df['Diabetes'] = binarizeCategorical(binary_columns['Diabetes'], 'Diabetic', ['Yes'], 'Not_Diabetic', ['No'])
Histologic_Grade_FIGO: new_df['Histologic_Grade_FIGO'] = binarizeCategoricalV2(binary_columns['Histologic_Grade_FIGO'], 'Grade1', ['FIGO Grade 1'], 'Not_Grade1', ['FIGO Grade 2', 'FIGO Grade 3]


In [9]:
# Choose columns to test
binary_columns = clinical[['BMI', 'Age', 'Race', 'Diabetes']]

Using the functions to binarize various different columns
new_df = binary_columns.copy()
new_df = new_df.assign(**{'BMI':binarizeRange(binary_columns['BMI'], 18.5, 25), 
                          'Age':binarizeCutOff(binary_columns['Age'], 50, 'Young', 'Old'), 
                          'Race':binarizeCategoricalV2(binary_columns['Race'], 'European', ['White'], 'Not_European',
                                                       ['Black or African American', 'Not Reported', 'Asian'])
                         }
                      )

print(new_df)

print(new_df['BMI'].value_counts()) 
print('\n')
print(new_df['Age'].value_counts()) 
print('\n')
print(new_df['Race'].value_counts())

                    BMI    Age          Race Diabetes
Sample_ID                                            
S001       Out_Of_Range    Old      European       No
S002       Out_Of_Range    Old      European       No
S003       Out_Of_Range    Old      European      Yes
S005       Out_Of_Range    Old      European       No
S006           In_Range    Old      European       No
S007       Out_Of_Range    Old      European       No
S008       Out_Of_Range    Old      European       No
S009           In_Range    Old      European       No
S010       Out_Of_Range    Old      European      Yes
S011       Out_Of_Range    Old      European      Yes
S012       Out_Of_Range    Old      European      Yes
S014       Out_Of_Range  Young      European       No
S016       Out_Of_Range    Old      European       No
S017       Out_Of_Range    Old      European      Yes
S018       Out_Of_Range  Young      European      Yes
S019       Out_Of_Range    Old      European       No
S020       Out_Of_Range    O

In [12]:
# Choose columns to test
columns = ['Histologic_Grade_FIGO', 'Ethnicity']
binary_columns2 = clinical[columns]
removeTested(left_to_test, columns)
new_df2 = clinical[columns].copy()

new_df2 = new_df2.assign(**{columns[0]:binarizeCategoricalV2(binary_columns2[columns[0]], 
                                                                        'Grade1', ['FIGO grade 1'], 'Not_Grade1', 
                                                                        ['FIGO grade 2', 'FIGO grade 3']), 
                          columns[1]:binarizeCategoricalV2(binary_columns2[columns[1]], 
                                                       'Hispanic', ['Hispanic or Latino'], 
                                                         'Not_Hispanic', ['Not-Hispanic or Latino', 'Not reported'])
                         }
                      )

for col in new_df2:
    print(new_df2[col].value_counts())
    print('\n')
    
print(new_df2)

['Country', 'Proteomics_Tumor_Normal', 'Myometrial_invasion_Specify', 'Path_Stage_Primary_Tumor-pT', 'Path_Stage_Reg_Lymph_Nodes-pN', 'Clin_Stage_Dist_Mets-cM', 'Path_Stage_Dist_Mets-pM', 'tumor_Stage-Pathological', 'FIGO_stage', 'Tumor_Site', 'Tumor_Site_Other', 'Tumor_Size_cm', 'Num_full_term_pregnancies']
Not_Grade1    46
Grade1        37
Name: Histologic_Grade_FIGO, dtype: int64


Not_Hispanic    55
Hispanic         3
Name: Ethnicity, dtype: int64


          Histologic_Grade_FIGO     Ethnicity
Sample_ID                                    
S001                     Grade1  Not_Hispanic
S002                     Grade1  Not_Hispanic
S003                 Not_Grade1  Not_Hispanic
S005                 Not_Grade1  Not_Hispanic
S006                        NaN  Not_Hispanic
S007                     Grade1  Not_Hispanic
S008                 Not_Grade1  Not_Hispanic
S009                        NaN  Not_Hispanic
S010                     Grade1  Not_Hispanic
S011                     Grade1  Not

In [14]:
binary_columns2 = clinical[['Country', 'Proteomics_Tumor_Normal']]
new_df2 = clinical[['Country', 'Proteomics_Tumor_Normal']].copy()
new_df2 = new_df2.assign(**{'Country':binarizeCategoricalV2(binary_columns2['Country'], 'Ukraine', 
                                                            ['Ukraine'], 'Other', 
                                                            ['United States', 
                                                             'Other_specify', 
                                                             'Poland']), 
                          'Proteomics_Tumor_Normal':binarizeCategoricalV2(binary_columns2['Proteomics_Tumor_Normal'], 
                                                         'Tumor', ['Tumor'], 
                                                         'Other', ['Adjacent_normal', 
                                                                   'Enriched_normal', 
                                                                   'Myometrium_normal'])
                         }
                      )

print(new_df2)

          Country Proteomics_Tumor_Normal
Sample_ID                                
S001        Other                   Tumor
S002        Other                   Tumor
S003        Other                   Tumor
S005        Other                   Tumor
S006        Other                   Tumor
S007        Other                   Tumor
S008        Other                   Tumor
S009        Other                   Tumor
S010        Other                   Tumor
S011        Other                   Tumor
S012        Other                   Tumor
S014        Other                   Tumor
S016        Other                   Tumor
S017        Other                   Tumor
S018        Other                   Tumor
S019        Other                   Tumor
S020        Other                   Tumor
S021        Other                   Tumor
S022        Other                   Tumor
S023        Other                   Tumor
S024        Other                   Tumor
S025        Other                 