In [1]:
# Import blackSheepCPTACmodule and cptac data

import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt, sys
sys.path.insert(0, "/Users/lili/dropbox_lili/common_functions/outlier_analysis/")
import blackSheepCPTACmodule as ol
import importlib
importlib.reload(ol)
import cptac.endometrial as en

Welcome to the cptac data service package. Available datasets may be
viewed using cptac.list_data(). In order to access a specific data
set, import a cptac subfolder using either 'import cptac.dataset' or
'from cptac import dataset'.
******
Version: 0.4.1
******
You have loaded the cptac endometrial dataset. To view available
dataframes, use cptac.endometrial.list_data(). To view available
functions for accessing and manipulating the dataframes, use
cptac.endometrial.list_api().
endometrial data version: 2.1

Loading Dictionary...
Loading cptac endometrial data:
Loading acetylproteomics data...
Loading clinical data...
Loading CNA data...
Loading miRNA data...
Loading phosphoproteomics_gene data...
Loading phosphoproteomics_site data...
Loading proteomics data...
Loading somatic data...
Loading somatic_binary data...
Loading transcriptomics_circular data...
Loading transcriptomics_linear data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available

In [2]:
# Get data
# Both tables come from the cptac endometrial module imported as `en`.
transcriptomics = en.get_transcriptomics()
clinical = en.get_clinical()

# Check to see what options are in the clinical dataset
# (the column names listed here are the candidates for group comparisons below).
print(clinical.columns)

Index(['Patient_ID', 'Proteomics_Tumor_Normal', 'Country',
       'Histologic_Grade_FIGO', 'Myometrial_invasion_Specify',
       'Histologic_type', 'Treatment_naive', 'Tumor_purity',
       'Path_Stage_Primary_Tumor-pT', 'Path_Stage_Reg_Lymph_Nodes-pN',
       'Clin_Stage_Dist_Mets-cM', 'Path_Stage_Dist_Mets-pM',
       'tumor_Stage-Pathological', 'FIGO_stage', 'LVSI', 'BMI', 'Age',
       'Diabetes', 'Race', 'Ethnicity', 'Gender', 'Tumor_Site',
       'Tumor_Site_Other', 'Tumor_Focality', 'Tumor_Size_cm',
       'Num_full_term_pregnancies'],
      dtype='object')


In [3]:
# Create subset of clinical dataset to test for trends
# NOTE(review): BMI is passed through as raw values here; the comparison run
# reports groups such as "BMI 31.0", so it presumably needs to be collapsed
# into two groups before being used as an annotation -- confirm.
annotations = clinical[['Ethnicity','Diabetes', 'BMI']]

In [4]:
# Try out make_outliers_table: build the table of outliers for the
# transcriptomics data.
outliers = ol.make_outliers_table(
    transcriptomics,
    iqrs=2.0,
    up_or_down='up',
    aggregate=False,
    frac_table=False,
)

# Compare the outlier table against the selected clinical annotations and
# display the resulting table as the cell output.
results = ol.compare_groups_outliers(outliers, annotations, frac_filter=.1)
results

Testing 1462 rows for enrichment in Ethnicity Not-Hispanic or Latino samples
Testing 986 rows for enrichment in Ethnicity Not reported samples
Testing 1150 rows for enrichment in Diabetes No samples
Testing 1156 rows for enrichment in Diabetes Yes samples
Testing 1845 rows for enrichment in BMI 31.0 samples
Testing 2184 rows for enrichment in BMI 27.0 samples


Unnamed: 0,Ethnicity_Not-Hispanic or Latino_enrichment_FDR,Ethnicity_Not reported_enrichment_FDR,Diabetes_No_enrichment_FDR,Diabetes_Yes_enrichment_FDR,BMI_31.0_enrichment_FDR,BMI_27.0_enrichment_FDR
A1BG,,,,,,
A1BG-AS1,,,,,,
A1CF,,,,,,
A2M,,,,,,
A2M-AS1,,,,,,
A2ML1,,,,,,
A2MP1,,,,,,
A3GALT2,,,,,,
A4GALT,,,,,,
A4GNT,,,,,,


In [5]:
# Clinical columns listed as not yet binary (they still need to be collapsed
# into two groups before group comparisons).
non_binary_columns = ['Country', 'Histologic_Grade_FIGO',
                      'Proteomics_Tumor_Normal', 'Myometrial_invasion_Specify',
                      'Path_Stage_Primary_Tumor-pT', 'Path_Stage_Reg_Lymph_Nodes-pN',
                      'Clin_Stage_Dist_Mets-cM', 'Path_Stage_Dist_Mets-pM',
                      'tumor_Stage-Pathological', 'FIGO_stage', 'BMI', 'Age', 'Race',
                      'Ethnicity', 'Tumor_Site', 'Tumor_Site_Other', 'Tumor_Size_cm',
                      'Num_full_term_pregnancies']

# Work on an independent copy: the bookkeeping helper removes entries from
# left_to_test in place, and a plain assignment would alias the two names so
# that non_binary_columns would silently shrink as well.
left_to_test = list(non_binary_columns)

In [6]:
# Clinical columns listed as already binary (no relabelling applied to them
# in this notebook section).
already_binary_columns = [
    'Treatment_naive',
    'Diabetes',
    'Gender',
    'Tumor_Focality',
    'Histologic_type',
    'Tumor_purity',
    'LVSI',
]

In [7]:
# Version 5: Probably the best version so far since not everything is done in one function
# Works for Age, Race and BMI as expected. Functions

def binarizeCutOff(my_list, cut_off, replace_low, replace_high):
    """Label each value as below or at/above a numeric cut-off.

    Parameters:
        my_list: iterable of values comparable with ``cut_off``.
        cut_off: threshold; values strictly below it are "low", values equal
            to or above it are "high".
        replace_low: label used for values below ``cut_off``.
        replace_high: label used for values at or above ``cut_off``.

    Returns:
        A new list, in input order. A value for which neither comparison
        holds (e.g. NaN) is returned unchanged.
    """
    labelled = []
    for value in my_list:
        if value < cut_off:
            labelled.append(replace_low)
        elif value >= cut_off:
            labelled.append(replace_high)
        else:
            # Neither comparison is true (NaN): keep the original value.
            labelled.append(value)
    return labelled

def binarizeRange(my_list, low_bar, high_bar, in_range='In_Range', out_of_range='Out_Of_Range'):
    """Label each value as inside or outside the interval [low_bar, high_bar).

    The lower bound is inclusive and the upper bound exclusive, so a value
    exactly equal to a bound is always labelled. Previously both tests were
    strict and such values were returned as raw numbers in an otherwise
    categorical result. The convention mirrors binarizeCutOff, where the
    cut-off itself belongs to the upper group.

    Parameters:
        my_list: iterable of values comparable with the bounds.
        low_bar: inclusive lower bound of the range.
        high_bar: exclusive upper bound of the range.
        in_range: label for values with low_bar <= value < high_bar.
        out_of_range: label for values below low_bar or at/above high_bar.

    Returns:
        A new list, in input order. A value for which no comparison holds
        (e.g. NaN) is returned unchanged.
    """
    def _label(value):
        if low_bar <= value < high_bar:
            return in_range
        if value < low_bar or value >= high_bar:
            return out_of_range
        # No comparison is true (NaN): keep the missing value as is.
        return value

    return [_label(value) for value in my_list]

def binarizeCategorical(my_list, option1, option1_list, option2):
    """Collapse categorical values into ``option1`` versus ``option2``.

    Parameters:
        my_list: iterable of scalar category values.
        option1: label given to values found in ``option1_list``.
        option1_list: values that map to ``option1``.
        option2: label given to every other non-missing value.

    Returns:
        A new list, in input order. Missing values (NaN / None) are returned
        unchanged instead of being labelled.

    Note:
        Missing values are detected with ``pd.isna``. A comparison such as
        ``x == np.nan`` is always False because NaN never compares equal to
        anything, including itself; that is why an equality test here sent
        every missing value to ``option2``.
    """
    labelled = []
    for x in my_list:
        if x in option1_list:
            labelled.append(option1)
        elif pd.isna(x):
            # Keep missing entries missing rather than assigning a group.
            labelled.append(x)
        else:
            labelled.append(option2)
    return labelled

def binarizeCategoricalV2(my_list, option1, option1_list, option2, option2_list):
    """Collapse categorical values into two explicitly listed groups.

    Parameters:
        my_list: iterable of category values.
        option1: label given to values found in ``option1_list``.
        option1_list: values that map to ``option1`` (checked first).
        option2: label given to values found in ``option2_list``.
        option2_list: values that map to ``option2``.

    Returns:
        A new list, in input order. Values found in neither list are
        returned unchanged.
    """
    relabelled = []
    for value in my_list:
        if value in option1_list:
            relabelled.append(option1)
        elif value in option2_list:
            relabelled.append(option2)
        else:
            # Not covered by either group: pass the value through untouched.
            relabelled.append(value)
    return relabelled

# Function for testing purposes. Not really necessary in final code
def removeTested(list_of_vals, columns_to_remove):
    """Remove already-tested column names from a bookkeeping list, in place.

    Operates on the arguments it is given rather than on notebook globals,
    so it works for any pair of lists and does not depend on which cells
    have been run.

    Parameters:
        list_of_vals: list of column names still to be tested; mutated in
            place.
        columns_to_remove: column names that have now been tested. Names
            not present in ``list_of_vals`` are skipped silently.

    Returns:
        None. Prints one line per removed name, then the remaining list.
    """
    for col in columns_to_remove:
        if col in list_of_vals:
            print('Removing ' + col + ' from list.')
            list_of_vals.remove(col)
    print(list_of_vals)
    
# Example Uses for the functions one at a time
'''
BMI: new_df['BMI'] = binarizeRange(binary_columns['BMI'], 18.5, 25)
Age: new_df['Age'] = binarizeCutOff(binary_columns['Age'], 50, 'Young', 'Old')
Race: new_df['Race'] = binarizeCategoricalV2(binary_columns['Race'], 'European', ['White'], 'Not_European', ['Black or African American', 'Not Reported', 'Asian'])
Diabetes: new_df['Diabetes'] = binarizeCategoricalV2(binary_columns['Diabetes'], 'Diabetic', ['Yes'], 'Not_Diabetic', ['No'])
Histologic_Grade_FIGO: new_df['Histologic_Grade_FIGO'] = binarizeCategoricalV2(binary_columns['Histologic_Grade_FIGO'], 'Grade1', ['FIGO grade 1'], 'Not_Grade1', ['FIGO grade 2', 'FIGO grade 3'])
'''

"\nBMI: new_df['BMI'] = binarizeRange(binary_columns['BMI'], 18.5, 2)\nAge: new_df['Age'] = binarizeCutOff(binary_columns['Age'], 50 , 'Young', 'Old')\nRace: new_df['Race'] = binarizeCategorical(binary_columns['Race'], 'European', ['White'], 'Not_European', ['Black or African American', 'Not Reported', 'Asian'])\nDiabetes: new_df['Diabetes'] = binarizeCategorical(binary_columns['Diabetes'], 'Diabetic', ['Yes'], 'Not_Diabetic', ['No'])\nHistologic_Grade_FIGO: new_df['Histologic_Grade_FIGO'] = binarizeCategoricalV2(binary_columns['Histologic_Grade_FIGO'], 'Grade1', ['FIGO Grade 1'], 'Not_Grade1', ['FIGO Grade 2', 'FIGO Grade 3]\n"

In [8]:
# Choose columns to test
columns = ['Age', 'Race', 'BMI', 'Diabetes']
# Slice of the clinical table restricted to the chosen columns.
binary_columns = clinical[columns]
# Cross the chosen columns off the still-to-test list. 'Diabetes' is not in
# that list to begin with, so no removal is reported for it.
removeTested(left_to_test, columns)

Removing Age from list.
Removing Race from list.
Removing BMI from list.
['Country', 'Histologic_Grade_FIGO', 'Proteomics_Tumor_Normal', 'Myometrial_invasion_Specify', 'Path_Stage_Primary_Tumor-pT', 'Path_Stage_Reg_Lymph_Nodes-pN', 'Clin_Stage_Dist_Mets-cM', 'Path_Stage_Dist_Mets-pM', 'tumor_Stage-Pathological', 'FIGO_stage', 'Ethnicity', 'Tumor_Site', 'Tumor_Site_Other', 'Tumor_Size_cm', 'Num_full_term_pregnancies']


In [9]:
# Using the functions to binarize various different columns
binary_columns = clinical[['BMI', 'Age', 'Race', 'Diabetes']]

# assign() returns a new frame, so the selected clinical columns are left
# untouched and no explicit copy is needed.
new_df = binary_columns.assign(
    # BMI bounds 18.5 and 25 and the age cut-off of 50 are the thresholds
    # chosen for this analysis.
    BMI=binarizeRange(binary_columns['BMI'], 18.5, 25),
    Age=binarizeCutOff(binary_columns['Age'], 50, 'Young', 'Old'),
    Race=binarizeCategoricalV2(binary_columns['Race'], 'European', ['White'], 'Not_European',
                               ['Black or African American', 'Not Reported', 'Asian']),
    # Diabetes was selected above but never relabelled, which left raw
    # 'Yes'/'No' values in a frame whose recorded output shows
    # 'Diabetic'/'Not_Diabetic'.
    Diabetes=binarizeCategoricalV2(binary_columns['Diabetes'], 'Diabetic', ['Yes'],
                                   'Not_Diabetic', ['No']),
)

print(new_df)

# Group sizes per relabelled column.
print(new_df['BMI'].value_counts())
print('\n')
print(new_df['Age'].value_counts())
print('\n')
print(new_df['Race'].value_counts())
print('\n')
print(new_df['Diabetes'].value_counts())

                    BMI    Age          Race      Diabetes
Sample_ID                                                 
S001       Out_Of_Range    Old      European  Not_Diabetic
S002       Out_Of_Range    Old      European  Not_Diabetic
S003       Out_Of_Range    Old      European      Diabetic
S005       Out_Of_Range    Old      European  Not_Diabetic
S006           In_Range    Old      European  Not_Diabetic
S007       Out_Of_Range    Old      European  Not_Diabetic
S008       Out_Of_Range    Old      European  Not_Diabetic
S009           In_Range    Old      European  Not_Diabetic
S010       Out_Of_Range    Old      European      Diabetic
S011       Out_Of_Range    Old      European      Diabetic
S012       Out_Of_Range    Old      European      Diabetic
S014       Out_Of_Range  Young      European  Not_Diabetic
S016       Out_Of_Range    Old      European  Not_Diabetic
S017       Out_Of_Range    Old      European      Diabetic
S018       Out_Of_Range  Young      European      Diabet

In [12]:
# Choose columns to test
columns = ['Histologic_Grade_FIGO', 'Ethnicity']
binary_columns2 = clinical[columns]
removeTested(left_to_test, columns)

# Start from a copy of the selected columns, then relabel each one into two
# groups; values outside the listed categories (e.g. NaN) pass through.
new_df2 = binary_columns2.copy()
new_df2 = new_df2.assign(
    Histologic_Grade_FIGO=binarizeCategoricalV2(
        binary_columns2['Histologic_Grade_FIGO'],
        'Grade1', ['FIGO grade 1'],
        'Not_Grade1', ['FIGO grade 2', 'FIGO grade 3'],
    ),
    Ethnicity=binarizeCategoricalV2(
        binary_columns2['Ethnicity'],
        'Hispanic', ['Hispanic or Latino'],
        'Not_Hispanic', ['Not-Hispanic or Latino', 'Not reported'],
    ),
)

# Group sizes for every relabelled column, then the full frame.
for col in new_df2.columns:
    print(new_df2[col].value_counts())
    print('\n')

print(new_df2)

['Country', 'Proteomics_Tumor_Normal', 'Myometrial_invasion_Specify', 'Path_Stage_Primary_Tumor-pT', 'Path_Stage_Reg_Lymph_Nodes-pN', 'Clin_Stage_Dist_Mets-cM', 'Path_Stage_Dist_Mets-pM', 'tumor_Stage-Pathological', 'FIGO_stage', 'Tumor_Site', 'Tumor_Site_Other', 'Tumor_Size_cm', 'Num_full_term_pregnancies']
Not_Grade1    46
Grade1        37
Name: Histologic_Grade_FIGO, dtype: int64


Not_Hispanic    55
Hispanic         3
Name: Ethnicity, dtype: int64


          Histologic_Grade_FIGO     Ethnicity
Sample_ID                                    
S001                     Grade1  Not_Hispanic
S002                     Grade1  Not_Hispanic
S003                 Not_Grade1  Not_Hispanic
S005                 Not_Grade1  Not_Hispanic
S006                        NaN  Not_Hispanic
S007                     Grade1  Not_Hispanic
S008                 Not_Grade1  Not_Hispanic
S009                        NaN  Not_Hispanic
S010                     Grade1  Not_Hispanic
S011                     Grade1  Not

In [14]:
# Relabel Country (Ukraine vs. everything else listed) and
# Proteomics_Tumor_Normal (Tumor vs. the listed normal sample types).
binary_columns2 = clinical[['Country', 'Proteomics_Tumor_Normal']]
new_df2 = binary_columns2.copy()
new_df2 = new_df2.assign(
    Country=binarizeCategoricalV2(
        binary_columns2['Country'],
        'Ukraine', ['Ukraine'],
        'Other', ['United States', 'Other_specify', 'Poland'],
    ),
    Proteomics_Tumor_Normal=binarizeCategoricalV2(
        binary_columns2['Proteomics_Tumor_Normal'],
        'Tumor', ['Tumor'],
        'Other', ['Adjacent_normal', 'Enriched_normal', 'Myometrium_normal'],
    ),
)

print(new_df2)

          Country Proteomics_Tumor_Normal
Sample_ID                                
S001        Other                   Tumor
S002        Other                   Tumor
S003        Other                   Tumor
S005        Other                   Tumor
S006        Other                   Tumor
S007        Other                   Tumor
S008        Other                   Tumor
S009        Other                   Tumor
S010        Other                   Tumor
S011        Other                   Tumor
S012        Other                   Tumor
S014        Other                   Tumor
S016        Other                   Tumor
S017        Other                   Tumor
S018        Other                   Tumor
S019        Other                   Tumor
S020        Other                   Tumor
S021        Other                   Tumor
S022        Other                   Tumor
S023        Other                   Tumor
S024        Other                   Tumor
S025        Other                 