## Use Case 3: Comparing BMI above and below 25 across the proteomics data

These are the tools we will use to play with the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

This is the data we will be playing with

In [2]:
import CPTAC

Loading Clinical Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Data...

 ******PLEASE READ******


Use case 3: BMI above and below 25
The first step is to load the clinical dataframe and the dataframe to compare it with

In [3]:
# Fetch the clinical and proteomics tables from the CPTAC package; both are
# passed to CPTAC.compare_clinical() further down in this notebook.
clinical = CPTAC.get_clinical()
proteomics = CPTAC.get_proteomics()

Next we will use the compare_clinical() function to create a dataframe that appends a column from the clinical dataframe to our chosen dataframe. First, we print the clinical column names to see which traits are available to choose from.

In [9]:
# List the clinical column names so that one can be picked as the trait to compare.
print(clinical.columns)

Index(['Proteomics_Aliquot_ID', 'Proteomics_Participant_ID',
       'Proteomics_TMT_batch', 'Proteomics_TMT_plex', 'Proteomics_TMT_channel',
       'Proteomics_Parent_Sample_IDs', 'Proteomics_Tumor_normal',
       'Proteomics_OCT', 'WXS_patient_id', 'WXS_Tumor_sample_id',
       'WXS_Tumor_file', 'WXS_Tumor_UUID', 'WXS_Tumor_type',
       'WXS_Normal_sample_id', 'WXS_Normal_file', 'WXS_Normal_UUID',
       'WXS_Normal_type', 'RNAseq_sample_id', 'RNAseq_patient_id',
       'RNAseq_sample_type', 'RNAseq_UUID_R1', 'RNAseq_filename_R1',
       'RNAseq_UUID_R2', 'RNAseq_filename_R2', 'methylation_sample_id',
       'Histologic_Grade_(FIGO)', 'Histologic_Type',
       'Num_full_term_pregnancies', 'Tumor_Size_(cm)', 'FIGO_stage',
       'Myometrial_invasion_Specify', 'tumor_Stage_Pathological', 'Diabetes',
       'BMI', 'LVSI', 'Endo_S1G1G2_LVSI', 'Age',
       'CIBERSORT-T_cells_CD4_memory_resting',
       'CIBERSORT-Dendritic_cells_resting',
       'CIBERSORT-T_cells_regulatory_(Tregs)', 'C

In [85]:
# Name of the clinical column that will be compared against the proteomics data.
trait = "CIBERSORT-T_cells_CD4_memory_activated"

In [86]:
# Combine the selected clinical column with the proteomics table. In the printed
# result the trait appears as the first column, followed by one column per
# protein, with one row per sample (idx).
traitProt = CPTAC.compare_clinical(clinical, proteomics, trait)
# NOTE(review): this prints the whole frame; pandas truncates the display, but
# traitProt.head() would be enough for a quick sanity check.
print(traitProt)

      CIBERSORT-T_cells_CD4_memory_activated  A1BG   A2M  A2ML1  A4GALT  AAAS  \
idx                                                                             
S001                                0.000000 -1.01 -0.81  -0.28    0.24  0.29   
S002                                0.000000 -0.51 -1.00  -0.99    1.50  0.18   
S003                                0.000000 -0.56 -1.33   0.64     NaN -0.26   
S004                                0.000000 -1.53 -1.19  -0.49    0.26 -0.03   
S005                                0.026120 -0.16  0.09   0.01    0.34  0.51   
S006                                0.000000 -1.03 -0.63  -0.04   -0.25 -0.09   
S007                                0.000000 -1.09 -0.60  -1.11    0.02  0.16   
S008                                0.000000 -0.29  0.51  -0.51     NaN  0.46   
S009                                0.002505 -0.93 -1.28   0.67    0.43 -0.05   
S010                                0.000000 -0.44 -0.87   2.83   -0.32  0.18   
S011                        

We can now check for genes whose protein abundance is significantly correlated with the selected clinical trait. Because one test is run per gene, we first tighten the significance threshold (a Bonferroni correction) to account for the large number of tests being performed.

In [88]:
# Correlate the chosen clinical trait with every protein column in traitProt and
# keep the genes whose Pearson correlation is both strong and significant.
#
# Inputs (defined in earlier cells): traitProt, trait.
# Outputs: threshold, tscutoff, significantTests, significantGenes.

# The first column of traitProt is the trait itself (the scan has always started
# at column 1); every remaining column is one gene and therefore one test.
geneColumns = traitProt.columns[1:]

# Bonferroni correction: divide alpha by the number of tests actually run.
# The count is taken from traitProt so this cell works on a fresh kernel rather
# than depending on a dataframe that is not defined anywhere in the notebook.
# max(..., 1) avoids a division by zero if there are no gene columns at all.
threshold = .05 / max(len(geneColumns), 1)
tscutoff = 0.5  # minimum |r| for a correlation to be reported
print("Threshold:", threshold)

significantTests = []
significantGenes = []
for gene in geneColumns:
    # Only samples that have both a trait value and a measurement for this
    # protein can contribute to the correlation.
    oneGene = traitProt[[trait, gene]].dropna(axis=0)
    if len(oneGene) < 2:
        # pearsonr raises when given fewer than two paired observations.
        continue
    pearsonrTest = stats.pearsonr(oneGene[trait], oneGene[gene])
    correlation, pvalue = pearsonrTest
    # A constant column yields NaN here; NaN fails both comparisons and is skipped.
    if (abs(correlation) >= tscutoff) and (pvalue <= threshold):
        print(pearsonrTest)
        significantTests.append(pearsonrTest)
        significantGenes.append(gene)

print(len(significantGenes))
print(significantGenes)

Threshold: 5.215939912372209e-06
(0.5954546898804134, 6.390277473094786e-11)
(0.5055666609728869, 8.094061613475806e-08)
(0.5142474682962077, 2.3664236984920564e-06)
(0.5159220615804712, 3.939078656596895e-08)
(0.5554446708410383, 2.311988835846832e-07)
(0.5092116571137849, 6.29878026289408e-08)
(0.5671292890110862, 2.8231954170521566e-06)
(0.5353594597849946, 9.536420555785089e-09)
(-0.5040190921349618, 2.185750226258991e-06)
(0.5365065934768141, 3.540381061315309e-08)
(0.5101685867856048, 5.894576927593014e-08)
(0.5506710752065497, 2.9239765756052913e-09)
(0.5316265311109988, 1.2609996803395728e-08)
(0.5510230261411645, 2.3797619768690187e-06)
(0.5318836550411455, 1.2371001992901337e-08)
(0.605514861986393, 2.506866932169879e-06)
16
['ABI3', 'AIF1', 'AOAH', 'APOBEC3G', 'ASB6', 'BTK', 'CALHM6', 'CRLF3', 'DOP1A', 'ETS1', 'GBP1', 'GBP4', 'GBP5', 'IRF4', 'LCP2', 'POU2F2']
