In [1]:
import pandas as pd
import cptac
import cptac.pancan as pc
import scipy.stats
import seaborn as sns

import plot_utils as p
import cptac.utils as ut



In [2]:
def rename_duplicate_cols(df):
    """Make the column labels of ``df`` unique by suffixing repeated labels.

    The first occurrence of a repeated label keeps its name; each later
    occurrence gets ``_1``, ``_2``, ... appended, in column order.
    ``df`` is modified in place and is also returned.
    """
    labels = pd.Series(df.columns[:])
    repeated = labels[labels.duplicated()].unique()

    for name in repeated:
        hits = labels.index[labels == name]
        # Occurrence 0 keeps the bare label; the k-th repeat becomes '<label>_<k>'.
        labels[hits.tolist()] = [
            name if k == 0 else name + '_' + str(k) for k in range(len(hits))
        ]

    # Install the de-duplicated labels on the frame.
    df.columns = labels
    return df

In [3]:
def get_corr(df, dup_list):
    """Collect flagship-vs-aliquot correlation results for each ID.

    For every ID in ``dup_list`` the column ``<ID>_flagship`` is paired with
    every other column of ``df`` whose label contains the ID. Each pair, with
    rows missing either value dropped, is passed to ``p.wrap_pearson_corr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the ``<ID>_flagship`` columns and the columns to
        compare against them.
    dup_list : iterable of str
        IDs without the ``_flagship`` suffix.

    Returns
    -------
    pandas.DataFrame
        The results of ``p.wrap_pearson_corr`` concatenated in call order
        (assumes that helper returns a DataFrame -- TODO confirm). An empty
        DataFrame is returned when there was nothing to compare.

    Raises
    ------
    ValueError
        If ``<ID>_flagship`` is not a column of ``df``.
    """
    results = []

    for patient_id in dup_list:
        flag = patient_id + '_flagship'
        # regex=False: IDs such as 'C3L-00994.N' contain '.', which must match literally.
        ids = df.columns[df.columns.str.contains(patient_id, regex=False)].to_list()
        if flag not in ids:
            raise ValueError("column '" + flag + "' not found in df")
        ids.remove(flag)

        for ali in ids:
            # Use the frame that was passed in (not a notebook global) and keep
            # only rows that have a value for both flagship and pancan.
            test_df = df[[flag, ali]].dropna()
            results.append(p.wrap_pearson_corr(test_df, flag))

    if not results:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
    return pd.concat(results)

In [4]:
def wrap_lin_reg(df, dup_list, y_lab, wa = True):
    """Plot each flagship column against its duplicate pancan columns.

    Correlation analysis: compares flagship values to duplicates in pancan.
    One ``p.plot_pearson`` call is made per (flagship, pancan) column pair,
    using only the rows that have a value in both columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with both flagship and pancan values; flagship columns are
        marked with the ``_flagship`` suffix.
    dup_list : iterable of str
        Patient IDs (without aliquots attached).
    y_lab : str
        Which source the pancan values come from.
        NOTE(review): currently unused -- the y-axis label is hard-coded to
        ``<column>_pdc`` below.
    wa : bool, default True
        "With aliquots". When True, the pancan columns are every column whose
        label contains the ID (columns containing '.N' are excluded unless the
        ID itself contains '.N'). Otherwise the pancan columns are taken to be
        ``<ID>`` and ``<ID>_1``.

    Raises
    ------
    ValueError
        If ``wa`` is True and ``<ID>_flagship`` is not among the matching columns.
    """
    for patient_id in dup_list:
        flag = patient_id + '_flagship'
        if wa == True:
            # regex=False: match the ID and the '.N' marker literally.
            matches = df.columns.str.contains(patient_id, regex=False)
            if '.N' not in patient_id:
                matches = matches & ~df.columns.str.contains('.N', regex=False)
            ids = df.columns[matches].to_list()
            ids.remove(flag)
        else:
            ids = [patient_id, patient_id + '_1']

        for first in ids:
            # Use the frame that was passed in (not a notebook global).
            plot_df = df[[flag, first]].dropna(axis = 'index', how = 'any')
            # Annotation position: one third along the x range, just above the highest y value.
            xd = plot_df[flag].min() + ((plot_df[flag].max() - plot_df[flag].min()) / 3)
            yd = plot_df[first].max() + 1

            p.plot_pearson(plot_df, flag, first, x_coor = xd, y_coor = yd, y_label = first+'_pdc',
                   hue = "none", title = "", ra_stats = True,
                   show_plot = True)

In [5]:
def get_flag_df(omics_name, ca, dup_list):
    """Return the flagship table for the given IDs, transposed and suffixed.

    Parameters
    ----------
    omics_name : str
        One of 'prot', 'phospho' or 'acetyl'.
    ca : object
        Dataset object providing ``get_proteomics`` / ``get_phosphoproteomics``
        / ``get_acetylproteomics`` and ``get_cancer_type``.
    dup_list : list of str
        Index labels (IDs) to keep.

    Returns
    -------
    pandas.DataFrame
        Transposed table: rows are the original column labels, columns are the
        kept IDs with '_flagship' appended.

    Raises
    ------
    ValueError
        If ``omics_name`` is not one of the supported names.
    """
    if omics_name == 'prot':
        df = ca.get_proteomics()
    elif omics_name == 'phospho':
        df = ca.get_phosphoproteomics()
    elif omics_name == 'acetyl':
        df = ca.get_acetylproteomics()
    else:
        # Previously fell through and failed later with an UnboundLocalError.
        raise ValueError("omics_name must be 'prot', 'phospho' or 'acetyl', got " + repr(omics_name))

    cancer_type = ca.get_cancer_type()

    if cancer_type == 'brca':
        # Strip a leading 'X' from the brca index labels (startswith is safe on empty labels).
        df.index = [x[1:] if x.startswith('X') else x for x in df.index]

    df = df.loc[df.index.isin(dup_list)]
    df.index = df.index+'_flagship'
    if isinstance(df.keys(), pd.MultiIndex):
        if omics_name == 'prot':
            df = ut.reduce_multiindex(df, levels_to_drop = ['Database_ID'])
        else: # phospho and acetyl
            df = ut.reduce_multiindex(df, levels_to_drop = ['Peptide', 'Database_ID'], flatten = True)
        df = df.loc[:,~df.columns.duplicated()] # drop all duplicate prot or sites (dup once database ID dropped)
    fdf = df.T

    if cancer_type == 'luad' and omics_name == 'phospho':
        # capitalize AA sites
        fdf.index = fdf.index.str.upper()

    return fdf

In [6]:
def get_pc_df(omics_name, ca_obj, source_name, dup_list = [], with_aliquots = True):
    """Return a pancan table transposed so that samples are columns.

    Parameters
    ----------
    omics_name : str
        One of 'prot', 'phospho' or 'acetyl'.
    ca_obj : object
        Pancan dataset object providing the ``get_*proteomics(source)`` getters
        and ``get_cancer_type``.
    source_name : str
        Source passed to the getter (for example 'pdc').
    dup_list : list of str, optional
        IDs to keep when ``with_aliquots`` is True (never modified here).
    with_aliquots : bool, default True
        True: keep the rows whose ID is in ``dup_list`` and label each sample
        column ``<case_submitter_id>_<aliquot_submitter_id>``.
        False: keep only rows whose index label occurs more than once.

    Returns
    -------
    pandas.DataFrame
        Transposed table with an upper-cased index and duplicate column labels
        made unique by ``rename_duplicate_cols``.

    Raises
    ------
    ValueError
        If ``omics_name`` is not one of the supported names.
    """
    if omics_name == 'prot':
        df = ca_obj.get_proteomics(source_name)
    elif omics_name == 'phospho':
        df = ca_obj.get_phosphoproteomics(source_name)
    elif omics_name == 'acetyl':
        df = ca_obj.get_acetylproteomics(source_name)
    else:
        # Previously fell through and failed later with an UnboundLocalError.
        raise ValueError("omics_name must be 'prot', 'phospho' or 'acetyl', got " + repr(omics_name))

    if with_aliquots == False:
        df = df.loc[df.index.duplicated(keep = False)]
        if omics_name != 'prot':
            df = ut.reduce_multiindex(df, levels_to_drop=[1], flatten = True)
            df = df.loc[:,~df.columns.duplicated()]
        df = df.T
        df = df.drop(['aliquot_submitter_id', 'case_submitter_id'])

    # When using aliquots, pass in a list of patient_IDs
    else:
        df = df.reset_index()
        if omics_name != 'prot':
            df = ut.reduce_multiindex(df, levels_to_drop=[1], flatten = True)
            df = df.loc[:,~df.columns.duplicated()]
        # brca wasn't mapped (no .N) so use case_submitter_id
        if ca_obj.get_cancer_type() == 'pancanbrca':
            df = df.loc[df.case_submitter_id.isin(dup_list)] # test with 21BR010 -> only 1 aliquot but part of 18 IDs with normal
            to_drop = ['case_submitter_id', 'aliquot_submitter_id']

        # luad and hnscc were mapped so use Patient_ID
        else:
            df = df.loc[df.Patient_ID.isin(dup_list)]
            to_drop = ['case_submitter_id', 'aliquot_submitter_id', 'Patient_ID']
        # Label every sample as '<case>_<aliquot>' before transposing.
        df.index = df.case_submitter_id+'_'+df.aliquot_submitter_id
        df = df.T
        df = df.drop(to_drop)

    df.index = df.index.str.upper()
    df = rename_duplicate_cols(df)

    return df

# Brca

In [None]:
# Pancan BRCA dataset object (b) and the flagship BRCA dataset pinned to version 3.1.1 (brca).
b = pc.PancanBrca()
brca = cptac.Brca(version='3.1.1')

In [None]:
# Report, per omics table, whether any index label is repeated and how many repeats there are.
ca = b
source_name = 'pdc'
prot = ca.get_proteomics(source_name)
phos = ca.get_phosphoproteomics(source_name)
acetyl = ca.get_acetylproteomics(source_name)
for omics_label, omics_table in (('prot:', prot), ('phospho:', phos), ('acetyl:', acetyl)):
    dup_mask = omics_table.index.duplicated()
    print(omics_label, set(dup_mask), len(omics_table.index[dup_mask]), 'duplicates')

In [None]:
# ID lists taken from prosp-brca-all-samples.txt
# rep: IDs with replicates (11BR036 has 2 replicates)
rep = [
    '11BR031', '11BR053', '11BR036', '11BR060',
    '14BR005', '11BR011', '21BR010',
]

# norm: IDs for which a normal sample was taken
norm = [
    '11BR074', '11BR073', '20BR007', '21BR010', '11BR017', '05BR029',
    '18BR003', '11BR030', '01BR027', '11BR025', '11BR047', '11BR028',
    '11BR020', '20BR008', '11BR024', '11BR023', '11BR015', '11BR006',
]

for caption, id_group in (('IDs with replicates (all tumor):', rep),
                          ('IDs with a normal sample taken:', norm)):
    print(caption, len(id_group))

To run the code below, comment out everything in the formatting section of PdcBrca.py except the line that calls the `map_database_to_gene_pdc` function.

In [None]:
# Example: the phosphoproteomics formatting section with every step commented out
# except the map_database_to_gene_pdc call. Kept as a string literal so it is not executed.
'''
phos = self._data["phosphoproteomics"]
#phos = phos.drop(drop_aliquots, level = 'aliquot_submitter_id') # drop normal aliquots (QC issues)
#phos = phos.rename(index={'604':'CPT000814'}) # use the aliquot for 604
#phos.index = phos.index.droplevel('aliquot_submitter_id')
#phos = rename_duplicate_labels(phos, 'index') # give replicates unique names (checked that only replicates remain)
#phos = average_replicates(phos, id_list = replicates) # average replicates
phos = map_database_to_gene_pdc(phos, 'refseq') # Map refseq IDs to gene names
self._data["phosphoproteomics"] = phos'''

In [None]:
# Choose the omics type to compare; leave exactly one assignment active.
#omics = 'prot'
#omics = 'phospho'
omics = 'acetyl' # needs work
nr_list = norm # choose norm or rep
# Pancan PDC table restricted to the chosen IDs (sample columns are labelled case_aliquot).
pcdf = get_pc_df(omics, b, 'pdc', dup_list = nr_list, with_aliquots = True)
pcdf

In [None]:
# Flagship table for the same IDs (columns carry the '_flagship' suffix).
fdf = get_flag_df(omics, brca, nr_list)
fdf

In [None]:
# Keep only the rows present in both tables, then drop columns that are entirely NaN.
both = fdf.join(pcdf, how = 'inner')
both = both.dropna(axis = 1, how = 'all')
both

In [None]:
# Convert every column of `both` to a numeric dtype, one column at a time.
# TypeError: arg must be a list, tuple, 1-d array, or Series
# can occur if a column contains only NaN (like with PDC phospho for brca)
for col in both.columns:
    both[col] = pd.to_numeric(both[col])  

In [None]:
# Plot each flagship column against its matching pancan aliquot columns.
wrap_lin_reg(both, nr_list, y_lab = source_name)

In [None]:
# Show which omics type is selected, then the table of flagship-vs-pancan correlations.
print(omics)
get_corr(both, nr_list)

In [None]:
# Check the aliquots we dropped were normal samples using the biospecimen manifest for Brca proteomics on the PDC website.
manifest_cols = ['Case Submitter ID', 'Aliquot Submitter ID', 'Sample Type', 'Project Name']
df = pd.read_csv('../../../../Downloads/PDC_biospecimen_manifest_proteomics.tsv', sep='\t')[manifest_cols]
df = df.loc[df['Project Name'] == 'CPTAC2 Confirmatory']
prot_norm = df.loc[df['Sample Type'] == 'Solid Tissue Normal']
# Sorted aliquot IDs of the normal samples listed in the manifest.
pdc_prot = sorted(prot_norm['Aliquot Submitter ID'].to_list())

# Sorted aliquot IDs that were dropped for proteomics.
prot_dropped_aliquots = sorted([
    '64ee175f-f3ce-446e-bbf4-9b6fa8_D1', '7ac27de9-0932-4ff5-aab8-29c527',
    '3208e021-1dae-42fd-bd36-0f3c3d', '6c660b6b-bfda-47b0-9499-160d49',
    '241ecd0e-89bd-4d3a-81b3-55a250', '428de0d4-7f84-4075-bae1-352af6',
    '0a80d3c4-0758-447a-958c-ea868c', '53723086-8858-4395-93d7-0baa68',
    '1740224c-32d1-4c9f-98c6-653363', '885fe794-a98e-4f81-a284-ac4bb8',
    '4749ba99-d3b8-4ae3-b6f6-458bc7', '81116212-b7e6-454b-9579-105cf3',
    '1664b920-5e60-4e3b-9aab-fe121c', '3367406e-d39c-4641-a3e7-44e1f3',
    'e3d45dc6-66ef-4e0b-9d96-1b5db5', '33adae13-5dbd-4530-a5d5-3763e4',
    'acf022b3-7f01-43b3-ac14-86f97d', '39f81c85-1832-45eb-829a-3040ad',
])

print('True = lists equal each other (have all aliquots)')
prot_dropped_aliquots == pdc_prot

In [None]:
# Check the aliquots we dropped were normal samples using the biospecimen manifest for Brca acetylproteomics on the PDC website.
acetyl_manifest_cols = ['Case Submitter ID', 'Aliquot Submitter ID', 'Sample Type', 'Project Name']
df = pd.read_csv('../../../../Downloads/PDC_biospecimen_manifest_acetylomics.tsv', sep='\t')[acetyl_manifest_cols]
df = df.loc[df['Project Name'] == 'CPTAC2 Confirmatory']
# Sorted aliquot IDs of the normal samples listed in the manifest.
pdc_acetyl = sorted(df.loc[df['Sample Type'] == 'Solid Tissue Normal']['Aliquot Submitter ID'].to_list())

# Sorted aliquot IDs that were dropped for acetylproteomics.
ac_dropped_aliquots = sorted([
    '3208e021-1dae-42fd-bd36-0f3c3d', '39f81c85-1832-45eb-829a-3040ad',
    '4749ba99-d3b8-4ae3-b6f6-458bc7', '81116212-b7e6-454b-9579-105cf3',
    '0a80d3c4-0758-447a-958c-ea868c', '3367406e-d39c-4641-a3e7-44e1f3',
    '241ecd0e-89bd-4d3a-81b3-55a250', '428de0d4-7f84-4075-bae1-352af6',
    '7ac27de9-0932-4ff5-aab8-29c527', 'acf022b3-7f01-43b3-ac14-86f97d',
    '1664b920-5e60-4e3b-9aab-fe121c', '6c660b6b-bfda-47b0-9499-160d49',
    'e3d45dc6-66ef-4e0b-9d96-1b5db5', '64ee175f-f3ce-446e-bbf4-9b6fa8_D1',
    '33adae13-5dbd-4530-a5d5-3763e4', '53723086-8858-4395-93d7-0baa68',
    '1740224c-32d1-4c9f-98c6-653363', '885fe794-a98e-4f81-a284-ac4bb8',
])
print('True = lists equal each other (have all aliquots)')
ac_dropped_aliquots == pdc_acetyl

In [None]:
# Are the normal aliquots the same between omics? Yes
# (both lists were sorted above, so list equality compares their contents)
prot_dropped_aliquots == ac_dropped_aliquots

# Hnscc

Check correlation of replicates with each other (confirmed same tissue type).

In [None]:
# Pancan HNSCC dataset object (h) and the flagship HNSCC dataset (hnscc).
h = pc.PancanHnscc()
hnscc = cptac.Hnscc()

In [None]:
# show which omics has duplicates
# For each table: the set of duplicated-flag values, then the index labels flagged as repeats
# (the phospho labels are de-duplicated with set() before printing).
ca = h
source_name = 'umich'
prot = ca.get_proteomics(source_name)
phos = ca.get_phosphoproteomics(source_name)
#acetyl = ca.get_acetylproteomics(source_name) # not included in pdchnscc
print('prot:', set(prot.index.duplicated()), list(prot.index[prot.index.duplicated()]))
print('phospho:', set(phos.index.duplicated()), list(set(phos.index[phos.index.duplicated()])))
#print('acetyl:', set(acetyl.index.duplicated()))

In [None]:
# umich
# Remove '-T-duplicate' and turn '-N-duplicate<optional digit>' into '.N' in every string
# cell, then show the rows whose Patient_ID still contains 'duplicate'.
df = prot.reset_index()
df = df.replace('-T-duplicate', '', regex = True)
# Raw string: '\d' in a plain string literal is an invalid escape sequence.
df = df.replace(r'-N-duplicate\d?', '.N', regex = True)
df.loc[df.Patient_ID.str.contains('duplicate')]

In [None]:
# Keep aliquots
# Build the pancan PDC phospho table for Patient_IDs that occur more than once,
# with one column per Patient_ID + aliquot pair.
df = h.get_phosphoproteomics('pdc')
df = df.reset_index()
df = df.loc[df.Patient_ID.duplicated(keep = False)]
# Drop column level 1 and flatten the remaining levels into single labels.
df = ut.reduce_multiindex(df, levels_to_drop = [1], flatten=True)
df = df.loc[:,~df.columns.duplicated()] # drop all duplicate prot or sites (dup once database ID dropped)
df.index = df.Patient_ID+'_'+df.aliquot_submitter_id # uncomment code that dropped aliquot in pancan to use
df = df.T
df = df.drop(['aliquot_submitter_id', 'Patient_ID','case_submitter_id'])
pcdf = rename_duplicate_cols(df)
# Upper-case the row labels of the transposed table.
pcdf.index = pcdf.index.str.upper()
pcdf

In [None]:
# compare correlations between pancan duplicates

# NOTE: df is the same object as pcdf, so the numeric conversion also applies to pcdf.
df = pcdf
for col in df.columns:
    df[col] = pd.to_numeric(df[col])

dup_list = ['C3L-02617', 'C3L-00994.N', 'C3L-02617.N']


def _plot_replicate_pair(frame, x_col, y_col):
    """Plot two replicate columns of ``frame`` against each other with ``p.plot_pearson``.

    Only rows with a value in both columns are used.
    """
    plot_df = frame[[x_col, y_col]].dropna(axis = 'index', how = 'any')
    # Annotation position: one third along the x range, 2 above the highest y value.
    xd = plot_df[x_col].min() + ((plot_df[x_col].max() - plot_df[x_col].min()) / 3)
    yd = plot_df[y_col].max() + 2

    p.plot_pearson(plot_df, x_col, y_col, x_coor = xd, y_coor = yd,
           hue = "none", title = "", ra_stats = True,
           show_plot = True)


for i in dup_list:
    # regex=False: match the ID and the '.N' marker literally.
    matches = df.columns.str.contains(i, regex = False)
    if '.N' not in i:
        matches = matches & ~df.columns.str.contains('.N', regex = False)
    dups = df.columns[matches].to_list()

    print(i)
    if len(dups) < 2:
        # Nothing to compare; previously this raised an IndexError.
        print('fewer than two replicate columns found, skipping')
        continue

    # Always compare the first two replicates; with exactly three, compare all pairs.
    pairs = [(dups[0], dups[1])]
    if len(dups) == 3:
        pairs += [(dups[1], dups[2]), (dups[0], dups[2])]

    for x_col, y_col in pairs:
        _plot_replicate_pair(df, x_col, y_col)

In [None]:
# Flagship
# Flagship phospho table for the three replicate IDs (columns suffixed with '_flagship').
hnscc_dup = ['C3L-02617', 'C3L-00994.N', 'C3L-02617.N']
fdf = get_flag_df('phospho', hnscc, hnscc_dup)
fdf

In [None]:
# Keep only the rows present in both the flagship and the pancan table.
both = fdf.join(pcdf, how = 'inner')
# NOTE: the result of head() is discarded; only the last expression of the cell is displayed.
both.head()
both

In [None]:
# Convert every column of `both` to a numeric dtype, one column at a time.
for col in both.columns:
    both[col] = pd.to_numeric(both[col])  

In [None]:
# compare to flagship - doesn't correlate well
# NOTE(review): source_name was last set to 'umich' above, while pcdf was built from 'pdc' -- confirm the intended label.
wrap_lin_reg(both, hnscc_dup, y_lab = source_name)

# Luad

In [7]:
# Pancan LUAD dataset object (l) and the flagship LUAD dataset (luad).
l = pc.PancanLuad()
luad = cptac.Luad()

                                              

In [None]:
# show which omics has duplicates
# For each table: the set of duplicated-flag values, then the index labels flagged as repeats.
ca = l
source_name = 'pdc'
prot = ca.get_proteomics(source_name)
phos = ca.get_phosphoproteomics(source_name)
acetyl = ca.get_acetylproteomics(source_name)
print('prot:', set(prot.index.duplicated()), list(prot.index[prot.index.duplicated()]))
print('phospho:', set(phos.index.duplicated()), list(phos.index[phos.index.duplicated()]))
print('acetyl:', set(acetyl.index.duplicated()), list(acetyl.index[acetyl.index.duplicated()]))

In [8]:
# Flagship LUAD acetylproteomics: print whether the table contains any missing value, then display it.
a=luad.get_acetylproteomics()
print(a.isnull().values.any())
a

True


Name,A2M,A2M,A2M,A2M,A2M,A2M,A2M,A2M,AAAS,AASS,...,ZNHIT1,ZNHIT1,ZRANB2,ZRANB2,ZRANB2,ZSCAN18,ZSCAN31,ZYX,ZYX,ZYX
Site,K1092,K1168,K1176,K145,K349,K516,K841N851,N662K664,K52,K138,...,K53,K66,K43,K54,K59,K422,K215,K24,K25,K279
Peptide,SSGSLLNNAIk,EVLkSLNEEAVK,SLNEEAVkK,SIYKPGQTVkFR,TITkLSFVK,TGTHGLLVkQEDMK,VSVQLEASPAFLAVPVEkEQAPHCICAnGR,HNVYINGITYTPVSSTnEk,GQWINLPVLQLTkDPLK,LIDYEkMVDHR,...,QLEALENDNFQDDPHAGLPQLGkR,RLPQFDDDADTGkK,TTEAkMMK,AGGTEIGkTLAEK,TLAEkSR,GTAkLGTK,DVSLDSkYR,PSPAISVSVSAPAFYAPQkK,kFGPVVAPK,FTPVASkFSPGAPGGSGSQPNQK
Database_ID,NP_002855.2|NP_000005.2|NP_001334353.1|NP_001334354.1,NP_000005.2|NP_001334353.1|NP_001334354.1,NP_000005.2|NP_001334353.1|NP_001334354.1,NP_000005.2|NP_001334353.1,NP_000005.2|NP_001334353.1|NP_001334354.1,NP_000005.2|NP_001334354.1,NP_000005.2|NP_001334353.1|NP_001334354.1,NP_000005.2|NP_001334353.1|NP_001334354.1,NP_056480.1|NP_001166937.1,NP_005754.2,...,NP_006340.1,NP_006340.1,NP_976225.1|NP_005446.2,NP_976225.1|NP_005446.2,NP_976225.1|NP_005446.2,NP_001139014.1|NP_001139015.1|NP_001139016.1,NP_001128687.1|NP_001230171.1,NP_001010972.1|NP_001349712.1,NP_001010972.1|NP_001349712.1,NP_001010972.1|NP_001349712.1
Patient_ID,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
C3L-00001,0.8466,,,,,,,0.0431,1.0804,,...,,1.6155,,-0.1037,1.3083,0.4874,2.3360,-1.6066,-0.9942,-0.8204
C3L-00009,,,-0.8921,1.7181,,-1.4155,,0.3232,-0.5905,,...,,-0.2135,,-1.5885,-0.8212,0.1568,,-1.0208,-0.6570,-1.5530
C3L-00080,,,0.3115,-0.1409,,,,-1.3800,-0.2844,-0.2675,...,,1.0729,1.3717,-0.0109,-0.1611,,-1.0288,-0.3046,-1.2230,-1.4914
C3L-00083,2.2170,,4.1378,5.7329,2.8518,2.5041,,,,,...,-0.1122,-0.7938,,-0.7690,,,,0.2770,-1.2768,-0.0349
C3L-00093,1.2599,-1.3746,1.1798,-1.0192,-0.4608,-0.7680,0.5919,-0.2577,,,...,1.5699,1.0515,0.5064,-0.6505,,-1.0620,0.9313,-0.0360,1.9653,-0.4100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02582.N,,,-0.1520,3.5939,0.5367,,,0.1673,3.3652,,...,-0.2211,-1.8272,,-0.0495,,-0.1830,,,1.0681,-0.0042
C3N-02586.N,,2.9990,2.7982,0.5220,0.1622,2.4410,0.0449,0.2874,,,...,-0.6434,-1.3370,,0.3864,0.3734,,,0.1831,0.4151,-1.4621
C3N-02587.N,,0.2923,-0.6830,0.3901,0.2020,,,-1.8763,,0.5656,...,-1.4426,-1.2345,-1.5028,0.4226,,0.0090,,-0.5150,0.5229,0.1419
C3N-02588.N,,,2.3460,0.1487,-0.7344,,,,,,...,,-1.3447,-1.4092,-0.6198,,,,-1.7591,-0.9906,-0.1387


In [10]:
# Pancan LUAD acetylproteomics from the 'pdc' source: print whether the table contains any missing value, then display it.
a=l.get_acetylproteomics('pdc')
print(a.isnull().values.any())
a

False


Name,A2M,A2M,A2M,A2M,A2M,A2M,A2M,A2M,A2M,A2M,...,ZRANB2,ZW10,ZYX,ZYX,ZYX,ZZEF1,ZZZ3,ZZZ3,aliquot_submitter_id,case_submitter_id
Site,K1047,K1075,K1092,K1133,K1162,K1168,K1176,K1177,K1315,K135,...,K54,K634,K25,K279,K533,K2306,K161,K693,NaN,NaN
Database_ID,NP_000005.2,NP_000005.2,NP_000005.2,NP_000005.2,NP_000005.2,NP_000005.2,NP_000005.2,NP_000005.2,NP_000005.2,NP_000005.2,...,NP_005446.2,NP_004715.1,NP_001010972.1,NP_001010972.1,NP_001010972.1,NP_055928.3,NP_056349.1,NP_056349.1,aliquot_submitter_id,case_submitter_id
Patient_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
11LU013,0.0,0.0,-1.2854,0.0,0.0,-0.845,-0.2807,0.0,0.0,0.0,...,0.3162,1.1813,0.0,-0.4585,0.0,0.0,1.1249,0.0,CPT0053040004,11LU013
11LU016,0.0,0.0,-0.3704,0.0,0.0,0.0,-0.6854,0.0,0.0,0.0,...,0.0405,0.0,0.0,0.0012,0.0,0.0,0.0,0.0,CPT0052940004,11LU016
11LU022,0.0,0.0,-0.4639,0.0,0.0,-0.3659,0.1329,0.0,0.0,-0.8698,...,-0.1089,0.0,0.0,0.3444,0.0,0.0,0.4297,0.0,CPT0052170004,11LU022
11LU035,0.0,0.0,0.0,0.0,0.0,0.0,-1.1599,0.0,0.0,0.0,...,-0.0266,0.0,0.0,-0.4237,0.0,0.0184,0.0,0.0,CPT0051690004,11LU035
C3L-00001,0.0,0.0,0.5465,0.0,0.0,0.0,-0.2958,0.0,0.0,0.0,...,0.0613,0.0,0.0,-0.0476,0.0,-0.0702,0.3532,0.0,CPT0001580009,C3L-00001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02582.N,0.0,0.0,-0.2326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.2175,0.0,0.0,0.0,0.0,CPT0147900003,C3N-02582
C3N-02586.N,0.0,0.2565,0.0,0.0,0.0,1.035,1.4368,0.0,0.0,0.0,...,-0.101,0.0,0.0542,-0.6578,0.0,0.0,0.0,0.0,CPT0148000003,C3N-02586
C3N-02587.N,0.0,0.0,0.5658,0.0,0.0,0.2637,0.6163,0.0,0.0,-0.0936,...,0.0984,0.0,0.0,-0.1604,0.0,0.0,0.0,0.0,CPT0148100003,C3N-02587
C3N-02588.N,0.9613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.2828,0.193,0.0,0.3348,0.0,0.0,0.0,0.0,CPT0148200003,C3N-02588


In [23]:
def get_pc_df(omics_name, ca_obj, source_name, dup_list = [], with_aliquots = True):
    """Return a pancan table transposed so that samples are columns.

    In this definition the aliquot branch drops column level 2 when reducing
    the multiindex; the no-aliquot branch drops level 1.

    Parameters
    ----------
    omics_name : str
        One of 'prot', 'phospho' or 'acetyl'.
    ca_obj : object
        Pancan dataset object providing the ``get_*proteomics(source)`` getters
        and ``get_cancer_type``.
    source_name : str
        Source passed to the getter (for example 'pdc').
    dup_list : list of str, optional
        IDs to keep when ``with_aliquots`` is True (never modified here).
    with_aliquots : bool, default True
        True: keep the rows whose ID is in ``dup_list`` and label each sample
        column ``<case_submitter_id>_<aliquot_submitter_id>``.
        False: keep only rows whose index label occurs more than once.

    Returns
    -------
    pandas.DataFrame
        Transposed table with an upper-cased index and duplicate column labels
        made unique by ``rename_duplicate_cols``.

    Raises
    ------
    ValueError
        If ``omics_name`` is not one of the supported names.
    """
    if omics_name == 'prot':
        df = ca_obj.get_proteomics(source_name)
    elif omics_name == 'phospho':
        df = ca_obj.get_phosphoproteomics(source_name)
    elif omics_name == 'acetyl':
        df = ca_obj.get_acetylproteomics(source_name)
    else:
        # Previously fell through and failed later with an UnboundLocalError.
        raise ValueError("omics_name must be 'prot', 'phospho' or 'acetyl', got " + repr(omics_name))

    if with_aliquots == False:
        df = df.loc[df.index.duplicated(keep = False)]
        if omics_name != 'prot':
            df = ut.reduce_multiindex(df, levels_to_drop=[1], flatten = True)
            df = df.loc[:,~df.columns.duplicated()]
        df = df.T
        df = df.drop(['aliquot_submitter_id', 'case_submitter_id'])

    # When using aliquots, pass in a list of patient_IDs
    else:
        df = df.reset_index()
        if omics_name != 'prot':
            df = ut.reduce_multiindex(df, levels_to_drop=[2], flatten = True)
            df = df.loc[:,~df.columns.duplicated()]
        # brca wasn't mapped (no .N) so use case_submitter_id
        if ca_obj.get_cancer_type() == 'pancanbrca':
            df = df.loc[df.case_submitter_id.isin(dup_list)] # test with 21BR010 -> only 1 aliquot but part of 18 IDs with normal
            to_drop = ['case_submitter_id', 'aliquot_submitter_id']

        # luad and hnscc were mapped so use Patient_ID
        else:
            df = df.loc[df.Patient_ID.isin(dup_list)]
            to_drop = ['case_submitter_id', 'aliquot_submitter_id', 'Patient_ID']

        # Label every sample as '<case>_<aliquot>' for all cancer types; when this
        # was done only in the non-brca branch, brca columns kept positional labels.
        df.index = df.case_submitter_id+'_'+df.aliquot_submitter_id

        df = df.T
        df = df.drop(to_drop)

    df.index = df.index.str.upper()
    df = rename_duplicate_cols(df)

    return df

In [24]:
# Choose the omics type, then build the pancan PDC table for the two LUAD IDs.
#omics = 'phospho'
omics = 'acetyl'
dup_list = ['C3N-02379', 'C3N-02587']
pcdf = get_pc_df(omics, l, 'pdc', dup_list)
# NOTE: enabling the next line requires numpy to be imported as np (it is not imported in the first cell).
#pcdf = pcdf.replace(0, np.nan)
pcdf



AttributeError: 'DataFrame' object has no attribute 'case_submitter_id_case_submitter_id'

In [None]:
# Flagship table for the same two LUAD IDs (columns suffixed with '_flagship').
dup_list = ['C3N-02379', 'C3N-02587']
fdf = get_flag_df(omics, luad, dup_list)
fdf

In [None]:
# Keep only the rows present in both the flagship and the pancan table.
both = fdf.join(pcdf, how = 'inner')
# NOTE: these two expressions are not the last statement of the cell, so nothing is displayed.
both.head()
both

# Convert every column to a numeric dtype, one column at a time.
for col in both.columns:
    both[col] = pd.to_numeric(both[col])  

# Plot each flagship column against its matching pancan aliquot columns.
wrap_lin_reg(both, dup_list, y_lab = source_name)

In [None]:
# Show which omics type is selected, then the table of flagship-vs-pancan correlations.
print(omics)
get_corr(both, dup_list)