In [1]:
import pandas as pd
import numpy as np
import cptac
import cptac.pancan as pc
import scipy.stats
import seaborn as sns

import plot_utils as p
import cptac.utils as ut

In [None]:
def rename_duplicate_cols(df):
    """Make the column labels of ``df`` unique by suffixing repeated labels.

    The first occurrence of a repeated label is left unchanged; every later
    occurrence gets ``_1``, ``_2``, ... appended, counted left to right.
    The new labels are assigned onto ``df`` itself and that same object is
    returned.
    """
    labels = pd.Series(df.columns[:])
    repeated_labels = labels[labels.duplicated()].unique()

    for label in repeated_labels:
        matches = labels == label
        positions = labels[matches].index.values.tolist()
        # Occurrence 0 keeps the bare label; occurrence k > 0 becomes "<label>_<k>".
        replacements = [label if k == 0 else label + '_' + str(k) for k in range(sum(matches))]
        labels[positions] = replacements

    # Swap in the de-duplicated labels.
    df.columns = labels
    return df

In [None]:
def get_corr(df, dup_list):
    """Collect flagship-vs-pancan Pearson correlation tables for each ID.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns hold the flagship values (named ``<ID>_flagship``)
        next to the pancan columns they are compared with.
    dup_list : iterable of str
        IDs to process. Each ID is used as the pattern for
        ``df.columns.str.contains`` to find its columns.

    Returns
    -------
    pandas.DataFrame
        The outputs of ``p.wrap_pearson_corr`` for every flagship/pancan pair,
        stacked in processing order. Empty when there was nothing to compare.

    Raises
    ------
    ValueError
        If ``<ID>_flagship`` is not among the columns matched for an ID.
    """
    results = []

    for i in dup_list:
        flag = i + '_flagship'
        ids = df.columns[df.columns.str.contains(i)].to_list()
        ids.remove(flag)

        for ali in ids:
            # Only rows with a value for both flagship and pancan are compared.
            # Use the `df` argument: the function previously read the
            # notebook-level `both`, ignoring whatever frame was passed in.
            test_df = df[[flag, ali]].dropna()
            results.append(p.wrap_pearson_corr(test_df, flag))

    if not results:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # NOTE(review): assumes p.wrap_pearson_corr returns a DataFrame - confirm.
    return pd.concat(results)

In [None]:
def wrap_lin_reg(df, dup_list, y_lab, wa = True):
    """Correlation analysis: plot each flagship sample against its pancan duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with both flagship and pancan values; flagship columns are
        marked with the ``_flagship`` suffix.
    dup_list : iterable of str
        Patient IDs (without aliquots attached).
    y_lab : str
        Name of the pancan source. Currently unused: the y-axis label is
        always ``<column>_pdc``.
    wa : bool, default True
        When true, the pancan columns of an ID are found by using the ID as a
        ``str.contains`` pattern on the column names (columns containing
        ``.N`` are excluded unless the ID itself contains ``.N``). Otherwise
        the columns are taken to be ``<ID>`` and ``<ID>_1``.

    Notes
    -----
    An ID whose columns cannot be found or plotted is skipped, and the reason
    is printed so the failure is visible instead of silently swallowed.
    """
    for i in dup_list:
        try:
            flag = i + '_flagship'
            if wa:
                matches = df.columns.str.contains(i)
                if '.N' not in i:
                    # An ID without '.N' also matches its '.N' columns; leave those out.
                    matches = matches & ~df.columns.str.contains(r'\.N')
                ids = df.columns[matches].to_list()
                ids.remove(flag)
            else:
                ids = [i, i + '_1']

            for first in ids:
                # Use the `df` argument (previously the notebook-level `both`
                # was read here, ignoring the frame that was passed in).
                plot_df = df[[flag, first]].dropna(axis = 'index', how = 'any')
                # Annotation position: a third of the way along the flagship
                # range, one unit above the largest pancan value.
                xd = plot_df[flag].min() + ((plot_df[flag].max() - plot_df[flag].min()) / 3)
                yd = plot_df[first].max() + 1

                p.plot_pearson(plot_df, flag, first, x_coor = xd, y_coor = yd, y_label = first+'_pdc',
                       hue = "none", title = "", ra_stats = True,
                       show_plot = True)
        except Exception as err:
            # Best effort: keep going with the remaining IDs, but say why this one was skipped.
            print('Skipping {}: {}: {}'.format(i, type(err).__name__, err))
            continue

In [None]:
def get_flag_df(omics_name, ca, dup_list):
    """Return the flagship values for the requested samples, one column per sample.

    Parameters
    ----------
    omics_name : str
        ``'prot'``, ``'phospho'`` or ``'acetyl'``; selects which getter of
        ``ca`` is called.
    ca : object
        Dataset object providing ``get_proteomics`` / ``get_phosphoproteomics``
        / ``get_acetylproteomics`` and ``get_cancer_type``.
    dup_list : list-like of str
        Sample IDs (row labels of the omics frame) to keep.

    Returns
    -------
    pandas.DataFrame
        The selected rows, transposed, with ``_flagship`` appended to every
        sample ID so the columns can be told apart after a join.

    Raises
    ------
    ValueError
        If ``omics_name`` is not one of the three supported values.
    """
    if omics_name == 'prot':
        df = ca.get_proteomics()
    elif omics_name == 'phospho':
        df = ca.get_phosphoproteomics()
    elif omics_name == 'acetyl':
        df = ca.get_acetylproteomics()
    else:
        # Without this, the function failed later with an UnboundLocalError on `df`.
        raise ValueError("omics_name must be 'prot', 'phospho' or 'acetyl', got {!r}".format(omics_name))

    cancer_type = ca.get_cancer_type()

    if cancer_type == 'brca':
        # Relabel a copy so the frame handed out by the getter is never modified
        # (in case the getter returns a shared reference).
        df = df.copy()
        # Drop the leading 'X' some brca sample labels carry.
        df.index = [x[1:] if x.startswith('X') else x for x in df.index]

    df = df.loc[df.index.isin(dup_list)]
    df.index = df.index + '_flagship'

    if isinstance(df.columns, pd.MultiIndex):
        if omics_name == 'prot':
            df = ut.reduce_multiindex(df, levels_to_drop = ['Database_ID'])
        else: # phospho and acetyl
            df = ut.reduce_multiindex(df, levels_to_drop = ['Peptide', 'Database_ID'], flatten = True)
        # Dropping levels can make labels collide; keep the first of each.
        df = df.loc[:, ~df.columns.duplicated()]
    fdf = df.T

    if cancer_type == 'luad' and omics_name == 'phospho':
        # capitalize AA sites
        fdf.index = fdf.index.str.upper()

    return fdf

In [None]:
def get_pc_df(omics_name, ca_obj, source_name, dup_list = None, with_aliquots = True):
    """Return pancan values for the requested samples, one column per sample.

    Parameters
    ----------
    omics_name : str
        ``'prot'``, ``'phospho'`` or ``'acetyl'``; selects the getter called
        on ``ca_obj``.
    ca_obj : object
        Pancan dataset object providing the omics getters and
        ``get_cancer_type``.
    source_name : str
        Data source passed through to the getter (e.g. ``'pdc'``).
    dup_list : list-like of str, optional
        IDs to keep when ``with_aliquots`` is true: ``case_submitter_id``
        values for ``'pancanbrca'``, ``Patient_ID`` values otherwise.
        Defaults to no IDs.
    with_aliquots : bool, default True
        When true, columns are named ``<case_submitter_id>_<aliquot_submitter_id>``.
        When false, only rows whose index label is duplicated are kept.

    Returns
    -------
    pandas.DataFrame
        Transposed frame with upper-cased row labels and column labels made
        unique by ``rename_duplicate_cols``.

    Raises
    ------
    ValueError
        If ``omics_name`` is not one of the three supported values.
    """
    if omics_name == 'prot':
        df = ca_obj.get_proteomics(source_name)
    elif omics_name == 'phospho':
        df = ca_obj.get_phosphoproteomics(source_name)
    elif omics_name == 'acetyl':
        df = ca_obj.get_acetylproteomics(source_name)
    else:
        # Without this, the function failed later with an UnboundLocalError on `df`.
        raise ValueError("omics_name must be 'prot', 'phospho' or 'acetyl', got {!r}".format(omics_name))

    # Avoid a shared mutable default argument.
    if dup_list is None:
        dup_list = []

    if not with_aliquots:
        df = df.loc[df.index.duplicated(keep = False)]
        if omics_name != 'prot':
            df = ut.reduce_multiindex(df, levels_to_drop=[2], flatten = True)
            df = df.loc[:,~df.columns.duplicated()]
        df = df.T
        df = df.drop(['aliquot_submitter_id', 'case_submitter_id'])

    # When using aliquots, pass in a list of patient_IDs
    else:
        df = df.reset_index()
        if omics_name != 'prot':
            df = ut.reduce_multiindex(df, levels_to_drop=[2], flatten = True)
            df = df.loc[:,~df.columns.duplicated()]
        # brca wasn't mapped (no .N) so use case_submitter_id
        if ca_obj.get_cancer_type() == 'pancanbrca':
            df = df.loc[df.case_submitter_id.isin(dup_list)] # test with 21BR010 -> only 1 aliquot but part of 18 IDs with normal
            to_drop = ['case_submitter_id', 'aliquot_submitter_id']

        # luad and hnscc were mapped so use Patient_ID
        else:
            df = df.loc[df.Patient_ID.isin(dup_list)]
            to_drop = ['case_submitter_id', 'aliquot_submitter_id', 'Patient_ID']
        # Label every sample as "<case>_<aliquot>" before transposing.
        df.index = df.case_submitter_id+'_'+df.aliquot_submitter_id
        df = df.T
        df = df.drop(to_drop)

    df.index = df.index.str.upper()
    df = rename_duplicate_cols(df)

    return df

In [None]:
pdc=pc.PancanPdac()


In [2]:
h=pc.PancanHnscc()

                                               

In [12]:
# FIXME: this rebinds `p`, which the import cell bound to the plot_utils module
# (`import plot_utils as p`). Once this cell has run, every later
# `p.plot_pearson(...)` / `p.wrap_pearson_corr(...)` call looks the attribute up
# on this object instead of the module, until the import cell is re-run.
# Use a different name here (and in the cell that slices `p`) to fix it.
p=h.get_phosphoproteomics('pdc')
p.index.to_list()

['C3L-00977',
 'C3L-00987',
 'C3L-00994',
 'C3L-00995',
 'C3L-00997',
 'C3L-00999',
 'C3L-01138',
 'C3L-01237',
 'C3L-02617',
 'C3L-02621',
 'C3L-02651',
 'C3L-03378',
 'C3L-04025',
 'C3L-04354',
 'C3L-04791',
 'C3L-04844',
 'C3L-04849',
 'C3N-00204',
 'C3N-00295',
 'C3N-00297',
 'C3N-00299',
 'C3N-00306',
 'C3N-00307',
 'C3N-00498',
 'C3N-00519',
 'C3N-00822',
 'C3N-00825',
 'C3N-00828',
 'C3N-00829',
 'C3N-00846',
 'C3N-00857',
 'C3N-00871',
 'C3N-01337',
 'C3N-01338',
 'C3N-01339',
 'C3N-01340',
 'C3N-01620',
 'C3N-01643',
 'C3N-01645',
 'C3N-01752',
 'C3N-01754',
 'C3N-01755',
 'C3N-01756',
 'C3N-01757',
 'C3N-01758',
 'C3N-01858',
 'C3N-01859',
 'C3N-01943',
 'C3N-01944',
 'C3N-01945',
 'C3N-01946',
 'C3N-01947',
 'C3N-01948',
 'C3N-02275',
 'C3N-02279',
 'C3N-02333',
 'C3N-02693',
 'C3N-02694',
 'C3N-02695',
 'C3N-02700',
 'C3N-02713',
 'C3N-02714',
 'C3N-02716',
 'C3N-02727',
 'C3N-02730',
 'C3N-02925',
 'C3N-03008',
 'C3N-03009',
 'C3N-03011',
 'C3N-03012',
 'C3N-03013',
 'C3N-

In [7]:
p2 = p.loc[p.index.str.contains('CPT')]
#p2.rename(index = manually_mapped)
p2

Unnamed: 0_level_0,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADAT,AAGAB,AAK1,...,ZSCAN25,ZSWIM8,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [None]:
pdac_manually_mapped = {'CPT0347760002': 'C3L-07032.N', 'CPT0347790002': 'C3L-07033.N',
        'CPT0347820002': 'C3L-07034.N', 'CPT0347850002': 'C3L-07035.N', 'CPT0347880002': 'C3L-07036.N',
        'CPT0355180003': 'C3L-03513.N', 'CPT0355190003': 'C3L-03515.N', 'CPT0355200003': 'C3L-03514.N'}

In [None]:
# hnscc
manually_mapped = {'CPT0169740004': 'C3L-00994.N', 'CPT0229220002': 'C3L-02617.N',
            'CPT0163250003': 'C3N-01757.N', 'CPT0235470002': 'C3N-03042.N',
            'CPT0278700002': 'C3L-04350.N', 'CPT0281470002':'C3L-05257.N'}

In [None]:
# Two aliquots are not in the biospecimen file, so their tissue type is unknown:
# {'CPT0278700002': 'C3L-04350', 'CPT0281470002': 'C3L-05257'}

# Brca

In [None]:
b = pc.PancanBrca()
brca = cptac.Brca(version='3.1.1')

In [None]:
# show which omics has duplicates
ca = b
source_name = 'pdc'
prot = ca.get_proteomics(source_name)
phos = ca.get_phosphoproteomics(source_name)
acetyl = ca.get_acetylproteomics(source_name)
print('prot:', set(prot.index.duplicated()), len(prot.index[prot.index.duplicated()]), 'duplicates')
print('phospho:', set(phos.index.duplicated()), len(phos.index[phos.index.duplicated()]), 'duplicates')
print('acetyl:', set(acetyl.index.duplicated()), len(acetyl.index[acetyl.index.duplicated()]), 'duplicates')

In [None]:
# From prosp-brca-all-samples.txt
rep = ['11BR031', '11BR053', '11BR036', '11BR060', '14BR005', '11BR011', '21BR010'] # 2 replicates for 11BR036

norm = ['11BR074', '11BR073', '20BR007', '21BR010', '11BR017', '05BR029', '18BR003', '11BR030',
        '01BR027','11BR025', '11BR047', '11BR028', '11BR020', '20BR008', '11BR024', '11BR023',
        '11BR015', '11BR006']

print('IDs with replicates (all tumor):', len(rep))
print('IDs with a normal sample taken:', len(norm))

To use the code below, comment out everything in the formatting section of PdcBrca.py except the line that calls the `map_database_to_gene_pdc` function.

In [None]:
#example:
'''
phos = self._data["phosphoproteomics"]
#phos = phos.drop(drop_aliquots, level = 'aliquot_submitter_id') # drop normal aliquots (QC issues)
#phos = phos.rename(index={'604':'CPT000814'}) # use the aliquot for 604
#phos.index = phos.index.droplevel('aliquot_submitter_id')
#phos = rename_duplicate_labels(phos, 'index') # give replicates unique names (checked that only replicates remain)
#phos = average_replicates(phos, id_list = replicates) # average replicates
phos = map_database_to_gene_pdc(phos, 'refseq') # Map refseq IDs to gene names
self._data["phosphoproteomics"] = phos'''

In [None]:
#omics = 'prot'
#omics = 'phospho'
omics = 'acetyl' # needs work
nr_list = norm # choose norm or rep
pcdf = get_pc_df(omics, b, 'pdc', dup_list = nr_list, with_aliquots = True)
pcdf

In [None]:
fdf = get_flag_df(omics, brca, nr_list)
fdf

In [None]:
both = fdf.join(pcdf, how = 'inner')
both = both.dropna(axis = 1, how = 'all')
both

In [None]:
# Convert every column of `both` to a numeric dtype.
# The earlier per-column form, `both[col] = pd.to_numeric(both[col])`, raises
# "TypeError: arg must be a list, tuple, 1-d array, or Series" whenever
# `both[col]` is not a single Series (for example when a column label is
# repeated). It was observed with PDC phospho for brca, reportedly on a column
# containing only NaN. DataFrame.apply passes each column to to_numeric as a
# Series, so it does not depend on label lookups.
both = both.apply(pd.to_numeric)

In [None]:
wrap_lin_reg(both, nr_list, y_lab = source_name)

In [None]:
print(omics)
get_corr(both, nr_list)

In [None]:
# Check the aliquots we dropped were normal samples using the biospecimen manifest for Brca proteomics on the PDC website.
pdc_prot = pd.read_csv('../../../../Downloads/PDC_biospecimen_manifest_proteomics.tsv', sep='\t')
df = pdc_prot[['Case Submitter ID', 'Aliquot Submitter ID', 'Sample Type', 'Project Name']]
df = df.loc[df['Project Name'] == 'CPTAC2 Confirmatory']
prot_norm = df.loc[df['Sample Type'] == 'Solid Tissue Normal']
pdc_prot = prot_norm['Aliquot Submitter ID'].to_list()
pdc_prot.sort()

prot_dropped_aliquots = ['64ee175f-f3ce-446e-bbf4-9b6fa8_D1', '7ac27de9-0932-4ff5-aab8-29c527',
            '3208e021-1dae-42fd-bd36-0f3c3d', '6c660b6b-bfda-47b0-9499-160d49','241ecd0e-89bd-4d3a-81b3-55a250',
            '428de0d4-7f84-4075-bae1-352af6', '0a80d3c4-0758-447a-958c-ea868c', '53723086-8858-4395-93d7-0baa68',
            '1740224c-32d1-4c9f-98c6-653363', '885fe794-a98e-4f81-a284-ac4bb8', '4749ba99-d3b8-4ae3-b6f6-458bc7',
            '81116212-b7e6-454b-9579-105cf3', '1664b920-5e60-4e3b-9aab-fe121c', '3367406e-d39c-4641-a3e7-44e1f3',
            'e3d45dc6-66ef-4e0b-9d96-1b5db5', '33adae13-5dbd-4530-a5d5-3763e4', 'acf022b3-7f01-43b3-ac14-86f97d',
            '39f81c85-1832-45eb-829a-3040ad']

prot_dropped_aliquots.sort()
print('True = lists equal each other (have all aliquots)')
prot_dropped_aliquots == pdc_prot

In [None]:
# Check the aliquots we dropped were normal samples using the biospecimen manifest for Brca acetylproteomics on the PDC website.
pdc_acetyl = pd.read_csv('../../../../Downloads/PDC_biospecimen_manifest_acetylomics.tsv', sep='\t')
df = pdc_acetyl[['Case Submitter ID', 'Aliquot Submitter ID', 'Sample Type', 'Project Name']]
df = df.loc[df['Project Name'] == 'CPTAC2 Confirmatory']
pdc_acetyl = df.loc[df['Sample Type'] == 'Solid Tissue Normal']
pdc_acetyl = pdc_acetyl['Aliquot Submitter ID'].to_list()
pdc_acetyl.sort()

ac_dropped_aliquots = ['3208e021-1dae-42fd-bd36-0f3c3d', '39f81c85-1832-45eb-829a-3040ad', '4749ba99-d3b8-4ae3-b6f6-458bc7',
              '81116212-b7e6-454b-9579-105cf3', '0a80d3c4-0758-447a-958c-ea868c', '3367406e-d39c-4641-a3e7-44e1f3',
              '241ecd0e-89bd-4d3a-81b3-55a250', '428de0d4-7f84-4075-bae1-352af6', '7ac27de9-0932-4ff5-aab8-29c527',
              'acf022b3-7f01-43b3-ac14-86f97d', '1664b920-5e60-4e3b-9aab-fe121c', '6c660b6b-bfda-47b0-9499-160d49',
              'e3d45dc6-66ef-4e0b-9d96-1b5db5', '64ee175f-f3ce-446e-bbf4-9b6fa8_D1', '33adae13-5dbd-4530-a5d5-3763e4',
              '53723086-8858-4395-93d7-0baa68', '1740224c-32d1-4c9f-98c6-653363', '885fe794-a98e-4f81-a284-ac4bb8']
ac_dropped_aliquots.sort()
print('True = lists equal each other (have all aliquots)')
ac_dropped_aliquots == pdc_acetyl

In [None]:
# Are the normal aliquots the same between omics? Yes
prot_dropped_aliquots == ac_dropped_aliquots

# Hnscc

Check correlation of replicates with each other (confirmed same tissue type).

In [None]:
h = pc.PancanHnscc()
hnscc = cptac.Hnscc()

In [None]:
# show which omics has duplicates
ca = h
source_name = 'umich'
prot = ca.get_proteomics(source_name)
phos = ca.get_phosphoproteomics(source_name)
#acetyl = ca.get_acetylproteomics(source_name) # not included in pdchnscc
print('prot:', set(prot.index.duplicated()), list(prot.index[prot.index.duplicated()]))
print('phospho:', set(phos.index.duplicated()), list(set(phos.index[phos.index.duplicated()])))
#print('acetyl:', set(acetyl.index.duplicated()))

In [None]:
# umich: normalise the "-T-duplicate" / "-N-duplicate<digit>" suffixes, then
# display any rows whose Patient_ID still contains "duplicate".
df = prot.reset_index()
df = df.replace('-T-duplicate', '', regex = True)
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
df = df.replace(r'-N-duplicate\d?', '.N', regex = True)
df.loc[df.Patient_ID.str.contains('duplicate')]

In [None]:
# Keep aliquots
df = h.get_phosphoproteomics('pdc')
df = df.reset_index()
df = df.loc[df.Patient_ID.duplicated(keep = False)]
df = ut.reduce_multiindex(df, levels_to_drop = [1], flatten=True)
df = df.loc[:,~df.columns.duplicated()] # drop all duplicate prot or sites (dup once database ID dropped)
df.index = df.Patient_ID+'_'+df.aliquot_submitter_id # uncomment code that dropped aliquot in pancan to use
df = df.T
df = df.drop(['aliquot_submitter_id', 'Patient_ID','case_submitter_id'])
pcdf = rename_duplicate_cols(df)
pcdf.index = pcdf.index.str.upper()
pcdf

In [None]:
# Compare correlations between pancan duplicates: for each ID, plot every pair
# of matching aliquot columns against each other.

df = pcdf
for col in df.columns:
    df[col] = pd.to_numeric(df[col])

dup_list = ['C3L-02617', 'C3L-00994.N', 'C3L-02617.N']

for i in dup_list:
    matches = df.columns.str.contains(i)
    if '.N' not in i:
        # An ID without '.N' also matches its '.N' columns; leave those out.
        matches = matches & ~df.columns.str.contains(r'\.N')
    dups = df.columns[matches].to_list()

    print(i)
    if len(dups) < 2:
        # Previously this case failed with an IndexError on dups[1].
        print('  fewer than two matching columns - nothing to compare')
        continue

    # Every unordered pair of duplicates (one plot for two columns, three
    # plots for three columns, and so on).
    for pos, first in enumerate(dups):
        for second in dups[pos + 1:]:
            plot_df = df[[first, second]].dropna(axis = 'index', how = 'any')
            # Annotation position: a third of the way along the x range,
            # two units above the largest y value.
            xd = plot_df[first].min() + ((plot_df[first].max() - plot_df[first].min()) / 3)
            yd = plot_df[second].max() + 2

            p.plot_pearson(plot_df, first, second, x_coor = xd, y_coor = yd,
                   hue = "none", title = "", ra_stats = True,
                   show_plot = True)

In [None]:
# Flagship
hnscc_dup = ['C3L-02617', 'C3L-00994.N', 'C3L-02617.N']
fdf = get_flag_df('phospho', hnscc, hnscc_dup)
fdf

In [None]:
both = fdf.join(pcdf, how = 'inner')
both.head()
both

In [None]:
for col in both.columns:
    both[col] = pd.to_numeric(both[col])  

In [None]:
# compare to flagship - doesn't correlate well
wrap_lin_reg(both, hnscc_dup, y_lab = source_name)

Six aliquots that didn't map (they are still identified by their CPT aliquot IDs)

In [None]:
omics = 'prot'
#omics = 'phospho'
#omics = 'acetyl' 
not_mapped = ['CPT0163250003', 'CPT0169740004', 'CPT0229220002', 'CPT0235470002', 'CPT0278700002', 'CPT0281470002']
pcdf = get_pc_df(omics, h, 'pdc', dup_list = not_mapped, with_aliquots = True)
# get_pc_df names its columns "<case_submitter_id>_<aliquot_submitter_id>";
# keep the case ID by cutting at the last underscore instead of relying on the
# aliquot ID always being 13 characters long.
case_ids = [x.rsplit('_', 1)[0] for x in pcdf.columns.to_list()]
norm_case_ids = [x+'.N' for x in case_ids]
cored_norm_ids = [x+'.C' for x in case_ids]

In [None]:
test_ids = cored_norm_ids
fdf = get_flag_df(omics, hnscc, test_ids)
fdf

In [None]:
both = fdf.join(pcdf, how = 'inner')
both = both.dropna(axis = 1, how = 'all')

for col in both.columns:
    both[col] = pd.to_numeric(both[col])  
both

In [None]:
# compare to flagship - doesn't correlate well
wrap_lin_reg(both, test_ids, y_lab = 'pdc')


In [None]:
# Plot each flagship column with the tissue-type suffix `tt` against the
# pancan columns that match the bare case ID.
tt = '.N'
df = both
dup_list = test_ids
for i in case_ids:
    try:
        flag = i+tt+'_flagship'
        ids = df.columns[df.columns.str.contains(i)].to_list()
        print(ids)
        ids.remove(flag)
        print(flag)

        for first in ids:
            plot_df = df[[flag, first]].dropna(axis = 'index', how = 'any')
            # Annotation position: a third of the way along the flagship
            # range, one unit above the largest pancan value.
            xd = plot_df[flag].min() + ((plot_df[flag].max() - plot_df[flag].min()) / 3)
            yd = plot_df[first].max() + 1

            p.plot_pearson(plot_df, flag, first, x_coor = xd, y_coor = yd, y_label = first+'_pdc',
                   hue = "none", title = "", ra_stats = True,
                   show_plot = True)
    except Exception as err:
        # Best effort: carry on with the other IDs, but report why this one was skipped.
        print('Skipping {}: {}: {}'.format(i, type(err).__name__, err))
        continue
   

# Luad

In [None]:
l = pc.PancanLuad()
luad = cptac.Luad()

In [None]:
# show which omics has duplicates
ca = l
source_name = 'pdc'
prot = ca.get_proteomics(source_name)
phos = ca.get_phosphoproteomics(source_name)
acetyl = ca.get_acetylproteomics(source_name)
print('prot:', set(prot.index.duplicated()), list(prot.index[prot.index.duplicated()]))
print('phospho:', set(phos.index.duplicated()), list(phos.index[phos.index.duplicated()]))
print('acetyl:', set(acetyl.index.duplicated()), list(acetyl.index[acetyl.index.duplicated()]))

In [None]:
a=luad.get_acetylproteomics()
print(a.isnull().values.any())
a

In [None]:
a=l.get_acetylproteomics('pdc')
print(a.isnull().values.any())
a

In [None]:
def get_pc_df(omics_name, ca_obj, source_name, dup_list = None, with_aliquots = True):
    """Return pancan values for the requested samples, one column per sample.

    This redefines (and from here on replaces) the ``get_pc_df`` defined near
    the top of the notebook. The only intended difference visible in the code
    is ``levels_to_drop=[1]`` instead of ``[2]`` when ``with_aliquots`` is
    false.

    Parameters
    ----------
    omics_name : str
        ``'prot'``, ``'phospho'`` or ``'acetyl'``; selects the getter called
        on ``ca_obj``.
    ca_obj : object
        Pancan dataset object providing the omics getters and
        ``get_cancer_type``.
    source_name : str
        Data source passed through to the getter (e.g. ``'pdc'``).
    dup_list : list-like of str, optional
        IDs to keep when ``with_aliquots`` is true: ``case_submitter_id``
        values for ``'pancanbrca'``, ``Patient_ID`` values otherwise.
        Defaults to no IDs.
    with_aliquots : bool, default True
        When true, columns are named ``<case_submitter_id>_<aliquot_submitter_id>``.
        When false, only rows whose index label is duplicated are kept.

    Returns
    -------
    pandas.DataFrame
        Transposed frame with upper-cased row labels and column labels made
        unique by ``rename_duplicate_cols``.

    Raises
    ------
    ValueError
        If ``omics_name`` is not one of the three supported values.
    """
    if omics_name == 'prot':
        df = ca_obj.get_proteomics(source_name)
    elif omics_name == 'phospho':
        df = ca_obj.get_phosphoproteomics(source_name)
    elif omics_name == 'acetyl':
        df = ca_obj.get_acetylproteomics(source_name)
    else:
        # Without this, the function failed later with an UnboundLocalError on `df`.
        raise ValueError("omics_name must be 'prot', 'phospho' or 'acetyl', got {!r}".format(omics_name))

    # Avoid a shared mutable default argument.
    if dup_list is None:
        dup_list = []

    if not with_aliquots:
        df = df.loc[df.index.duplicated(keep = False)]
        if omics_name != 'prot':
            df = ut.reduce_multiindex(df, levels_to_drop=[1], flatten = True)
            df = df.loc[:,~df.columns.duplicated()]
        df = df.T
        df = df.drop(['aliquot_submitter_id', 'case_submitter_id'])

    # When using aliquots, pass in a list of patient_IDs
    else:
        df = df.reset_index()
        if omics_name != 'prot':
            df = ut.reduce_multiindex(df, levels_to_drop=[2], flatten = True)
            df = df.loc[:,~df.columns.duplicated()]
        # brca wasn't mapped (no .N) so use case_submitter_id
        if ca_obj.get_cancer_type() == 'pancanbrca':
            df = df.loc[df.case_submitter_id.isin(dup_list)] # test with 21BR010 -> only 1 aliquot but part of 18 IDs with normal
            to_drop = ['case_submitter_id', 'aliquot_submitter_id']

        # luad and hnscc were mapped so use Patient_ID
        else:
            df = df.loc[df.Patient_ID.isin(dup_list)]
            to_drop = ['case_submitter_id', 'aliquot_submitter_id', 'Patient_ID']

        # Label every sample as "<case>_<aliquot>" in BOTH branches. This line
        # used to sit inside the `else` only, which left brca results with
        # integer column labels.
        # (A disabled luad-specific variant used the flattened column names
        # 'case_submitter_id_case_submitter_id' and
        # 'aliquot_submitter_id_aliquot_submitter_id' instead.)
        df.index = df.case_submitter_id+'_'+df.aliquot_submitter_id

        df = df.T
        df = df.drop(to_drop)

    df.index = df.index.str.upper()
    df = rename_duplicate_cols(df)

    return df

In [None]:
#omics = 'phospho'
omics = 'acetyl'

dup_list = ['C3N-02379', 'C3N-02587']
pcdf = get_pc_df(omics, l, 'pdc', dup_list)
pcdf = pcdf.replace(0, np.nan)
pcdf

In [None]:
dup_list = ['C3N-02379', 'C3N-02587']
fdf = get_flag_df(omics, luad, dup_list)
fdf

In [None]:
both = fdf.join(pcdf, how = 'inner')
both.head()
both

for col in both.columns:
    both[col] = pd.to_numeric(both[col])  

wrap_lin_reg(both, dup_list, y_lab = 'pdc')

In [None]:
print(omics)
get_corr(both, dup_list)

# Pdac

In [None]:
pdc = pc.PancanPdac()
pdac = cptac.Pdac()

In [None]:
prot = pdc.get_proteomics('pdc')
not_mapped = prot.loc[prot.index.str.contains('CPT')]
aliquots = not_mapped.aliquot_submitter_id.to_list()
case_ids = not_mapped.case_submitter_id.to_list()
norm_case_ids = [x+'.N' for x in case_ids] # add normal 

In [None]:
#omics = 'phospho'
omics = 'prot'

pcdf = get_pc_df(omics, pdc, 'pdc', aliquots)
pcdf

In [None]:
fdf = get_flag_df(omics, pdac, norm_case_ids)
fdf

In [None]:
both = fdf.join(pcdf, how = 'inner')
both.head()
both

for col in both.columns:
    both[col] = pd.to_numeric(both[col])  
    
both

In [None]:
# Plot each '<case>.N_flagship' column against the pancan columns matching the case ID.
df = both
for i in case_ids:
    flag = i+'.N_flagship'

    try:
        ids = df.columns[df.columns.str.contains(i)].to_list()
        # Inside the try: a case without a flagship column is skipped
        # (and reported) instead of stopping the whole loop with a ValueError.
        ids.remove(flag)

        for first in ids:
            plot_df = df[[flag, first]].dropna(axis = 'index', how = 'any')
            # Annotation position: a third of the way along the flagship
            # range, one unit above the largest pancan value.
            xd = plot_df[flag].min() + ((plot_df[flag].max() - plot_df[flag].min()) / 3)
            yd = plot_df[first].max() + 1

            p.plot_pearson(plot_df, flag, first, x_coor = xd, y_coor = yd, y_label = first+'_pdc',
                   hue = "none", title = "", ra_stats = True,
                   show_plot = True)
    except Exception as err:
        # Best effort: carry on with the other cases, but report why this one was skipped.
        print('Skipping {}: {}: {}'.format(i, type(err).__name__, err))
        continue

In [None]:
# umich compared to flagship

In [None]:
# get aliquot and case ID pair from pdc
prot = pdc.get_proteomics('pdc')
df = prot.loc[prot.index.str.contains('CPT')]
ac = df[['aliquot_submitter_id','case_submitter_id']].set_index('aliquot_submitter_id')
map_dict = ac.to_dict()['case_submitter_id']

In [None]:
# umich: build a frame with one '<case>_umich' column per unmapped aliquot.
omics = 'prot'
#omics = 'phospho'

if omics == 'phospho':
    df = pdc.get_phosphoproteomics('umich')
    df = ut.reduce_multiindex(df, levels_to_drop=[2,3], flatten = True)
elif omics == 'prot':
    df = pdc.get_proteomics('umich')
    df = ut.reduce_multiindex(df, levels_to_drop=[1], flatten = True)
else:
    # Otherwise a stale `df` from an earlier cell would be processed silently.
    raise ValueError("omics must be 'prot' or 'phospho', got {!r}".format(omics))
# Dropping levels can make column labels collide; keep the first of each.
df = df.loc[:,~df.columns.duplicated()]
df = df.loc[df.index.isin(aliquots)]
# Replace aliquot IDs with their case IDs, then mark the source.
df = df.rename(index = map_dict)
df.index = df.index+'_umich'
umichdf = df.T
umichdf

In [None]:
both = umichdf.join(pcdf, how = 'inner')
both.head()
both

for col in both.columns:
    both[col] = pd.to_numeric(both[col])  
    
both

In [None]:
# Plot each '<case>_umich' column against the pdc columns matching the case ID.
df = both
for i in case_ids:
    flag = i+'_umich'

    try:
        ids = df.columns[df.columns.str.contains(i)].to_list()
        # Inside the try: a case without a umich column is skipped
        # (and reported) instead of stopping the whole loop with a ValueError.
        ids.remove(flag)

        for first in ids:
            plot_df = df[[flag, first]].dropna(axis = 'index', how = 'any')
            # Annotation position: a third of the way along the umich range,
            # one unit above the largest pdc value.
            xd = plot_df[flag].min() + ((plot_df[flag].max() - plot_df[flag].min()) / 3)
            yd = plot_df[first].max() + 1

            p.plot_pearson(plot_df, flag, first, x_coor = xd, y_coor = yd, y_label = first+'_pdc',
                   hue = "none", title = "", ra_stats = True,
                   show_plot = True)
    except Exception as err:
        # Best effort: carry on with the other cases, but report why this one was skipped.
        print('Skipping {}: {}: {}'.format(i, type(err).__name__, err))
        continue

In [None]:
# umich correlated well with pdc, but these aliquots are unmapped in umich as well,
# so we still don't know their tissue type