In [30]:
import pandas as pd
import numpy as np
import warnings
from functools import reduce
import re

In [39]:
# average replicates
def average_replicates(df, common = r'\.', to_drop = r'\.\d$'):
    """Collapse replicate rows into a single averaged row per patient.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index holds string sample IDs (one row per sample).
    common : str
        Regex that matches every replicate ID (and no ID that should be kept as is).
    to_drop : str
        Regex removed from a replicate ID to obtain the patient ID under which
        the averaged row is stored.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each patient with replicates has one row holding the
        column-wise mean, and every row whose ID matches ``common`` is removed.
        The input ``df`` is not modified.

    Notes
    -----
    The rows averaged for a patient are all rows whose ID contains the patient ID
    as a literal substring, so a base row such as ``X`` is averaged together with
    ``X.1``. The patient ID is no longer interpreted as a regular expression.
    """
    replicate_mask = df.index.str.contains(common)
    replicate_ids = pd.Series(df.index[replicate_mask]) # IDs of all replicate rows
    # Strip the replicate suffix; pd.unique keeps first-seen order so the result is deterministic
    id_list = list(pd.unique(replicate_ids.replace(to_drop, '', regex=True)))

    averaged = df.copy() # work on a copy so the caller's frame is left untouched
    for patient_ID in id_list:
        # Slice from the unmodified input so earlier averages never feed into later ones
        id_df = df[df.index.str.contains(patient_ID, regex=False)]
        averaged.loc[patient_ID] = id_df.mean(axis=0).to_numpy() # add/overwrite the averaged row

    # Drop the unaveraged replicate rows (averaged rows are kept)
    return averaged[~ averaged.index.str.contains(common)]

In [47]:
# Cancer types that can be selected below ("CO" is noted in the trailing comment but not listed)
cancer_names = [
    'GBM', 'LSCC', 'LUAD', 'EC',
    'ccRCC', 'BR', 'HNSCC', 'OV',
] #CO
ca = cancer_names[7] # cancer type whose report file the cells below will read

# Report file for the selected cancer type
file_name = ca + "_Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv"
file_path = "../../../proteomics/prot_names/" + file_name

# Table used by the cancer-specific cells to translate aliquot_IDs into Patient_IDs
mapping_df = pd.read_csv(
    '../../../input/aliquot_to_patient_ID.tsv',
    sep = '\t',
    index_col = 0,
)

In [51]:
# Show the combined table (tumor rows followed by normal rows).
# all_df is assigned by the cancer-specific cells further down, so one of them must have been run first.
all_df

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,DBNDD1-201,HS3ST1-201,SEMA3F-201,CFTR-201,...,EFCAB14-208,MYO6-218,EED-213,STAT1-216,ZBTB3-204,PRX-205,WIZ-211,TSGA10-219,RFX7-204,HSPA12A-210
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000003249.14,ENSG00000002587.10,ENSG00000001617.12,ENSG00000001626.16,...,ENSG00000159658.13,ENSG00000196586.16,ENSG00000074266.21,ENSG00000115415.20,ENSG00000185670.9,ENSG00000105227.15,ENSG00000011451.21,ENSG00000135951.16,ENSG00000181827.15,ENSG00000165868.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
01OV007,-0.527655,-0.380903,,-1.311341,-0.086341,0.025977,1.109507,-0.240346,,,...,0.231411,-0.154156,0.198634,-0.872461,,,-0.252536,,0.276181,1.647959
01OV017,0.320065,0.019606,1.426429,-0.807435,-0.104763,-0.253329,,,,-2.305889,...,-0.157979,-0.163544,-0.348380,-3.261571,,,-0.572754,,-0.059106,1.694668
01OV018,0.189254,-0.152288,-0.767595,-0.530582,-0.053460,0.041271,,,-0.423595,,...,-0.460277,0.083976,0.104768,0.328122,,,0.759240,,,0.786228
01OV023,0.158305,-0.018960,0.361530,-0.666655,-0.278404,-0.135457,-0.115583,,,,...,-0.411445,-0.222297,0.566934,-0.578624,,,-0.458496,,-0.020231,1.349137
01OV026,-0.268311,-0.071799,,-1.034479,-0.280523,0.098167,-0.056371,,0.162327,,...,-0.049722,0.315695,-0.122322,1.319497,,,-0.120384,-0.846895,,0.465927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14OV029.N,-0.548040,-0.308129,,-0.925171,-0.075288,0.074260,-0.157462,,,,...,-0.217788,-0.095648,0.194189,0.017896,0.839814,,-0.206143,,,1.205240
17OV001.N,0.655320,0.140041,-0.187128,0.569817,0.367049,0.743340,,,0.579353,0.023745,...,0.087221,0.722498,0.296255,0.683179,,,-0.111355,,-0.220782,-0.425013
17OV003.N,-0.534742,-0.524127,-0.098345,-1.208964,0.061149,-0.011026,0.390200,,,,...,-0.028314,-0.123066,0.065659,-1.528563,,,-0.460306,,-0.711129,1.039328
17OV004.N,0.280177,0.090590,,-0.976109,-0.281006,0.086104,,,-0.562525,,...,0.051253,-0.133336,-0.211110,1.116495,,,-0.228061,,-0.968338,0.367660


In [50]:
# OV
# Build the OV table: rows are samples (tumor first, then normal), columns are (Proteins, Database_ID).

# Sample annotation: keep the 'sample' column next to the file's first column,
# which reset_index() exposes as the 'index' column used below.
ov_map = pd.read_csv('../../../input/OV_sample_TMT_annotation_UMich_GENCODE34_0315.csv', delimiter = ',', index_col = 0)
ov_map = ov_map[['sample']].reset_index()
# Swap '#' for '.' in every string cell -- presumably so annotation labels line up with the
# '.1'-style suffixes pandas adds to duplicated column names; TODO confirm.
ov_map = ov_map.replace("#",".", regex = True)

# Run only when OV is the selected cancer type.
# (The previous guard compared file_name with itself, which is always True.)
if ca == 'OV':
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM']) # index is protein identifier (duplicate)
    # Keep only the sixth '|'-separated field of each identifier (used as the protein label)
    df['Index'] = df['Index'].apply(lambda identifier: identifier.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # rows become samples, columns become proteins
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop the ReferenceIntensity row by label rather than by position
    df.index.name = 'Patient_ID'

    drop_cols = ['JHU-QC', 'JHU-QC.1', 'JHU-QC.2', 'JHU-QC.3', 'JHU-QC.4',
       'RefInt_PNNL-JHU-Ref-1', 'RefInt_PNNL-JHU-Ref-2',
       'RefInt_PNNL-JHU-Ref-3', 'RefInt_PNNL-JHU-Ref-4',
       'RefInt_PNNL-JHU-Ref-5', 'RefInt_PNNL-JHU-Ref-6',
       'RefInt_PNNL-JHU-Ref-7', 'RefInt_PNNL-JHU-Ref-8',
       'RefInt_PNNL-JHU-Ref-9', 'RefInt_PNNL-JHU-Ref-10',
       'RefInt_PNNL-JHU-Ref-11', 'RefInt_PNNL-JHU-Ref-12']

    # Drop quality control and ref intensity rows
    df = df.drop(index = drop_cols)

    # Get Patient_IDs: restrict the annotation to the aliquots present in this table
    index_list = list(df.index)
    ov_map = ov_map.loc[ov_map['index'].isin(index_list)]
    matched_ids = dict(zip(ov_map['index'], ov_map['sample']))

    # Relabel only the row index; labels without a mapping are left unchanged.
    # (DataFrame.replace would scan every value in the frame.)
    df = df.rename(index = matched_ids)
    # regex=True is required: pandas >= 2.0 treats the pattern literally by default,
    # so '-T$' would otherwise never match.
    df.index = df.index.str.replace(r'-T$', '', regex=True)
    df.index = df.index.str.replace(r'-N$', '.N', regex=True)

    # Sort: normal samples are the IDs ending in a literal '.N'
    is_normal = df.index.str.endswith('.N')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~ is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    all_df = pd.concat([tumor, normal])

In [45]:
# HNSCC
# Build the HNSCC table: rows are samples (tumor first, then normal), columns are (Proteins, Database_ID).
# Run only when HNSCC is the selected cancer type.
# (The previous guard compared file_name with itself, which is always True.)
if ca == 'HNSCC':
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep only the sixth '|'-separated field of each identifier (used as the protein label)
    df['Index'] = df['Index'].apply(lambda identifier: identifier.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # rows become samples, columns become proteins
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop the ReferenceIntensity row by label rather than by position
    df.index.name = 'Patient_ID'

    drop_cols = ['128C', 'QC2', 'QC3', 'QC4', '129N', 'LungTumor1', 'Pooled-sample14',
       'LungTumor2', 'QC6', 'LungTumor3', 'Pooled-sample17', 'QC7',
       'Pooled-sample19', 'QC9', 'RefInt_pool01', 'RefInt_pool02',
       'RefInt_pool03', 'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06',
       'RefInt_pool07', 'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10',
       'RefInt_pool11', 'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14',
       'RefInt_pool15', 'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18',
       'RefInt_pool19', 'RefInt_pool20'] # has -C ("C3L-00994-C")

    # Drop quality control and ref intensity rows
    df = df.drop(index = drop_cols)

    # Rows labelled '-duplicate' are treated as replicates and averaged.
    # NOTE(review): to_drop also strips the '-T'/'-N' tissue suffix, so the averaged row is stored
    # under the bare patient ID, and average_replicates selects the rows to average by substring
    # match on that bare ID. Confirm that mixing a patient's other rows into the mean is intended.
    df = average_replicates(df, common = '-duplicate', to_drop = '-[NT]-duplicate.*')

    # regex=True is required: pandas >= 2.0 treats the pattern literally by default,
    # so '-T$' would otherwise never match.
    df.index = df.index.str.replace(r'-T$', '', regex=True)
    df.index = df.index.str.replace(r'-N$', '.N', regex=True)

    # Sort values: normal samples are the IDs ending in a literal '.N'
    is_normal = df.index.str.endswith('.N')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~ is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    all_df = pd.concat([tumor, normal])

In [37]:
# BR
# Build the BR table: rows are samples (tumor first, then normal), columns are (Proteins, Database_ID).
# Run only when BR is the selected cancer type.
# (The previous guard compared file_name with itself, which is always True.)
if ca == 'BR':
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep only the sixth '|'-separated field of each identifier (used as the protein label)
    df['Index'] = df['Index'].apply(lambda identifier: identifier.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # rows become samples, columns become proteins
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop the ReferenceIntensity row by label rather than by position
    df.index.name = 'Patient_ID'

    drop_cols = ['RetroIR', 'CPT0018460005', 'CPT0008140004', 'RetroIR.1',
       'RefInt_Pool01', 'RefInt_Pool02', 'RefInt_Pool03', 'RefInt_Pool04',
       'RefInt_Pool05', 'RefInt_Pool06', 'RefInt_Pool07', 'RefInt_Pool08',
       'RefInt_Pool09', 'RefInt_Pool10', 'RefInt_Pool11', 'RefInt_Pool12',
       'RefInt_Pool13', 'RefInt_Pool14', 'RefInt_Pool15', 'RefInt_Pool16',
       'RefInt_Pool17']

    # Drop quality control and ref intensity rows
    df = df.drop(index = drop_cols)

    # Since cptac brca has no normal samples, the duplicates are treated as replicates
    # (default patterns: IDs containing '.' are replicates, a trailing '.<digit>' is stripped)
    df = average_replicates(df)

    # Sort values: normal samples are the IDs ending in a literal '.N'
    is_normal = df.index.str.endswith('.N')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~ is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    all_df = pd.concat([tumor, normal])

In [29]:
# HCC
# Build the HCC table: rows are samples (tumor first, then normal), columns are (Proteins, Database_ID).
# Run only when HCC is the selected cancer type.
# (The previous guard compared file_name with itself, which is always True.)
if ca == 'HCC':
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep only the sixth '|'-separated field of each identifier (used as the protein label)
    df['Index'] = df['Index'].apply(lambda identifier: identifier.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # rows become samples, columns become proteins
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop the ReferenceIntensity row by label rather than by position
    df.index.name = 'Patient_ID'

    # Drop quality control and ref intensity rows
    # NOTE(review): selected by position (the last 33 rows), not by name -- verify against the file
    drop_cols = df.index[-33:]
    df = df.drop(index = drop_cols)

    # Get Patient_IDs
    # slice mapping_df to include cancer specific aliquot_IDs
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

    # Create dictionary with aliquot_ID as keys and patient_ID as values
    matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

    # Relabel only the row index; labels without a mapping are left unchanged.
    # (DataFrame.replace would scan every value in the frame.)
    df = df.rename(index = matched_ids)

    # Sort values: normal samples are the IDs ending in a literal '.N'
    is_normal = df.index.str.endswith('.N')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~ is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    all_df = pd.concat([tumor, normal])

In [25]:
# ccRCC
# Build the ccRCC table: rows are samples (tumor first, then normal), columns are (Proteins, Database_ID).
# Run only when ccRCC is the selected cancer type.
# (The previous guard compared file_name with itself, which is always True.)
if ca == 'ccRCC':
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep only the sixth '|'-separated field of each identifier (used as the protein label)
    df['Index'] = df['Index'].apply(lambda identifier: identifier.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # rows become samples, columns become proteins
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop the ReferenceIntensity row by label rather than by position
    df.index.name = 'Patient_ID'

    drop_cols = ['NCI7-1', 'QC1', 'QC2', 'QC3', 'NCI7-2', 'NCI7-3', 'QC4', 'NCI7-4',
       'NCI7-5', 'QC5', 'QC6', 'QC7', 'QC8', 'RefInt_pool01', 'RefInt_pool02',
       'RefInt_pool03', 'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06',
       'RefInt_pool07', 'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10',
       'RefInt_pool11', 'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14',
       'RefInt_pool15', 'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18',
       'RefInt_pool19', 'RefInt_pool20', 'RefInt_pool21', 'RefInt_pool22',
       'RefInt_pool23']

    # Drop quality control and ref intensity rows
    df = df.drop(index = drop_cols)

    # Get Patient_IDs
    # slice mapping_df to include cancer specific aliquot_IDs
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

    # Create dictionary with aliquot_ID as keys and patient_ID as values
    matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

    # Relabel only the row index; labels without a mapping are left unchanged.
    # (DataFrame.replace would scan every value in the frame.)
    df = df.rename(index = matched_ids)

    # Sort values: normal samples are the IDs ending in a literal '.N'
    is_normal = df.index.str.endswith('.N')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~ is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    all_df = pd.concat([tumor, normal])

In [19]:
# EC
# Build the EC table: rows are samples (tumor first, then normal), columns are (Proteins, Database_ID).
# Run only when EC is the selected cancer type.
# (The previous guard compared file_name with itself, which is always True.)
if ca == 'EC':
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep only the sixth '|'-separated field of each identifier (used as the protein label)
    df['Index'] = df['Index'].apply(lambda identifier: identifier.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # rows become samples, columns become proteins
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop the ReferenceIntensity row by label rather than by position
    df.index.name = 'Patient_ID'

    drop_cols = ['NX1', 'NX2', 'NX3', 'NX4', 'NX5', 'NX6', 'NX7', 'NX8', 'NX9', 'NX12',
       'NX17', 'NX13', 'NX14', 'NX10', 'NX16', 'NX18', 'NX11', 'NX15',
       'RefInt_pool01', 'RefInt_pool02', 'RefInt_pool03', 'RefInt_pool04',
       'RefInt_pool05', 'RefInt_pool06', 'RefInt_pool07', 'RefInt_pool08',
       'RefInt_pool09', 'RefInt_pool10', 'RefInt_pool11', 'RefInt_pool12',
       'RefInt_pool13', 'RefInt_pool14', 'RefInt_pool15', 'RefInt_pool16',
       'RefInt_pool17']

    # Drop quality control and ref intensity rows
    df = df.drop(index = drop_cols)

    # Get Patient_IDs
    # slice mapping_df to include cancer specific aliquot_IDs
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

    # Create dictionary with aliquot_ID as keys and patient_ID as values
    matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

    # Relabel only the row index; labels without a mapping are left unchanged.
    # (DataFrame.replace would scan every value in the frame.)
    df = df.rename(index = matched_ids)

    # Sort values: normal samples are the IDs ending in a literal '.N'
    is_normal = df.index.str.endswith('.N')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~ is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    all_df = pd.concat([tumor, normal])

In [15]:
# LUAD
# Build the LUAD table: rows are samples (tumor first, then normal), columns are (Proteins, Database_ID).
# Run only when LUAD is the selected cancer type.
# (The previous guard compared file_name with itself, which is always True.)
if ca == 'LUAD':
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep only the sixth '|'-separated field of each identifier (used as the protein label)
    df['Index'] = df['Index'].apply(lambda identifier: identifier.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # rows become samples, columns become proteins
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop the ReferenceIntensity row by label rather than by position
    df.index.name = 'Patient_ID'

    drop_cols = ['TumorOnlyIR01', 'NormalOnlyIR02', 'TumorOnlyIR03', 'NormalOnlyIR04',
       'CPT0148080004.1','NormalOnlyIR', 'TumorOnlyIR14',
       'TaiwaneseIR19', 'TumorOnlyIR21', 'TaiwaneseIR22', 'CPT0146580004.1',
       'NormalOnlyIR25', 'RefInt_pool01', 'RefInt_pool02', 'RefInt_pool03',
       'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06', 'RefInt_pool07',
       'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10', 'RefInt_pool11',
       'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14', 'RefInt_pool15',
       'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18', 'RefInt_pool19',
       'RefInt_pool20', 'RefInt_pool21', 'RefInt_pool22', 'RefInt_pool23',
       'RefInt_pool24', 'RefInt_pool25']

    # Drop quality control and ref intensity rows
    df = df.drop(index = drop_cols)

    # Get Patient_IDs
    # slice mapping_df to include cancer specific aliquot_IDs
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

    # Create dictionary with aliquot_ID as keys and patient_ID as values
    matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

    # Relabel only the row index; labels without a mapping are left unchanged.
    # (DataFrame.replace would scan every value in the frame.)
    df = df.rename(index = matched_ids)

    # Sort values: normal samples are the IDs ending in a literal '.N'
    is_normal = df.index.str.endswith('.N')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~ is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    all_df = pd.concat([tumor, normal])

In [9]:
# LSCC
# Build the LSCC table: rows are samples (tumor first, then normal), columns are (Proteins, Database_ID).
# Run only when LSCC is the selected cancer type.
# (The previous guard compared file_name with itself, which is always True.)
if ca == 'LSCC':
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep only the sixth '|'-separated field of each identifier (used as the protein label)
    df['Index'] = df['Index'].apply(lambda identifier: identifier.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # rows become samples, columns become proteins
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop the ReferenceIntensity row by label rather than by position
    df.index.name = 'Patient_ID'

    drop_cols = ['LUAD-Global-CR-pool1', 'LSCC-Tumor-ONLY-CR', 'JHU-HNSCC-CR',
       'LUAD-Global-CR-pool-1', 'LSCC-Tumor-ONLY-CR.1', 'JHU-HNSCC-CR.1',
       'LUAD-Global-CR-pool-2', 'JHU-HNSCC-CR.2', 'RefInt_LSCC-Global-CR',
       'RefInt_LSCC-Global-CR.1', 'RefInt_LSCC-Global-CR.2',
       'RefInt_LSCC-Global-CR.3', 'RefInt_LSCC-Global-CR.4',
       'RefInt_LSCC-Global-CR.5', 'RefInt_LSCC-Global-CR.6',
       'RefInt_LSCC-Global-CR.7', 'RefInt_LSCC-Global-CR.8',
       'RefInt_LSCC-Global-CR.9', 'RefInt_LSCC-Global-CR.10',
       'RefInt_LSCC-Global-CR.11', 'RefInt_LSCC-Global-CR.12',
       'RefInt_LSCC-Global-CR.13', 'RefInt_LSCC-Global-CR.14',
       'RefInt_LSCC-Global-CR.15', 'RefInt_LSCC-Global-CR.16',
       'RefInt_LSCC-Global-CR.17', 'RefInt_LSCC-Global-CR.18',
       'RefInt_LSCC-Global-CR.19', 'RefInt_LSCC-Global-CR.20',
       'RefInt_LSCC-Global-CR.21']

    # Drop quality control and ref intensity rows
    df = df.drop(index = drop_cols)

    # Get Patient_IDs
    # slice mapping_df to include cancer specific aliquot_IDs
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

    # Create dictionary with aliquot_ID as keys and patient_ID as values
    matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

    # Relabel only the row index; labels without a mapping are left unchanged.
    # (DataFrame.replace would scan every value in the frame.)
    df = df.rename(index = matched_ids)

    # Sort values: normal samples are the IDs ending in a literal '.N'
    is_normal = df.index.str.endswith('.N')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~ is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    all_df = pd.concat([tumor, normal])

In [6]:
#gbm
# Build the GBM table: rows are samples (tumor first, then normal), columns are (Proteins, Database_ID).
# Run only when GBM is the selected cancer type.
# (The previous guard compared file_name with itself, which is always True.)
if ca == 'GBM':
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep only the sixth '|'-separated field of each identifier (used as the protein label)
    df['Index'] = df['Index'].apply(lambda identifier: identifier.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # rows become samples, columns become proteins
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop the ReferenceIntensity row by label rather than by position
    df.index.name = 'Patient_ID'

    drop_cols = ['RefInt_01Pool', 'RefInt_02Pool', 'RefInt_03Pool', 'RefInt_04Pool',
                 'RefInt_05Pool', 'RefInt_06Pool', 'RefInt_07Pool', 'RefInt_08Pool',
                 'RefInt_09Pool', 'RefInt_10Pool', 'RefInt_11Pool']

    # Drop quality control and ref intensity rows
    df = df.drop(index = drop_cols)

    # Get Patient_IDs
    # slice mapping_df to include cancer specific aliquot_IDs
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

    # Create dictionary with aliquot_ID as keys and patient_ID as values
    matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

    # Relabel only the row index; labels without a mapping are left unchanged.
    # (DataFrame.replace would scan every value in the frame.)
    df = df.rename(index = matched_ids)

    # Sort values: in this cell the normal samples are the IDs starting with 'PT-'
    is_normal = df.index.str.startswith('PT-')
    normal = df.loc[is_normal].sort_index()
    normal.index = normal.index +'.N' # append .N to normal IDs
    tumor = df.loc[~ is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    all_df = pd.concat([tumor, normal])

In [7]:
# Show the combined table (tumor rows followed by normal rows) left in all_df
# by whichever cancer-specific cell above was run last.
all_df

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,DBNDD1-201,HS3ST1-201,CYP51A1-201,USP28-201,...,ETNK1-207,AP1S2-209,EED-213,DDHD1-211,WIZ-210,ZBTB3-204,CTNND1-240,WIZ-212,WIZ-211,MSANTD2-209
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000003249.14,ENSG00000002587.10,ENSG00000001630.17,ENSG00000048028.11,...,ENSG00000139163.16,ENSG00000182287.15,ENSG00000074266.21,ENSG00000100523.16,ENSG00000011451.21,ENSG00000185670.9,ENSG00000198561.15,ENSG00000011451.21,ENSG00000011451.21,ENSG00000120458.12
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C3L-00104,-0.254733,-0.138938,0.662227,-0.074868,0.213529,0.123398,2.105458,,-0.660264,,...,-0.047874,-0.446007,0.070101,0.160090,0.129282,-0.166678,,-0.073448,-0.029020,-0.009463
C3L-00365,-0.138512,-0.824520,0.494419,0.043783,-0.001394,0.101477,-0.287232,,-0.402679,0.059975,...,0.350165,-0.844985,-0.234406,-0.448783,0.078407,,-0.174780,1.806950,0.033808,0.180578
C3L-00674,-0.351464,-0.120197,-0.084371,-0.260275,0.096193,-0.360274,0.227499,1.217058,-0.165751,0.010124,...,-0.167319,-0.096328,-0.118506,-0.107690,0.177530,,-1.513869,0.065973,-0.126149,0.465241
C3L-00677,-0.062869,0.094198,0.391070,-0.030638,0.742258,-0.417291,-0.013377,,-0.176649,0.535304,...,0.179200,0.320945,-0.027522,0.104278,0.049948,-0.590267,0.162686,1.964570,0.161229,0.283810
C3L-01040,-0.365351,0.070523,-0.472543,-0.255288,0.096844,0.356271,1.182940,,-0.307430,,...,0.098253,-0.298907,-0.201144,0.440215,0.110757,0.119013,,0.030719,0.066426,0.189187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PT-RN5K.N,0.582456,-0.356730,0.236333,0.658617,0.719931,-0.760723,1.562912,,-1.158580,-0.247034,...,-0.183210,-0.723977,-0.339088,0.973199,-0.753209,0.211861,-0.539658,,-0.530017,0.614323
PT-RU72.N,0.522376,0.226891,-1.059051,0.558740,0.893739,-0.383723,1.652233,,-0.649935,-0.504037,...,0.237765,-0.559902,-0.849461,1.305892,-0.706170,,-0.597932,-1.889779,-0.492358,-0.534980
PT-UTHO.N,0.595553,0.194616,-0.045162,0.546508,0.810285,-0.868148,0.721085,,-0.840286,-0.267181,...,0.018208,0.813018,-0.029398,1.355047,-0.600093,,0.066284,-0.095237,-0.286616,-0.090284
PT-WVLH.N,0.958226,-0.057340,,0.635282,1.143518,-1.243729,,,-0.185783,-0.402786,...,0.151603,0.380411,-0.758021,0.753353,-0.712901,-0.899465,-0.510523,,-1.241097,0.191439
