In [2]:
import pandas as pd
import numpy as np
import warnings
from functools import reduce
import re

In [3]:
# average replicates
def average_replicates(df, common = r'\.', to_drop = r'\.\d$'):
    """Collapse replicate rows into a single averaged row per patient.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds string sample labels. It is not modified.
    common : str
        Regex matched against the index; every label it matches is treated
        as a replicate row.
    to_drop : str
        Regex removed from a label to obtain the patient ID shared by all of
        that patient's replicates.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the replicate rows are gone and each affected
        patient ID carries the column-wise mean of its rows. An existing row
        labelled with the bare patient ID takes part in the mean and is
        overwritten in place; otherwise the averaged row is added at the end.
    """
    labels = df.index
    is_replicate = labels.str.contains(common)
    # Patient ID of every row, obtained by stripping the replicate suffix.
    base_ids = labels.str.replace(to_drop, '', regex=True)
    replicate_ids = base_ids[is_replicate].unique()

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()
    for patient_ID in replicate_ids:
        # Exact match on the stripped ID: a substring/regex search would also
        # pull in unrelated patients whose label merely contains this ID.
        members = df[base_ids == patient_ID]
        result.loc[patient_ID] = list(members.mean(axis=0))

    # Drop the unaveraged replicate rows, but always keep the averaged rows,
    # even when a patient ID itself happens to match `common`.
    keep = ~result.index.str.contains(common) | result.index.isin(replicate_ids)
    return result[keep]

In [4]:
# Cancer-type labels; the entry picked below decides which report file is loaded.
cancer_names = [
    'GBM', 'LSCC', 'LUAD', 'EC', 'ccRCC',
    'BR', 'HNSCC', 'OV', 'PDA', 'HCC',
] #CO
ca = cancer_names[8]  # position 8 in the list above is 'PDA'
file_name = f"{ca}_Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv"
file_path = f"../../../proteomics/prot_names/{file_name}"

# Table used later to translate aliquot_IDs into patient_IDs
mapping_df = pd.read_csv(
    '../../../input/aliquot_to_patient_ID.tsv',
    delimiter = '\t',
    index_col = 0,
)

In [48]:
# Show the mapping row(s) recorded for one specific aliquot.
mapping_df[mapping_df['aliquot_ID'].eq('CPT0347760002')]

Unnamed: 0,aliquot_ID,patient_ID


In [61]:
# HCC: load the abundance report and express every sample relative to the reference

if file_name == 'HCC_Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv':
    df = pd.read_csv(file_path, sep = "\t")

    # 'Index' holds '|'-separated identifier strings:
    # field 0 becomes the protein identifier, field 6 the protein name.
    df['Database_ID'] = [entry.split('|')[0] for entry in df['Index']]
    df['Name'] = [entry.split('|')[6] for entry in df['Index']]

    # Proteins become a (Name, Database_ID) column MultiIndex; samples become rows.
    df = (
        df.set_index(['Name', 'Database_ID'])
          .drop(columns = ['Index', 'MaxPepProb', 'NumberPSM', 'Gene'])
          .transpose()
    )

    # Subtract the reference-intensity row from every row, protein by protein.
    ref_intensities = df.loc["ReferenceIntensity"]
    df = df.subtract(ref_intensities, axis="columns")
    df.index.name = 'Patient_ID'

    # Quality-control / reference rows to remove: RefInt_MIX-01 .. RefInt_MIX-33
    # plus the reference-intensity row itself.
    drop_cols = [f'RefInt_MIX-{pool:02d}' for pool in range(1, 34)]
    drop_cols.append('ReferenceIntensity')
    df = df.drop(index = drop_cols).sort_values(by=["Patient_ID"])
    #self._data["proteomics"] = df


In [32]:
# PDA
# NOTE(review): this cell reads and writes `self._data` / `self._helper_tables`,
# but no `self` is defined in the visible cells; it looks like code drafted for
# a loader class. The file name tested below also lacks the 'PDA_' prefix that
# `file_name` is built with in the configuration cell. Confirm before running.

if file_name == 'Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv':
    df = pd.read_csv(file_path, sep = "\t")
    # 'Index' holds '|'-separated identifier strings.
    df['Database_ID'] = df.Index.apply(lambda x: x.split('|')[0]) # field 0: protein identifier
    df['Name'] = df.Index.apply(lambda x: x.split('|')[6]) # field 6: protein name
    df = df.set_index(['Name', 'Database_ID']) # set multiindex
    df = df.drop(columns = ['Index', 'MaxPepProb', 'NumberPSM', 'Gene']) # drop unnecessary columns
    df = df.transpose()
    ref_intensities = df.loc["ReferenceIntensity"] # reference intensities used to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # subtract reference intensities from all the values
    df.index.name = 'Patient_ID'

    # Drop quality control and ref intensity rows
    drop_cols = ['ReferenceIntensity', 'QC1', 'QC2', 'QC3', 'QC4', 'QC5', 'QC6', 'KoreanReference1',
       'KoreanReference2', 'KoreanReference3', 'Pool-24-2', 'WU-PDA1', 'WU-Pool-25']
    df = df.drop(drop_cols, axis = 'index')
    self._data["proteomics"] = df

elif file_name == "aliquot_to_patient_ID.tsv":
    df = pd.read_csv(file_path, sep = "\t")
    self._helper_tables["map_ids"] = df

#### add after for loop

# Proteomics
# Get Patient_IDs
# slice mapping_df to include cancer specific aliquot_IDs
prot = self._data["proteomics"]
mapping_df = self._helper_tables["map_ids"]
index_list = list(prot.index)
cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]
# Dictionary with aliquot_ID as keys and patient_ID as values
matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))
# Relabel only the index; labels without a mapping entry are left unchanged.
prot = prot.rename(index = matched_ids)

# Sort values: tumor rows first, then normal rows (labels ending in '.N')
is_normal = prot.index.str.contains(r'\.N$', regex = True)
normal = prot.loc[is_normal].sort_values(by=["Patient_ID"])
tumor = prot.loc[~ is_normal].sort_values(by=["Patient_ID"])
all_prot = pd.concat([tumor, normal]) # DataFrame.append no longer exists in pandas 2.x
self._data["proteomics"] = all_prot

In [67]:
# PDA

if file_name == 'PDA_Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv':
    df = pd.read_csv(file_path, sep = "\t")
    # 'Index' holds '|'-separated identifier strings.
    df['Database_ID'] = df.Index.apply(lambda x: x.split('|')[0]) # field 0: protein identifier
    df['Name'] = df.Index.apply(lambda x: x.split('|')[6]) # field 6: protein name
    df = df.set_index(['Name', 'Database_ID']) # set multiindex
    df = df.drop(columns = ['Index', 'MaxPepProb', 'NumberPSM', 'Gene']) # drop unnecessary columns
    df = df.transpose()
    ref_intensities = df.loc["ReferenceIntensity"] # reference intensities used to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # subtract reference intensities from all the values
    df.index.name = 'Patient_ID'

    # Drop quality control and ref intensity rows
    drop_cols = ['ReferenceIntensity', 'QC1', 'QC2', 'QC3', 'QC4', 'QC5', 'QC6', 'KoreanReference1',
       'KoreanReference2', 'KoreanReference3', 'Pool-24-2', 'WU-PDA1', 'WU-Pool-25']
    df = df.drop(drop_cols, axis = 'index')
    #self._data["proteomics"] = df

# Proteomics
# Get Patient_IDs
# slice mapping_df to include cancer specific aliquot_IDs
prot = df
index_list = list(prot.index)
cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

# Dictionary with aliquot_ID as keys and patient_ID as values
matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))
# Relabel only the index; labels without a mapping entry are left unchanged.
prot = prot.rename(index = matched_ids)

# Sort values: tumor rows first, then normal rows (labels ending in '.N')
is_normal = prot.index.str.contains(r'\.N$', regex = True)
normal = prot.loc[is_normal].sort_values(by=["Patient_ID"])
tumor = prot.loc[~ is_normal].sort_values(by=["Patient_ID"])
all_prot = pd.concat([tumor, normal]) # DataFrame.append no longer exists in pandas 2.x
#self._data["proteomics"] = all_prot

In [68]:
# Row labels of the combined tumor + normal table, as a plain list.
list(all_prot.index)

['C3L-00017',
 'C3L-00102',
 'C3L-00189',
 'C3L-00277',
 'C3L-00401',
 'C3L-00589',
 'C3L-00598',
 'C3L-00599',
 'C3L-00622',
 'C3L-00625',
 'C3L-00640',
 'C3L-00819',
 'C3L-00881',
 'C3L-00928',
 'C3L-01031',
 'C3L-01036',
 'C3L-01037',
 'C3L-01051',
 'C3L-01052',
 'C3L-01053',
 'C3L-01054',
 'C3L-01124',
 'C3L-01328',
 'C3L-01453',
 'C3L-01598',
 'C3L-01637',
 'C3L-01662',
 'C3L-01687',
 'C3L-01689',
 'C3L-01703',
 'C3L-01971',
 'C3L-02109',
 'C3L-02112',
 'C3L-02115',
 'C3L-02116',
 'C3L-02118',
 'C3L-02463',
 'C3L-02604',
 'C3L-02606',
 'C3L-02610',
 'C3L-02613',
 'C3L-02701',
 'C3L-02809',
 'C3L-02890',
 'C3L-02897',
 'C3L-02899',
 'C3L-03123',
 'C3L-03129',
 'C3L-03356',
 'C3L-03371',
 'C3L-03388',
 'C3L-03394',
 'C3L-03395',
 'C3L-03628',
 'C3L-03630',
 'C3L-03632',
 'C3L-03635',
 'C3L-03639',
 'C3L-03743',
 'C3L-04027',
 'C3L-04072',
 'C3L-04080',
 'C3L-04473',
 'C3L-04475',
 'C3L-04479',
 'C3L-04495',
 'C3L-04848',
 'C3L-04853',
 'C3N-00198',
 'C3N-00249',
 'C3N-00302',
 'C3N-

In [49]:
#all_df
# List the index labels of `df` that do not contain the substring 'CPT'.
df.index[~ df.index.str.contains('CPT')]



Index(['C3L-01687-04', 'C3N-01900-04', 'C3L-00189-04', 'C3L-03632-04',
       'C3N-02573-03', 'C3N-00709-06', 'C3L-00401-04'],
      dtype='object', name='Patient_ID')

In [19]:
# Display the current contents of `df`.
df

Name,ARF5,M6PR,ESRRA,FKBP4,NDUFAF7,FUCA2,HS3ST1,SEMA3F,CFTR,CYP51A1,...,ANK2,ATAD3B,EFCAB14,MYO6,EED,PCSK5,WIZ,RFX7,DPH1,SVIL
Database_ID,ENSP00000000233.5,ENSP00000000412.3,ENSP00000000442.6,ENSP00000001008.4,ENSP00000002125.4,ENSP00000002165.5,ENSP00000002596.5,ENSP00000002829.3,ENSP00000003084.6,ENSP00000003100.8,...,ENSP00000499982.1,ENSP00000500094.1,ENSP00000500581.1,ENSP00000500710.1,ENSP00000500914.1,ENSP00000500971.1,ENSP00000501300.1,ENSP00000501317.1,ENSP00000501368.1,ENSP00000501521.1
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
CPT0237840004,-0.191905,-0.049264,,0.123413,0.285921,0.201569,2.247712,0.582647,-0.487560,-0.113246,...,-0.539118,0.437113,0.176931,-0.372522,-0.096018,-0.421189,0.212116,,-0.215347,-1.896737
CPT0063990003,0.377384,-0.198055,,0.111746,0.559030,0.115949,-0.979133,0.061473,-1.064543,1.123007,...,0.421515,0.200130,0.006205,0.718279,-0.031261,2.774100,0.116756,,-0.084838,0.044490
CPT0166780004,0.202656,-0.428741,,0.118646,-0.132202,-0.736673,-0.626063,-0.498545,1.195598,0.336852,...,0.115630,0.311452,0.903435,-0.129404,-0.121978,-0.271093,0.155306,,0.451438,-1.422714
C3L-01687-04,0.218555,-0.404145,,-0.040822,0.412874,0.369709,-1.077224,-0.176325,-1.193372,-0.159018,...,0.586519,0.415732,-0.416118,0.660220,0.227264,2.535500,0.037520,,-0.179535,1.229343
CPT0166760004,0.137679,0.248505,,0.001393,-0.353165,-0.848973,-0.034003,0.546539,-0.980916,0.066646,...,-0.797131,0.144646,-0.501934,0.331757,0.119268,-0.633015,0.041884,,-0.297304,-1.314271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CPT0347760002,0.324060,0.182340,,0.286196,0.467876,0.379787,,,1.113918,1.123431,...,-0.068580,0.212202,0.828121,0.024606,-0.142292,,-0.521347,,0.630659,-1.501250
CPT0347790002,0.440757,-0.093522,,0.133985,0.369492,-0.143316,,,0.832406,0.958975,...,0.051463,-0.379740,0.557458,0.044320,-0.071018,,-0.322914,,0.235697,-1.540546
CPT0347820002,0.436447,-0.087110,,0.417974,0.262299,0.285969,,,0.608726,1.302991,...,-0.449568,0.805649,0.644360,0.098087,-0.254621,,-0.318773,,0.534944,-0.847774
CPT0347850002,0.294544,-0.197739,,0.153526,-0.036083,-0.829970,,,1.159943,0.959875,...,-0.309308,0.022943,1.094261,0.207828,-0.283773,,-0.232183,,0.063182,-0.578346


In [7]:
# OV: annotation table (used later to translate specimen labels) and report location
ov_map = pd.read_csv(
    '../../../input/OV_sample_TMT_annotation_UMich_GENCODE34_0315.csv',
    delimiter = ',',
    index_col = 0,
)

file_name = 'OVCP_Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv'
file_path = f'../../../input/{file_name}'

In [37]:
# Load the report named by `file_path` and convert it to reference-relative values.
# (The former `if file_name == file_name:` guard was always true and has been removed.)
df = pd.read_csv(file_path, sep = "\t")
df = df.drop(columns = ['MaxPepProb', 'NumberPSM']) #index is protein identifier (duplicate)
df.Index = df.Index.apply(lambda x: x.split('|')[5]) # keep field 5 of the '|'-separated identifier string
df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
df = df.transpose()
ref_intensities = df.loc["ReferenceIntensity"] # reference intensities used to calculate ratios
df = df.subtract(ref_intensities, axis="columns") # subtract reference intensities from all the values, to get ratios
df = df.iloc[1:,:] # drops the first row; assumes that row is ReferenceIntensity
df.index.name = 'Patient_ID'
len(df.index)

120

In [38]:
# Keep only the rows whose label does not contain 'JHU'. A boolean mask is used
# directly: selecting by the filtered labels would repeat rows if a label
# occurred more than once.
df = df.loc[~ df.index.str.contains('JHU', regex = True)]
len(df.index)

103

In [36]:
    # The quoted block below is a bare string expression (never assigned), so it
    # has no effect; it holds an alternative list of labels.
    # NOTE(review): this cell is indented but ends with an unindented statement;
    # it presumably relies on the notebook stripping the leading indent — confirm.
    '''
    drop_cols = ['JHU-QC', 'JHU-QC.1', 'JHU-QC.2', 'JHU-QC.3', 'JHU-QC.4',
       'RefInt_PNNL-JHU-Ref-1', 'RefInt_PNNL-JHU-Ref-2',
       'RefInt_PNNL-JHU-Ref-3', 'RefInt_PNNL-JHU-Ref-4',
       'RefInt_PNNL-JHU-Ref-5', 'RefInt_PNNL-JHU-Ref-6',
       'RefInt_PNNL-JHU-Ref-7', 'RefInt_PNNL-JHU-Ref-8',
       'RefInt_PNNL-JHU-Ref-9', 'RefInt_PNNL-JHU-Ref-10',
       'RefInt_PNNL-JHU-Ref-11', 'RefInt_PNNL-JHU-Ref-12']'''
    
    # Drop quality control and ref intensity cols
    # (this cell only defines the label list and reports its length)
    drop_cols = ['JHU-QC1', 'JHU-QC12', 'JHU-QC2', 'JHU-QC4', 'JHU-QC6', 'RefInt_PNNL-JHU-Ref-1-R', 
                'RefInt_PNNL-JHU-Ref-10-R', 'RefInt_PNNL-JHU-Ref-11-R', 'RefInt_PNNL-JHU-Ref-12-R', 
                'RefInt_PNNL-JHU-Ref-2-R', 'RefInt_PNNL-JHU-Ref-3-R', 'RefInt_PNNL-JHU-Ref-4-R', 
                'RefInt_PNNL-JHU-Ref-5-R', 'RefInt_PNNL-JHU-Ref-6-R', 'RefInt_PNNL-JHU-Ref-7-R', 
                'RefInt_PNNL-JHU-Ref-8-R', 'RefInt_PNNL-JHU-Ref-9-R']
len(drop_cols)
    

17

In [21]:
    # Drop quality control and ref intensity cols
    #print(df.index.to_list())
    #df = df.drop(drop_cols, axis = 'index')
    # Get Patient_IDs: dictionary with specimen as keys and sample as values
    matched_ids = dict(zip(ov_map['specimen'], ov_map['sample']))
    # Relabel only the index; labels without a mapping entry are left unchanged.
    df = df.rename(index = matched_ids)
    df.index = df.index.str.replace('-T$','', regex =True)
    df.index = df.index.str.replace('-N$','.N', regex =True)

    # Sort: tumor rows first, then normal rows (labels ending in '.N')
    is_normal = df.index.str.contains(r'\.N$', regex =True)
    normal = df.loc[is_normal].sort_values(by=["Patient_ID"])
    tumor = df.loc[~ is_normal].sort_values(by=["Patient_ID"])

    all_df = pd.concat([tumor, normal]) # DataFrame.append no longer exists in pandas 2.x

KeyError: "Index(['0c87932d-b6fd-4eb5-b86a-a34d07_D5', 'JHU-QC1',\n       '7069e990-4a4e-4984-a66d-4d835a_D5',\n       '41d0619e-b299-42e2-ac32-29e784_D5',\n       '7c7ce2d7-6c8a-424e-8e6c-d14ad1_D5',\n       'cfd76f9a-ce60-4ec7-8e6b-a8fc31_D2',\n       '1abbcb28-f221-4cd4-8c01-37b9bb_D1_D5',\n       'f6c215ea-5381-4521-bb61-be8e97_D5',\n       'bd266bdd-12e4-42cb-9c49-911bfc_D5', 'JHU-QC2',\n       ...\n       '2a237fd4-f966-4563-bab1-61dbd7_D2',\n       '42d860a3-8a0c-4c44-a09a-9ad8cf_D5',\n       '6c9a5bdc-0f0e-4cc8-aec5-63091c_D2',\n       'bd22174a-86c1-489f-bcb7-93e12b_D5',\n       '15524912-685a-47ce-a824-567848_D5',\n       '1c9145ad-1981-4871-aa7e-8b11de_D5',\n       'efc971f8-54c5-4531-a792-705ecf_D5',\n       'e3ebe010-cd31-4b5d-8fcb-b7103b_D5', 'JHU-QC12',\n       'd945689a-dcf3-44f4-a42f-1ded6d_D2'],\n      dtype='object', name='Patient_ID', length=108) not in index"

In [17]:
# Row labels of the combined table, as a plain list.
list(all_df.index)

['01OV007',
 '01OV017',
 '01OV018',
 '01OV023',
 '01OV026',
 '01OV029',
 '01OV030',
 '01OV039',
 '01OV041',
 '01OV047',
 '02OV001',
 '02OV005',
 '02OV006',
 '02OV008',
 '02OV015',
 '02OV022',
 '02OV023',
 '02OV029',
 '02OV032',
 '02OV036',
 '02OV041',
 '02OV044',
 '02OV046',
 '04OV001',
 '04OV004',
 '04OV005',
 '04OV008',
 '04OV011',
 '04OV012',
 '04OV013',
 '04OV017',
 '04OV018',
 '04OV021',
 '04OV023',
 '04OV024',
 '04OV027',
 '04OV028',
 '04OV031',
 '04OV033',
 '04OV036',
 '04OV037',
 '04OV039',
 '04OV040',
 '04OV044',
 '04OV045',
 '04OV048',
 '04OV049',
 '04OV050',
 '04OV051',
 '04OV053',
 '04OV054',
 '04OV055',
 '04OV057',
 '04OV058',
 '04OV063',
 '11OV002',
 '13OV003',
 '14OV011',
 '15OV001',
 '17OV001',
 '17OV002',
 '17OV010',
 '17OV011',
 '17OV013',
 '17OV014',
 '17OV015',
 '17OV017',
 '17OV018',
 '17OV025',
 '17OV027',
 '17OV028',
 '17OV029',
 '17OV030',
 '17OV033',
 '17OV036',
 '17OV039',
 '17OV040',
 '18OV001',
 '20OV005',
 '26OV002',
 '26OV008',
 '26OV009',
 '26OV011',
 '26

In [63]:
# Show the row(s) labelled exactly '17OV002', keeping every column.
all_df.loc[['17OV002'], :]

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,DBNDD1-201,HS3ST1-201,SEMA3F-201,CFTR-201,...,EFCAB14-208,MYO6-218,EED-213,STAT1-216,ZBTB3-204,PRX-205,WIZ-211,TSGA10-219,RFX7-204,HSPA12A-210
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000003249.14,ENSG00000002587.10,ENSG00000001617.12,ENSG00000001626.16,...,ENSG00000159658.13,ENSG00000196586.16,ENSG00000074266.21,ENSG00000115415.20,ENSG00000185670.9,ENSG00000105227.15,ENSG00000011451.21,ENSG00000135951.16,ENSG00000181827.15,ENSG00000165868.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
15OV001,0.032609,-0.320931,-0.29896,-0.687607,-0.099961,-0.053834,0.333935,2.605819,0.202834,,...,0.41684,0.079281,-0.023569,-0.37608,,,-0.346872,,-0.487372,0.69412


In [64]:
# Reload the report and rebuild the reference-relative table from scratch.
df = pd.read_csv(file_path, sep = "\t").drop(columns = ['MaxPepProb', 'NumberPSM'])

# Keep only field 5 of each '|'-separated identifier string.
df['Index'] = [entry.split('|')[5] for entry in df['Index']]

# Proteins become a (Proteins, Database_ID) column MultiIndex; samples become rows.
df = (
    df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
      .set_index(['Proteins', 'Database_ID'])
      .transpose()
)

# Subtract the reference-intensity row from every row, then discard the first
# row (the original comment identifies it as the ReferenceIntensity row).
ref_intensities = df.loc["ReferenceIntensity"]
df = df.subtract(ref_intensities, axis="columns").iloc[1:, :]
df.index.name = 'Patient_ID'

In [65]:
# Select the rows of `df` whose index label contains '17OV002'.
df.loc[df.index[df.index.str.contains('17OV002')]]

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,DBNDD1-201,HS3ST1-201,SEMA3F-201,CFTR-201,...,EFCAB14-208,MYO6-218,EED-213,STAT1-216,ZBTB3-204,PRX-205,WIZ-211,TSGA10-219,RFX7-204,HSPA12A-210
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000003249.14,ENSG00000002587.10,ENSG00000001617.12,ENSG00000001626.16,...,ENSG00000159658.13,ENSG00000196586.16,ENSG00000074266.21,ENSG00000115415.20,ENSG00000185670.9,ENSG00000105227.15,ENSG00000011451.21,ENSG00000135951.16,ENSG00000181827.15,ENSG00000165868.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
15OV001,0.082198,-0.420185,,-0.668637,-0.10065,-0.078726,0.333935,2.605819,,,...,0.729744,0.158871,-0.000748,0.015541,,,-0.196916,,-0.487372,0.183542
15OV001.1,-0.016981,-0.221676,-0.29896,-0.706577,-0.099273,-0.028941,,,0.202834,,...,0.103936,-0.00031,-0.046389,-0.767702,,,-0.496828,,,1.204697


In [52]:
# Display the combined table built by the sorting step.
all_df

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,DBNDD1-201,HS3ST1-201,SEMA3F-201,CFTR-201,...,EFCAB14-208,MYO6-218,EED-213,STAT1-216,ZBTB3-204,PRX-205,WIZ-211,TSGA10-219,RFX7-204,HSPA12A-210
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000003249.14,ENSG00000002587.10,ENSG00000001617.12,ENSG00000001626.16,...,ENSG00000159658.13,ENSG00000196586.16,ENSG00000074266.21,ENSG00000115415.20,ENSG00000185670.9,ENSG00000105227.15,ENSG00000011451.21,ENSG00000135951.16,ENSG00000181827.15,ENSG00000165868.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
01OV007,-0.527655,-0.380903,,-1.311341,-0.086341,0.025977,1.109507,-0.240346,,,...,0.231411,-0.154156,0.198634,-0.872461,,,-0.252536,,0.276181,1.647959
01OV017,0.320065,0.019606,1.426429,-0.807435,-0.104763,-0.253329,,,,-2.305889,...,-0.157979,-0.163544,-0.348380,-3.261571,,,-0.572754,,-0.059106,1.694668
01OV018,0.189254,-0.152288,-0.767595,-0.530582,-0.053460,0.041271,,,-0.423595,,...,-0.460277,0.083976,0.104768,0.328122,,,0.759240,,,0.786228
01OV023,0.158305,-0.018960,0.361530,-0.666655,-0.278404,-0.135457,-0.115583,,,,...,-0.411445,-0.222297,0.566934,-0.578624,,,-0.458496,,-0.020231,1.349137
01OV026,-0.268311,-0.071799,,-1.034479,-0.280523,0.098167,-0.056371,,0.162327,,...,-0.049722,0.315695,-0.122322,1.319497,,,-0.120384,-0.846895,,0.465927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14OV029.N,-0.548040,-0.308129,,-0.925171,-0.075288,0.074260,-0.157462,,,,...,-0.217788,-0.095648,0.194189,0.017896,0.839814,,-0.206143,,,1.205240
17OV001.N,0.655320,0.140041,-0.187128,0.569817,0.367049,0.743340,,,0.579353,0.023745,...,0.087221,0.722498,0.296255,0.683179,,,-0.111355,,-0.220782,-0.425013
17OV003.N,-0.534742,-0.524127,-0.098345,-1.208964,0.061149,-0.011026,0.390200,,,,...,-0.028314,-0.123066,0.065659,-1.528563,,,-0.460306,,-0.711129,1.039328
17OV004.N,0.280177,0.090590,,-0.976109,-0.281006,0.086104,,,-0.562525,,...,0.051253,-0.133336,-0.211110,1.116495,,,-0.228061,,-0.968338,0.367660


In [55]:
# Rows of the combined table whose label contains '17OV002'.
all_df.loc[all_df.index.str.contains('17OV002')]

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,DBNDD1-201,HS3ST1-201,SEMA3F-201,CFTR-201,...,EFCAB14-208,MYO6-218,EED-213,STAT1-216,ZBTB3-204,PRX-205,WIZ-211,TSGA10-219,RFX7-204,HSPA12A-210
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000003249.14,ENSG00000002587.10,ENSG00000001617.12,ENSG00000001626.16,...,ENSG00000159658.13,ENSG00000196586.16,ENSG00000074266.21,ENSG00000115415.20,ENSG00000185670.9,ENSG00000105227.15,ENSG00000011451.21,ENSG00000135951.16,ENSG00000181827.15,ENSG00000165868.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
17OV002,0.571243,0.430154,,-0.116476,0.040333,0.316,,,0.843701,,...,0.179753,0.003628,-0.031299,0.563513,0.902538,0.735546,0.145854,,0.031075,-0.228774


In [22]:
# HNSCC
# (The former `if file_name == file_name:` guard was always true and has been removed.)
df = pd.read_csv(file_path, sep = "\t")
df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
df.Index = df.Index.apply(lambda x: x.split('|')[5]) # keep field 5 of the '|'-separated identifier string
df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
df = df.transpose()
ref_intensities = df.loc["ReferenceIntensity"] # reference intensities used to calculate ratios
df = df.subtract(ref_intensities, axis="columns") # subtract reference intensities from all the values, to get ratios
df = df.iloc[1:,:] # drops the first row; assumes that row is ReferenceIntensity
df.index.name = 'Patient_ID'

drop_cols = ['128C', 'QC2', 'QC3', 'QC4', '129N', 'LungTumor1', 'Pooled-sample14',
   'LungTumor2', 'QC6', 'LungTumor3', 'Pooled-sample17', 'QC7',
   'Pooled-sample19', 'QC9', 'RefInt_pool01', 'RefInt_pool02',
   'RefInt_pool03', 'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06',
   'RefInt_pool07', 'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10',
   'RefInt_pool11', 'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14',
   'RefInt_pool15', 'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18',
   'RefInt_pool19', 'RefInt_pool20']

# Drop quality control and ref intensity rows
df = df.drop(drop_cols, axis = 'index')

# duplicates are averaged
df = average_replicates(df, common = '-duplicate', to_drop = '-duplicate.*')

# Normalise the label suffixes: '-T' is removed, '-N' -> '.N', '-C' -> '.C'
df.index = df.index.str.replace('-T$','', regex = True)
df.index = df.index.str.replace('-N$','.N', regex = True)
df.index = df.index.str.replace('-C$','.C', regex = True) # 6 cored normal samples

# Sort values: tumor rows first, then rows whose label ends in '.N' or '.C'
is_normal = df.index.str.contains(r'\.[NC]$', regex = True)
normal = df.loc[is_normal].sort_values(by=["Patient_ID"])
tumor = df.loc[~ is_normal].sort_values(by=["Patient_ID"])

all_df = pd.concat([tumor, normal]) # DataFrame.append no longer exists in pandas 2.x

In [23]:
# Labels containing '.C'. The pattern is a raw string so that the backslash is
# not read as an (invalid) string escape.
all_df.index[all_df.index.str.contains(r'\.C')]

Index(['C3L-00994.C', 'C3L-02617.C', 'C3L-04350.C', 'C3L-05257.C',
       'C3N-01757.C', 'C3N-03042.C'],
      dtype='object', name='Patient_ID')

In [20]:
# Report whether any index label of all_df occurs more than once
if not all_df.index.duplicated().any():
    print('NO duplicates in index')
else:
    print('duplicates')
    print(all_df.index[all_df.index.duplicated()])
    

NO duplicates in index


In [37]:
# BR
# (The former `if file_name == file_name:` guard was always true and has been removed.)
df = pd.read_csv(file_path, sep = "\t")
df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
df.Index = df.Index.apply(lambda x: x.split('|')[5]) # keep field 5 of the '|'-separated identifier string
df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
df = df.transpose()
ref_intensities = df.loc["ReferenceIntensity"] # reference intensities used to calculate ratios
df = df.subtract(ref_intensities, axis="columns") # subtract reference intensities from all the values, to get ratios
df = df.iloc[1:,:] # drops the first row; assumes that row is ReferenceIntensity
df.index.name = 'Patient_ID'

drop_cols = ['RetroIR', 'CPT0018460005', 'CPT0008140004', 'RetroIR.1',
   'RefInt_Pool01', 'RefInt_Pool02', 'RefInt_Pool03', 'RefInt_Pool04',
   'RefInt_Pool05', 'RefInt_Pool06', 'RefInt_Pool07', 'RefInt_Pool08',
   'RefInt_Pool09', 'RefInt_Pool10', 'RefInt_Pool11', 'RefInt_Pool12',
   'RefInt_Pool13', 'RefInt_Pool14', 'RefInt_Pool15', 'RefInt_Pool16',
   'RefInt_Pool17']

# Drop quality control and ref intensity rows
df = df.drop(drop_cols, axis = 'index')

# Since cptac brca has no normal samples, the duplicates are treated as replicates
df = average_replicates(df)

# Sort values: tumor rows first, then normal rows. The dot is escaped so that
# only labels ending in a literal '.N' count as normal; an unescaped '.' would
# match any character before the final 'N'.
is_normal = df.index.str.contains(r'\.N$')
normal = df.loc[is_normal].sort_values(by=["Patient_ID"])
tumor = df.loc[~ is_normal].sort_values(by=["Patient_ID"])

all_df = pd.concat([tumor, normal]) # DataFrame.append no longer exists in pandas 2.x

In [29]:
# HCC
# (The former `if file_name == file_name:` guard was always true and has been removed.)
df = pd.read_csv(file_path, sep = "\t")
df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
df.Index = df.Index.apply(lambda x: x.split('|')[5]) # keep field 5 of the '|'-separated identifier string
df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
df = df.transpose()
ref_intensities = df.loc["ReferenceIntensity"] # reference intensities used to calculate ratios
df = df.subtract(ref_intensities, axis="columns") # subtract reference intensities from all the values, to get ratios
df = df.iloc[1:,:] # drops the first row; assumes that row is ReferenceIntensity
df.index.name = 'Patient_ID'

# Drop quality control and ref intensity rows
# NOTE(review): positional — the last 33 rows are removed whatever their labels; confirm.
drop_cols = df.index[-33:]
df = df.drop(drop_cols, axis = 'index')

# Get Patient_IDs
# slice mapping_df to include cancer specific aliquot_IDs
index_list = list(df.index)
cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

# Dictionary with aliquot_ID as keys and patient_ID as values
matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

# Relabel only the index; labels without a mapping entry are left unchanged.
df = df.rename(index = matched_ids)

# Sort values: tumor rows first, then normal rows. The dot is escaped so that
# only labels ending in a literal '.N' count as normal; an unescaped '.' would
# match any character before the final 'N'.
is_normal = df.index.str.contains(r'\.N$')
normal = df.loc[is_normal].sort_values(by=["Patient_ID"])
tumor = df.loc[~ is_normal].sort_values(by=["Patient_ID"])

all_df = pd.concat([tumor, normal]) # DataFrame.append no longer exists in pandas 2.x

In [7]:
# ccRCC
# (The former `if file_name == file_name:` guard was always true and has been removed.)
df = pd.read_csv(file_path, sep = "\t")
df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
df.Index = df.Index.apply(lambda x: x.split('|')[5]) # keep field 5 of the '|'-separated identifier string
df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
df = df.transpose()
ref_intensities = df.loc["ReferenceIntensity"] # reference intensities used to calculate ratios
df = df.subtract(ref_intensities, axis="columns") # subtract reference intensities from all the values, to get ratios
df = df.iloc[1:,:] # drops the first row; assumes that row is ReferenceIntensity
df.index.name = 'Patient_ID'

drop_cols = ['NCI7-1', 'QC1', 'QC2', 'QC3', 'NCI7-2', 'NCI7-3', 'QC4', 'NCI7-4',
   'NCI7-5', 'QC5', 'QC6', 'QC7', 'QC8', 'RefInt_pool01', 'RefInt_pool02',
   'RefInt_pool03', 'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06',
   'RefInt_pool07', 'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10',
   'RefInt_pool11', 'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14',
   'RefInt_pool15', 'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18',
   'RefInt_pool19', 'RefInt_pool20', 'RefInt_pool21', 'RefInt_pool22',
   'RefInt_pool23']

# Drop quality control and ref intensity rows
df = df.drop(drop_cols, axis = 'index')

# Get Patient_IDs
# slice mapping_df to include cancer specific aliquot_IDs
index_list = list(df.index)
cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

# Dictionary with aliquot_ID as keys and patient_ID as values
matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

# Relabel only the index; labels without a mapping entry are left unchanged.
df = df.rename(index = matched_ids)

# Sort values: tumor rows first, then normal rows. The dot is escaped so that
# only labels ending in a literal '.N' count as normal; an unescaped '.' would
# match any character before the final 'N'.
is_normal = df.index.str.contains(r'\.N$', regex = True)
normal = df.loc[is_normal].sort_values(by=["Patient_ID"])
tumor = df.loc[~ is_normal].sort_values(by=["Patient_ID"])

all_df = pd.concat([tumor, normal]) # DataFrame.append no longer exists in pandas 2.x

In [8]:
# Display the combined table built by the sorting step.
all_df

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,DBNDD1-201,CFTR-201,CYP51A1-201,USP28-201,...,ANK2-282,ANK2-275,ATXN2-240,ETNK1-207,MYO6-218,GBF1-204,CTNND1-240,PRX-205,WIZ-211,HSPA12A-210
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000003249.14,ENSG00000001626.16,ENSG00000001630.17,ENSG00000048028.11,...,ENSG00000145362.20,ENSG00000145362.20,ENSG00000204842.18,ENSG00000139163.16,ENSG00000196586.16,ENSG00000107862.5,ENSG00000198561.15,ENSG00000105227.15,ENSG00000011451.21,ENSG00000165868.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C3L-00004,0.248796,0.308289,-0.315825,0.051528,-0.323475,-0.720420,,-0.832085,-0.629139,-0.317251,...,0.359093,1.609902,0.054346,-0.471825,-0.095824,0.240579,0.689392,,-0.072279,-0.655139
C3L-00010,0.245087,0.131992,-0.016811,-0.114728,-0.816992,0.104260,,1.266034,-0.573329,-0.172712,...,-1.592004,-0.542294,,-0.208478,-0.143536,0.049889,0.499405,1.422138,-0.058976,-0.189293
C3L-00011,0.576821,0.443678,-0.544452,0.120776,-0.075035,-0.738826,,-0.484732,-0.725274,1.079570,...,-0.523301,-0.389680,,0.550085,-1.617542,0.485815,-0.327129,,0.344999,-0.688779
C3L-00026,0.293042,0.250350,0.142924,-0.173812,0.589458,0.238597,0.957386,-3.178976,0.372489,0.404721,...,-0.249875,-1.492455,,0.005406,-0.281776,0.057764,-0.073197,,-0.159555,-0.023920
C3L-00079,0.174315,0.026805,-0.663191,0.075943,-0.240099,-0.293380,,0.008966,-0.469829,,...,-1.423105,-0.921139,,-0.042932,-0.921862,0.385364,,,0.387207,-1.057636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646.N,0.037103,-0.036081,0.155231,0.022887,0.398078,0.455271,,-0.061556,1.276607,,...,0.779052,-0.223435,,0.092663,0.241097,-0.001744,0.031517,,-0.066332,0.262387
C3N-01648.N,0.002415,-0.067960,0.179792,-0.047536,0.072494,-0.142255,,0.561636,0.204457,0.139775,...,0.315899,-0.628753,,0.155145,0.279064,0.055534,-0.020708,,-0.075991,0.257819
C3N-01649.N,0.147900,-0.091094,0.234044,0.255129,-0.271717,-0.361462,,,0.618059,-0.206104,...,0.834806,,,0.262559,0.228916,0.049539,0.144912,0.673794,-0.124297,0.202749
C3N-01651.N,-0.085619,0.077062,0.264802,0.016510,0.553156,0.205408,-0.169571,-0.385380,0.558438,-0.012665,...,0.604100,,,0.240663,0.590784,0.050447,0.189749,,-0.148738,0.595645


In [45]:
# Report whether any label occurs more than once in all_df's index.
# (To list the offending labels: all_df.index[all_df.index.duplicated()])
if not all_df.index.duplicated().any():
    print('NO duplicates in index')
else:
    print('duplicates')
    

NO duplicates in index


In [14]:
# Show the report path currently held in file_path
file_path

"'../../../ec_prot/Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv"

In [15]:
# Report file to process (located under the ec_prot directory)
file_name = "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv"
file_path = "../../../ec_prot/" + file_name

# Lookup table used to translate aliquot_IDs into Patient_IDs;
# the first column of the TSV becomes the frame's index.
mapping_df = pd.read_csv(
    '../../../input/aliquot_to_patient_ID.tsv',
    sep = '\t',
    index_col = 0,
)

In [21]:
# EC
# Builds all_df for the report at file_path: one row per sample (tumor rows first,
# then '.N' normal rows), one column per (Name, Database_ID) pair, with the
# ReferenceIntensity row subtracted from every sample.
# NOTE(review): the guard below is always true, so this cell processes whatever
# file_path currently points at; compare against the expected file name if a
# real per-cohort guard is wanted.
if file_name == file_name:
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep the 6th '|'-separated field of each identifier as the name
    df['Index'] = df['Index'].apply(lambda x: x.split('|')[5])
    df = df.rename(columns = {'Index':'Name', 'Gene':'Database_ID'})
    df = df.set_index(['Name', 'Database_ID']) # set multiindex
    df = df.transpose() # samples become rows
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop by label instead of relying on it being the first row
    df.index.name = 'Patient_ID'

    # NOTE(review): the quality-control / reference-intensity drop below is disabled,
    # so any of these labels that exist in the file remain in all_df. df.drop raises
    # KeyError when a listed label is absent, so the list has to match the file
    # before this is re-enabled.
    # drop_cols = ['NX1', 'NX2', 'NX3', 'NX4', 'NX5', 'NX6', 'NX7', 'NX8', 'NX9', 'NX12',
    #    'NX17', 'NX13', 'NX14', 'NX10', 'NX16', 'NX18', 'NX11', 'NX15',
    #    'RefInt_pool01', 'RefInt_pool02', 'RefInt_pool03', 'RefInt_pool04',
    #    'RefInt_pool05', 'RefInt_pool06', 'RefInt_pool07', 'RefInt_pool08',
    #    'RefInt_pool09', 'RefInt_pool10', 'RefInt_pool11', 'RefInt_pool12',
    #    'RefInt_pool13', 'RefInt_pool14', 'RefInt_pool15', 'RefInt_pool16',
    #    'RefInt_pool17']
    # df = df.drop(drop_cols, axis = 'index')

    # Get Patient_IDs
    # slice mapping_df to include only the aliquot_IDs present in this file
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]
    # aliquot_ID -> patient_ID lookup
    matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))
    # Relabel the index only; labels without a mapping are left unchanged.
    # (A whole-frame replace would compare every cell against the mapping.)
    df = df.rename(index = matched_ids)

    # C3N-01825 comes from two tumor aliquots, so we average these.
    # Select by exact label so a 'C3N-01825.N' row, if present, is never averaged in.
    id_df = df.loc[df.index == 'C3N-01825']
    vals = list(id_df.mean(axis=0)) # average replicates and store in list
    df = df.drop(index = 'C3N-01825') # drop both replicates so can add new row with averages
    df.loc['C3N-01825'] = vals

    # Split into normal ('.N' suffix) and tumor rows, each sorted by Patient_ID.
    # The dot is escaped so only a literal '.N' ending counts as normal.
    is_normal = df.index.str.contains(r'\.N$')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; concat gives the same stacking
    all_df = pd.concat([tumor, normal])

In [27]:
# Show rows whose index label contains no '-'; the patient IDs seen in this notebook
# all contain one, so anything listed here is presumably a non-patient (e.g. QC) row.
all_df[~ all_df.index.str.contains('-')]

Name,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,DBNDD1-201,SEMA3F-201,CFTR-201,CYP51A1-201,...,SCRIB-210,WIZ-212,BPIFB4-204,LDB1-204,WIZ-211,TSGA10-219,RFX7-204,SWSAP1-202,MSANTD2-209,SVIL-215
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000003249.14,ENSG00000001617.12,ENSG00000001626.16,ENSG00000001630.17,...,ENSG00000180900.20,ENSG00000011451.21,ENSG00000186191.8,ENSG00000198728.11,ENSG00000011451.21,ENSG00000135951.16,ENSG00000181827.15,ENSG00000173928.4,ENSG00000120458.12,ENSG00000197321.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
NX1,0.218979,0.230552,0.134694,0.295812,-0.404386,0.006551,0.61697,0.237973,,-0.209813,...,,-0.251622,,0.314284,0.263701,-0.301592,-0.061258,-0.201291,-0.077984,
NX10,0.070668,-0.351918,0.055249,-0.387262,0.111305,-0.079602,,0.106234,0.904879,-0.003566,...,-0.267481,-0.513496,,0.018843,0.122771,-0.099385,-0.21161,,0.523707,-1.932152
NX11,0.119159,0.007565,-0.000268,0.012744,0.011574,0.43253,,1.030372,1.293901,0.537394,...,-0.623401,-0.358025,,0.534054,0.27696,-0.181137,0.001884,,0.647953,-2.979861
NX12,0.150888,0.08207,0.017507,-0.13822,0.17533,0.399082,,0.569482,1.020064,-0.0778,...,0.239249,-0.282146,,0.27121,0.126338,-0.45028,-0.215497,,0.117687,-2.686992
NX13,0.10798,0.070007,-0.051188,-0.064077,0.171924,0.201895,,0.826469,1.244531,0.214046,...,-0.691511,0.001951,,0.281565,0.356249,-0.303213,-0.103934,,0.559952,-3.244786
NX14,0.114779,-0.154477,0.100423,0.327687,-0.046173,0.366557,,0.059492,-0.218802,0.928876,...,-0.322385,0.300339,,-0.15643,0.329596,0.452392,-0.427219,,0.008844,-3.417148
NX15,0.145981,0.153259,-0.120398,0.100296,-0.163539,-0.15689,,0.780944,-0.204144,0.798022,...,-0.727259,0.177029,,0.02056,0.287766,-0.332035,-0.13751,,0.332546,-3.364447
NX16,0.14721,0.188932,-0.0888,-0.59597,0.291153,-0.003585,,-0.028129,0.812784,0.04555,...,-0.456673,0.03173,,-0.318148,-0.030071,0.483646,-0.482526,,0.190865,-2.250847
NX17,0.111588,-0.065484,0.379661,-0.405439,0.14034,-0.498067,,-0.379743,-1.270279,-0.088475,...,0.278497,-0.820497,,-0.152431,-0.041868,-0.023522,-0.281508,,-0.033746,-2.541813
NX18,0.214103,-0.123969,-0.259805,0.324413,-0.245544,0.339968,,0.764082,-0.090232,0.537681,...,-0.313427,0.109757,,-0.06304,0.296197,0.080938,-0.260303,,0.123574,-3.436891


In [44]:

# Inspect every row whose label contains 'C3N-01825' (substring match)
all_df[all_df.index.str.contains('C3N-01825')]

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,DBNDD1-201,SEMA3F-201,CFTR-201,CYP51A1-201,...,SCRIB-210,WIZ-212,BPIFB4-204,LDB1-204,WIZ-211,TSGA10-219,RFX7-204,SWSAP1-202,MSANTD2-209,SVIL-215
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000003249.14,ENSG00000001617.12,ENSG00000001626.16,ENSG00000001630.17,...,ENSG00000180900.20,ENSG00000011451.21,ENSG00000186191.8,ENSG00000198728.11,ENSG00000011451.21,ENSG00000135951.16,ENSG00000181827.15,ENSG00000173928.4,ENSG00000120458.12,ENSG00000197321.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C3N-01825,-0.277652,-0.114247,-0.191063,-0.10877,0.408894,0.095893,0.151725,0.303608,,-0.681455,...,0.026961,-0.180579,,-0.192909,-0.113818,-0.518759,0.140582,0.104568,0.2505,0.637603


In [47]:
# Report whether any label occurs more than once in all_df's index.
# (To list the offending labels: all_df.index[all_df.index.duplicated()])
if not all_df.index.duplicated().any():
    print('NO duplicates in index')
else:
    print('duplicates')
    

NO duplicates in index


In [28]:
# LUAD
# Builds all_df for the report at file_path: one row per sample (tumor rows first,
# then '.N' normal rows), one column per (Proteins, Database_ID) pair, with the
# ReferenceIntensity row subtracted from every sample.
# NOTE(review): the guard below is always true, so this cell processes whatever
# file_path currently points at; compare against the expected file name if a
# real per-cohort guard is wanted.
if file_name == file_name:
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep the 6th '|'-separated field of each identifier as the name
    df['Index'] = df['Index'].apply(lambda x: x.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # samples become rows
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop by label instead of relying on it being the first row
    df.index.name = 'Patient_ID'

    drop_cols = ['TumorOnlyIR01', 'NormalOnlyIR02', 'TumorOnlyIR03', 'NormalOnlyIR04',
       'CPT0148080004.1','NormalOnlyIR', 'TumorOnlyIR14',
       'TaiwaneseIR19', 'TumorOnlyIR21', 'TaiwaneseIR22', 'CPT0146580004.1',
       'NormalOnlyIR25', 'RefInt_pool01', 'RefInt_pool02', 'RefInt_pool03',
       'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06', 'RefInt_pool07',
       'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10', 'RefInt_pool11',
       'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14', 'RefInt_pool15',
       'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18', 'RefInt_pool19',
       'RefInt_pool20', 'RefInt_pool21', 'RefInt_pool22', 'RefInt_pool23',
       'RefInt_pool24', 'RefInt_pool25']

    # Drop quality control and ref intensity rows.
    # This is deliberately strict: df.drop raises KeyError if any listed label is absent.
    # NOTE(review): the KeyError saved with this cell presumably means file_path was
    # still pointing at a different report when it last ran - confirm before re-running.
    df = df.drop(drop_cols, axis = 'index')

    # Get Patient_IDs
    # slice mapping_df to include only the aliquot_IDs present in this file
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

    # aliquot_ID -> patient_ID lookup
    matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

    # Relabel the index only; labels without a mapping are left unchanged.
    # (A whole-frame replace would compare every cell against the mapping.)
    df = df.rename(index = matched_ids)

    # Split into normal ('.N' suffix) and tumor rows, each sorted by Patient_ID.
    # The dot is escaped so only a literal '.N' ending counts as normal.
    is_normal = df.index.str.contains(r'\.N$')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; concat gives the same stacking
    all_df = pd.concat([tumor, normal])

KeyError: "['TumorOnlyIR01' 'NormalOnlyIR02' 'TumorOnlyIR03' 'NormalOnlyIR04'\n 'CPT0148080004.1' 'NormalOnlyIR' 'TumorOnlyIR14' 'TaiwaneseIR19'\n 'TumorOnlyIR21' 'TaiwaneseIR22' 'CPT0146580004.1' 'NormalOnlyIR25'\n 'RefInt_pool18' 'RefInt_pool19' 'RefInt_pool20' 'RefInt_pool21'\n 'RefInt_pool22' 'RefInt_pool23' 'RefInt_pool24' 'RefInt_pool25'] not found in axis"

In [9]:
# LSCC
# Builds all_df for the report at file_path: one row per sample (tumor rows first,
# then '.N' normal rows), one column per (Proteins, Database_ID) pair, with the
# ReferenceIntensity row subtracted from every sample.
# NOTE(review): the guard below is always true, so this cell processes whatever
# file_path currently points at; compare against the expected file name if a
# real per-cohort guard is wanted.
if file_name == file_name:
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep the 6th '|'-separated field of each identifier as the name
    df['Index'] = df['Index'].apply(lambda x: x.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # samples become rows
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop by label instead of relying on it being the first row
    df.index.name = 'Patient_ID'

    drop_cols = ['LUAD-Global-CR-pool1', 'LSCC-Tumor-ONLY-CR', 'JHU-HNSCC-CR',
       'LUAD-Global-CR-pool-1', 'LSCC-Tumor-ONLY-CR.1', 'JHU-HNSCC-CR.1',
       'LUAD-Global-CR-pool-2', 'JHU-HNSCC-CR.2', 'RefInt_LSCC-Global-CR',
       'RefInt_LSCC-Global-CR.1', 'RefInt_LSCC-Global-CR.2',
       'RefInt_LSCC-Global-CR.3', 'RefInt_LSCC-Global-CR.4',
       'RefInt_LSCC-Global-CR.5', 'RefInt_LSCC-Global-CR.6',
       'RefInt_LSCC-Global-CR.7', 'RefInt_LSCC-Global-CR.8',
       'RefInt_LSCC-Global-CR.9', 'RefInt_LSCC-Global-CR.10',
       'RefInt_LSCC-Global-CR.11', 'RefInt_LSCC-Global-CR.12',
       'RefInt_LSCC-Global-CR.13', 'RefInt_LSCC-Global-CR.14',
       'RefInt_LSCC-Global-CR.15', 'RefInt_LSCC-Global-CR.16',
       'RefInt_LSCC-Global-CR.17', 'RefInt_LSCC-Global-CR.18',
       'RefInt_LSCC-Global-CR.19', 'RefInt_LSCC-Global-CR.20',
       'RefInt_LSCC-Global-CR.21']

    # Drop quality control and ref intensity rows.
    # This is deliberately strict: df.drop raises KeyError if any listed label is absent.
    df = df.drop(drop_cols, axis = 'index')

    # Get Patient_IDs
    # slice mapping_df to include only the aliquot_IDs present in this file
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

    # aliquot_ID -> patient_ID lookup
    matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

    # Relabel the index only; labels without a mapping are left unchanged.
    # (A whole-frame replace would compare every cell against the mapping.)
    df = df.rename(index = matched_ids)

    # Split into normal ('.N' suffix) and tumor rows, each sorted by Patient_ID.
    # The dot is escaped so only a literal '.N' ending counts as normal.
    is_normal = df.index.str.contains(r'\.N$')
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; concat gives the same stacking
    all_df = pd.concat([tumor, normal])

In [6]:
#gbm
# Builds all_df for the report at file_path: one row per sample (tumor rows first,
# then normal rows relabelled with a '.N' suffix), one column per
# (Proteins, Database_ID) pair, with the ReferenceIntensity row subtracted
# from every sample.
# NOTE(review): the guard below is always true, so this cell processes whatever
# file_path currently points at; compare against the expected file name if a
# real per-cohort guard is wanted.
if file_name == file_name:
    df = pd.read_csv(file_path, sep = "\t")
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM'])
    # Keep the 6th '|'-separated field of each identifier as the name
    df['Index'] = df['Index'].apply(lambda x: x.split('|')[5])
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose() # samples become rows
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.drop(index = "ReferenceIntensity") # drop by label instead of relying on it being the first row
    df.index.name = 'Patient_ID'

    drop_cols = ['RefInt_01Pool', 'RefInt_02Pool', 'RefInt_03Pool', 'RefInt_04Pool',
                 'RefInt_05Pool', 'RefInt_06Pool', 'RefInt_07Pool', 'RefInt_08Pool',
                 'RefInt_09Pool', 'RefInt_10Pool', 'RefInt_11Pool']

    # Drop ref intensity rows.
    # This is deliberately strict: df.drop raises KeyError if any listed label is absent.
    df = df.drop(drop_cols, axis = 'index')

    # Get Patient_IDs
    # slice mapping_df to include only the aliquot_IDs present in this file
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]

    # aliquot_ID -> patient_ID lookup
    matched_ids = dict(zip(cancer_df['aliquot_ID'], cancer_df['patient_ID']))

    # Relabel the index only; labels without a mapping are left unchanged.
    # (A whole-frame replace would compare every cell against the mapping.)
    df = df.rename(index = matched_ids)

    # Rows whose ID starts with 'PT-' are treated as normal samples here;
    # compute the mask once and reuse it for both halves.
    is_normal = df.index.str.contains('^PT-')
    normal = df.loc[is_normal].sort_index()
    normal.index = normal.index +'.N' # append .N to normal IDs
    tumor = df.loc[~is_normal].sort_index()

    # DataFrame.append was removed in pandas 2.0; concat gives the same stacking
    all_df = pd.concat([tumor, normal])

In [7]:
# Display the combined frame held in all_df (tumor rows first, then normal rows)
all_df

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,DBNDD1-201,HS3ST1-201,CYP51A1-201,USP28-201,...,ETNK1-207,AP1S2-209,EED-213,DDHD1-211,WIZ-210,ZBTB3-204,CTNND1-240,WIZ-212,WIZ-211,MSANTD2-209
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000003249.14,ENSG00000002587.10,ENSG00000001630.17,ENSG00000048028.11,...,ENSG00000139163.16,ENSG00000182287.15,ENSG00000074266.21,ENSG00000100523.16,ENSG00000011451.21,ENSG00000185670.9,ENSG00000198561.15,ENSG00000011451.21,ENSG00000011451.21,ENSG00000120458.12
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C3L-00104,-0.254733,-0.138938,0.662227,-0.074868,0.213529,0.123398,2.105458,,-0.660264,,...,-0.047874,-0.446007,0.070101,0.160090,0.129282,-0.166678,,-0.073448,-0.029020,-0.009463
C3L-00365,-0.138512,-0.824520,0.494419,0.043783,-0.001394,0.101477,-0.287232,,-0.402679,0.059975,...,0.350165,-0.844985,-0.234406,-0.448783,0.078407,,-0.174780,1.806950,0.033808,0.180578
C3L-00674,-0.351464,-0.120197,-0.084371,-0.260275,0.096193,-0.360274,0.227499,1.217058,-0.165751,0.010124,...,-0.167319,-0.096328,-0.118506,-0.107690,0.177530,,-1.513869,0.065973,-0.126149,0.465241
C3L-00677,-0.062869,0.094198,0.391070,-0.030638,0.742258,-0.417291,-0.013377,,-0.176649,0.535304,...,0.179200,0.320945,-0.027522,0.104278,0.049948,-0.590267,0.162686,1.964570,0.161229,0.283810
C3L-01040,-0.365351,0.070523,-0.472543,-0.255288,0.096844,0.356271,1.182940,,-0.307430,,...,0.098253,-0.298907,-0.201144,0.440215,0.110757,0.119013,,0.030719,0.066426,0.189187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PT-RN5K.N,0.582456,-0.356730,0.236333,0.658617,0.719931,-0.760723,1.562912,,-1.158580,-0.247034,...,-0.183210,-0.723977,-0.339088,0.973199,-0.753209,0.211861,-0.539658,,-0.530017,0.614323
PT-RU72.N,0.522376,0.226891,-1.059051,0.558740,0.893739,-0.383723,1.652233,,-0.649935,-0.504037,...,0.237765,-0.559902,-0.849461,1.305892,-0.706170,,-0.597932,-1.889779,-0.492358,-0.534980
PT-UTHO.N,0.595553,0.194616,-0.045162,0.546508,0.810285,-0.868148,0.721085,,-0.840286,-0.267181,...,0.018208,0.813018,-0.029398,1.355047,-0.600093,,0.066284,-0.095237,-0.286616,-0.090284
PT-WVLH.N,0.958226,-0.057340,,0.635282,1.143518,-1.243729,,,-0.185783,-0.402786,...,0.151603,0.380411,-0.758021,0.753353,-0.712901,-0.899465,-0.510523,,-1.241097,0.191439
