In [1]:
import pandas as pd
import numpy as np
import warnings
from functools import reduce
import re

In [2]:
import cptac
import cptac.dataframe_tools as dt

In [3]:
# average replicates
def average_replicates(df):
    replicate_df = df[df.index.str.contains('#')]
    patient_ids = pd.Series(replicate_df.index)
    ids = patient_ids.replace('#\d', '', regex=True)
    id_list = list(set(ids)) #id_list contains only patient_IDs of replicates (without #s)

    for patient_ID in id_list:
        id_df = df[df.index.str.contains(patient_ID)] # slice out replicates for a single patient
        vals = list(id_df.mean(axis=0)) 
        df.loc[patient_ID] = vals # add new row to original df with averages of replicates 

    df = df[~ df.index.str.contains('#')] # drop unaveraged replicate cols (averaged rows are kept)
    return df

# Get mapper for ids

In [259]:
mapper = "pdc_aliquot_2021-03-02_15_58.tsv.xlsx" 
m_path = "../../../input/"+mapper

In [298]:
m = pd.read_excel(m_path, sep = "\t", na_values = 'NA') # type, project_id, and analyte_type have all same vals
#m['samples.submitter_id#2'].unique()

In [398]:
m = m.rename(columns = {'submitter_id':'a', 'samples.submitter_id#1':'1',
                       'samples.submitter_id#2':'2', 'samples.submitter_id#3':'3',
                       'samples.submitter_id#4':'4', 'samples.submitter_id#5':'5'})
n = m[['a','1','2','3','4','5']] # cols 1-5 are same id with "-##" for replicate?

In [313]:
nd = {}
for i, row in n.iterrows():
    nd[row['a']] = row[1][1:]

In [315]:
#nd

In [300]:
allmap = 'pdc_sample_2021-03-05_16_43.tsv.txt'
am_path = "../../../input/"+allmap

In [301]:
am = pd.read_csv(am_path, sep = "\t", na_values = 'NA') # type, project_id, and analyte_type have all same vals

# Make patient_ID vals (case_id + .N if normal sample) 
am['id'] = am['cases.submitter_id'] +'_'+ am['tissue_type']
am['id'] = am['id'].str.replace('_Tumor$','')
am['id'] = am['id'].str.replace('_Normal$','.N')

array(['Tumor', 'Normal'], dtype=object)

In [None]:
# Merge dfs
# set index to case_id with aliquot/tissue type specific number (-##)
n = n.set_index('1')
am = am.set_index('submitter_id')
jn = n.join(am, how = 'outer')

In [305]:
jn = jn[['a', 'id']] # keep aliquot (in proteomics) and newly created patient_id
jn

Unnamed: 0,a,id
2f2e5477-42a4-4906-a943-bf7f80,CPT0051690004,11LU035
5a84eae1-197e-4463-ad65-59becc,CPT0052170004,11LU022
93e30fd5-e57e-4503-a175-863c7d,CPT0052940004,11LU016
9f905736-f662-41d6-b3ac-16758d,CPT0053040004,11LU013
C3L-00001-02,CPT0001580009,C3L-00001
...,...,...
C3N-05923-01,CPT032238 0003,C3N-05923
C3N-05923-09,CPT032239 0003,C3N-05923.N
C3N-05929-03,CPT032253 0003,C3N-05929
C3N-05929-05,CPT032254 0003,C3N-05929.N


In [386]:
# dictionary with aliquot id keys and patient_id valuess
matched_ids = {}
for i, row in jn.iterrows():
    matched_ids[row['a']] = row[-1]

# Proteomics (Genecode) parser

In [212]:
class SliceableDict(dict):
    default = None
    def __getitem__(self, key):
        if isinstance(key, list): 
            # omits key if it does not exist
            return {k: self[k] for k in key if k in self}
        return dict.get(self, key)

In [402]:
drop_cols = {'GBM':gbm_dif,'HNSCC':hnscc_dif,'LSCC':lscc_dif,'LUAD':luad_dif,
             'BR':br_dif,'EC':ec_dif,'CO':co_dif,'OV':ov_dif,'ccRCC':ccrcc_dif} #add HCC

In [430]:
cancer_names = ['GBM','HNSCC','LSCC','LUAD','BR','EC','CO','OV','ccRCC'] # HCC (not in drop-cols)
ca = cancer_names[1]
file_name = ca+"_Report_abundance_groupby=gene_protNorm=MD_gu=2.tsv" 
file_path = "../../../proteomics/"+file_name

In [431]:
if file_name == file_name:
    df = pd.read_csv(file_path, sep = "\t") # CHECK na_vals 'NA' already default for read_csv
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM']) #index is protein identifier (duplicate)
    df.Proteins = df.Proteins.apply(lambda x: x.split('|')[6]) # Get gene name from position in list of gene identifiers
    df = df.rename(columns = {'Proteins':'Name', 'Index':'Database_ID'})
    df = df.set_index(['Name', 'Database_ID']) # set multiindex
    df = df.transpose()
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios 
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.iloc[1:,:] # drop ReferenceIntensity row 
    df.index.name = 'Patient_ID'
    
    df.index = df.index.str.replace('-T$','')
    df.index = df.index.str.replace('-N$','.N')
    
    # Match ids to get Patient_ID
    # Create cancer specific dict
    indices = list(df.index)
    sliced = SliceableDict(matched_ids)
    rn_list = sliced[indices]
    
    #replace with cancer specific dictionary 
    df = df.reset_index()
    df = df.replace(rn_list) 
    df = df.set_index('Patient_ID')
    
    df = df.drop(drop_cols[ca], axis = 'index')
    
    # sort values
    normal = df.loc[df.index.str.contains('.N$')]
    normal = normal.sort_values(by=["Patient_ID"])
    tumor = df.loc[~ df.index.str.contains('.N$')]
    tumor = tumor.sort_values(by=["Patient_ID"])
    all_df = tumor.append(normal)
    

In [432]:
print(ca)
all_df
#list(all_df.index)  

HNSCC


Name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,FAM240B,EXOC3L2,PRR33,SCO2,AC073111.4,EEF1AKMT4,AL022312.1,AL034430.2,ASDURF,DERPC
Database_ID,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.12,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000283329.1,ENSG00000283632.3,ENSG00000283787.1,ENSG00000284194.3,ENSG00000284691.1,ENSG00000284753.2,ENSG00000285025.1,ENSG00000285723.1,ENSG00000286053.1,ENSG00000286140.1
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C3L-00977,-1.038676,-1.191310,-0.236151,-0.119728,,0.562327,-0.296474,0.115275,-0.341003,,...,,0.031330,-0.527499,0.139437,,-0.101236,0.262066,0.688601,0.003045,0.717008
C3L-00987,-0.859038,0.851237,0.131398,-0.143763,0.025350,-0.843948,-0.016147,-0.032252,0.091237,0.749156,...,,0.285005,1.801195,0.532061,,0.952802,0.430636,-0.369369,0.191575,0.015299
C3L-00994,-1.296198,-1.246083,0.097081,-0.092767,,-0.245362,0.449282,0.173419,-0.691466,,...,,0.129753,0.589475,-0.091374,,0.095604,0.375271,-0.046737,0.066991,0.050263
C3L-00994-C,-0.698106,-1.074461,-0.461819,-0.160771,,-1.770536,-0.107098,-0.329098,0.079607,0.508735,...,,-0.398202,-1.393080,0.415328,0.209424,,0.663738,0.927971,0.299959,0.482594
C3L-00994-N-duplicate,-0.458009,0.055461,-0.705832,0.068799,,-1.464715,0.325005,-0.325097,0.017057,0.836997,...,,0.178032,2.788117,0.047914,0.348988,,0.551746,0.297794,-0.335488,-0.235401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04275.N,0.574210,-0.123628,0.147308,0.180953,,-0.567524,-0.116171,-0.069249,0.036342,,...,,-0.446276,-0.742642,0.575482,,-0.203983,0.069983,-0.112641,-0.042212,0.210809
C3N-04276.N,0.573894,,-0.217963,-0.095268,,-0.682292,1.144142,-0.009421,0.102442,1.573888,...,,,0.830318,0.235727,,,0.096642,-0.309431,0.002880,-0.031026
C3N-04277.N,0.092513,,-0.120978,-0.155441,-0.257195,-0.724162,1.074523,0.157566,-0.103037,0.782618,...,,-0.451213,-1.140965,0.180619,,-1.046561,,-0.227433,-0.202436,-0.036204
C3N-04278.N,0.591309,,0.159032,-0.010106,,-1.325191,0.438809,-0.188365,-0.272117,0.151065,...,-1.862321,-0.187576,-1.903051,0.684701,,,0.104432,-0.116321,,0.035647


In [379]:
gbm_dif = ['RefInt_01Pool', 'RefInt_02Pool', 'RefInt_03Pool', 'RefInt_04Pool',
       'RefInt_05Pool', 'RefInt_06Pool', 'RefInt_07Pool', 'RefInt_08Pool',
       'RefInt_09Pool', 'RefInt_10Pool', 'RefInt_11Pool']

hnscc_dif = ['128C', 'QC2', 'QC4', '129N', 'LungTumor1', 'Pooled-sample14',
       'LungTumor2', 'QC6', 'LungTumor3', 'Pooled-sample17', 'QC7',
       'Pooled-sample19', 'QC9', 'RefInt_pool01', 'RefInt_pool02',
       'RefInt_pool03', 'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06',
       'RefInt_pool07', 'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10',
       'RefInt_pool11', 'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14',
       'RefInt_pool15', 'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18',
       'RefInt_pool19', 'RefInt_pool20'] # has duplicates ("C3L-00994-N-duplicate") and -C ("C3L-00994-C")

luad_dif = ['TumorOnlyIR01', 'NormalOnlyIR02', 'TumorOnlyIR03', 'NormalOnlyIR04',
       'CPT0148080004.1','NormalOnlyIR', 'TumorOnlyIR14',
       'TaiwaneseIR19', 'TumorOnlyIR21', 'TaiwaneseIR22', 'CPT0146580004.1',
       'NormalOnlyIR25', 'RefInt_pool01', 'RefInt_pool02', 'RefInt_pool03',
       'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06', 'RefInt_pool07',
       'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10', 'RefInt_pool11',
       'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14', 'RefInt_pool15',
       'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18', 'RefInt_pool19',
       'RefInt_pool20', 'RefInt_pool21', 'RefInt_pool22', 'RefInt_pool23',
       'RefInt_pool24', 'RefInt_pool25']
#'2f2e5477-42a4-4906-a943-bf7', '5a84eae1-197e-4463-ad65-59b', '93e30fd5-e57e-4503-a175-863',
#9f905736-f662-41d6-b3ac-167'

lscc_dif = ['LUAD-Global-CR-pool1', 'LSCC-Tumor-ONLY-CR', 'JHU-HNSCC-CR',
       'LUAD-Global-CR-pool-1', 'LSCC-Tumor-ONLY-CR.1', 'JHU-HNSCC-CR.1',
       'LUAD-Global-CR-pool-2', 'JHU-HNSCC-CR.2', 'RefInt_LSCC-Global-CR',
       'RefInt_LSCC-Global-CR.1', 'RefInt_LSCC-Global-CR.2',
       'RefInt_LSCC-Global-CR.3', 'RefInt_LSCC-Global-CR.4',
       'RefInt_LSCC-Global-CR.5', 'RefInt_LSCC-Global-CR.6',
       'RefInt_LSCC-Global-CR.7', 'RefInt_LSCC-Global-CR.8',
       'RefInt_LSCC-Global-CR.9', 'RefInt_LSCC-Global-CR.10',
       'RefInt_LSCC-Global-CR.11', 'RefInt_LSCC-Global-CR.12',
       'RefInt_LSCC-Global-CR.13', 'RefInt_LSCC-Global-CR.14',
       'RefInt_LSCC-Global-CR.15', 'RefInt_LSCC-Global-CR.16',
       'RefInt_LSCC-Global-CR.17', 'RefInt_LSCC-Global-CR.18',
       'RefInt_LSCC-Global-CR.19', 'RefInt_LSCC-Global-CR.20',
       'RefInt_LSCC-Global-CR.21']

br_dif = ['RetroIR', 'CPT0018460005', 'CPT0008140004', 'RetroIR.1',
       'RefInt_Pool01', 'RefInt_Pool02', 'RefInt_Pool03', 'RefInt_Pool04',
       'RefInt_Pool05', 'RefInt_Pool06', 'RefInt_Pool07', 'RefInt_Pool08',
       'RefInt_Pool09', 'RefInt_Pool10', 'RefInt_Pool11', 'RefInt_Pool12',
       'RefInt_Pool13', 'RefInt_Pool14', 'RefInt_Pool15', 'RefInt_Pool16',
       'RefInt_Pool17']

ec_dif = ['NX1', 'NX2', 'NX3', 'NX4', 'NX5', 'NX6', 'NX7', 'NX8', 'NX9', 'NX12',
       'NX17', 'NX13', 'NX14', 'NX10', 'NX16', 'NX18', 'NX11', 'NX15',
       'RefInt_pool01', 'RefInt_pool02', 'RefInt_pool03', 'RefInt_pool04',
       'RefInt_pool05', 'RefInt_pool06', 'RefInt_pool07', 'RefInt_pool08',
       'RefInt_pool09', 'RefInt_pool10', 'RefInt_pool11', 'RefInt_pool12',
       'RefInt_pool13', 'RefInt_pool14', 'RefInt_pool15', 'RefInt_pool16',
       'RefInt_pool17']

co_dif = ['colonRef22-2', 'RefInt_ColonRef01', 'RefInt_ColonRef02',
       'RefInt_ColonRef03', 'RefInt_ColonRef04', 'RefInt_ColonRef05',
       'RefInt_ColonRef06', 'RefInt_ColonRef07', 'RefInt_ColonRef08',
       'RefInt_ColonRef09', 'RefInt_ColonRef10', 'RefInt_ColonRef11',
       'RefInt_ColonRef12', 'RefInt_ColonRef13', 'RefInt_ColonRef14',
       'RefInt_ColonRef15', 'RefInt_ColonRef16', 'RefInt_ColonRef17',
       'RefInt_ColonRef18', 'RefInt_ColonRef19', 'RefInt_ColonRef20',
       'RefInt_ColonRef21', 'RefInt_ColonRef22-1']
ov_dif = ['JHU-QC', 'JHU-QC.1', 'JHU-QC.2', 'JHU-QC.3', 'JHU-QC.4',
       'RefInt_PNNL-JHU-Ref-1', 'RefInt_PNNL-JHU-Ref-2',
       'RefInt_PNNL-JHU-Ref-3', 'RefInt_PNNL-JHU-Ref-4',
       'RefInt_PNNL-JHU-Ref-5', 'RefInt_PNNL-JHU-Ref-6',
       'RefInt_PNNL-JHU-Ref-7', 'RefInt_PNNL-JHU-Ref-8',
       'RefInt_PNNL-JHU-Ref-9', 'RefInt_PNNL-JHU-Ref-10',
       'RefInt_PNNL-JHU-Ref-11', 'RefInt_PNNL-JHU-Ref-12']

hcc_dif = df.index[-33:] #(last 33)

ccrcc_dif = ['NCI7-1', 'QC1', 'QC2', 'QC3', 'NCI7-2', 'NCI7-3', 'QC4', 'NCI7-4',
       'NCI7-5', 'QC5', 'QC6', 'QC7', 'QC8', 'RefInt_pool01', 'RefInt_pool02',
       'RefInt_pool03', 'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06',
       'RefInt_pool07', 'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10',
       'RefInt_pool11', 'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14',
       'RefInt_pool15', 'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18',
       'RefInt_pool19', 'RefInt_pool20', 'RefInt_pool21', 'RefInt_pool22',
       'RefInt_pool23']

In [147]:
a = df.Patient_ID[df.Patient_ID.str.contains('CPT')]
print(ca)
print('Num of aliquot IDs that didnt map:', len(a))
list(a)

ccRCC
Num of aliquot IDs that didnt map: 0


[]

In [134]:
df.columns[df.columns.duplicated()]

Index([], dtype='object', name='Name')

In [135]:
df[df.columns[df.columns.str.contains('HSPA14')]]

Name,HSPA14-201
Patient_ID,Unnamed: 1_level_1
CPT0204410003,-0.361447
CPT0206670004,0.141333
CPT0089150003,0.140285
CPT0207030003,-0.00394528
CPT0190240004,-0.168603
...,...
RefInt_07Pool,0.118854
RefInt_08Pool,0.491971
RefInt_09Pool,-0.299196
RefInt_10Pool,-1.03101


In [87]:
df.apply(lambda x: x - x[0])

Proteins,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,BX276092.9,EXOC3L2,HSPA14,SCO2,AC073111.4,EEF1AKMT4,AL022312.1,AL034430.2,ASDURF,DERPC
ReferenceIntensity,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11BR047,0.535075,,0.217075,-0.187385,-0.750757,-0.5773,0.351452,-0.0345761,-0.373106,-0.427952,...,,0.383079,0.208105,-0.91557,-0.165403,-0.839767,-0.293004,-0.427757,0.0692465,-0.604685
11BR043,-1.34483,,-0.170901,0.161888,-0.0275996,-0.106093,-0.00657508,-0.953963,-0.170225,-0.0453135,...,,-0.481647,-0.00317177,0.288232,0.511197,0.174767,0.0581879,0.854134,0.251461,0.375694
11BR049,-0.522723,,0.159291,0.134471,0.450864,-0.21719,-0.0903709,-0.0547697,-0.0090229,0.0252724,...,,-0.512862,0.300357,-0.221562,-0.176503,-0.249309,-0.875836,-0.110516,0.134251,0.0152991
11BR023,-0.372009,,-0.599995,-0.675929,0.551381,0.096102,-0.141114,-0.342601,-0.0147516,0.00283796,...,,-0.692607,0.731301,-0.573085,0.0888565,0.0765025,-0.318681,0.587145,0.469055,0.011444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RefInt_Pool13,0.199324,,0.236029,1.6212,,,-0.923242,-0.0716306,-0.209051,-0.303849,...,,0.985352,,-0.119028,,,-1.04456,-2.29468,-0.298844,-0.313032
RefInt_Pool14,-1.16274,,-0.838634,-2.0985,,,1.38507,-0.481399,-0.397669,-1.49793,...,,-1.98226,,-1.24684,,,,-1.29706,,-0.760521
RefInt_Pool15,,,-0.135811,-2.91037,,-1.43839,1.41795,-0.134708,-0.22627,-0.312162,...,,,,-1.3704,,,-4.37477,0.961095,-0.271564,-0.242452
RefInt_Pool16,0.14194,,-0.17831,-1.09126,2.10639,0.855015,-1.1822,-0.844684,0.332908,0.81211,...,,-0.128845,2.78704,-0.616552,1.56413,-0.931867,0.0812339,0.940649,0.408005,-0.582739


CO

In [3]:
file_name = "S037_COAD_observed_0920.tsv"
file_path = "../../../input/S037_COAD_observed_0920.tsv"

In [40]:
file_name = "S037_COAD_imputed_0920.tsv"
file_path = "../../../input/S037_COAD_imputed_0920.tsv"

In [66]:
def load_prot():
    if file_name == "S037_COAD_observed_0920.tsv":
        df = pd.read_csv(file_path, sep = "\t", na_values = 'NA') # na_vals 'NA' already default for read_csv
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        '''
        # Append ".N" to the patient IDs of normal samples
        df.index = df.index.where(
                    cond=(~df.index.str.contains('N')),
                    other=df.index + ".N")
        df.index = df.index.str.replace(r'^[NC]', '', regex=True)'''
        dt.reformat_normal_patient_ids(data_dict, existing_identifier=None, existing_identifier_location=None):
        #df = average_replicates(df)
        df = df.sort_values(by=["Patient_ID"])
        
        
    if file_name == "S037_COAD_imputed_0920.tsv":
        print('imputed')
        df = pd.read_csv(file_path, sep = "\t") 
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        # Append ".N" to the patient IDs of normal samples
        df.index = df.index.where(
                    cond=(~df.index.str.contains('N')),
                    other=df.index + ".N")
        df.index = df.index.str.replace(r'^[NC]', '', regex=True)
        #df = average_replicates(df)
        df = df.sort_values(by=["Patient_ID"])
    #self._data["somatic_mutation"] = df # Maps dataframe name to dataframe   
    return df 
        

BR

In [4]:
file_name = "S039_BCprospective_observed_0920.tsv"
file_path = "../../../input/S039_BCprospective_observed_0920.tsv"

In [70]:
file_name = "S039_BCprospective_imputed_0920.tsv"
file_path = "../../../input/S039_BCprospective_imputed_0920.tsv"

In [5]:
def load_prot():
    if file_name == "S039_BCprospective_observed_0920.tsv":
        df = pd.read_csv(file_path, sep = "\t", na_values = 'NA') # na_vals 'NA' already default for read_csv
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        df = average_replicates(df)
        df = df.sort_values(by=["Patient_ID"])
        
        
    if file_name == "S039_BCprospective_imputed_0920.tsv":
        print('imputed')
        df = pd.read_csv(file_path, sep = "\t") 
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        df = average_replicates(df)
        df = df.sort_values(by=["Patient_ID"])
    #self._data["somatic_mutation"] = df # Maps dataframe name to dataframe   
    return df 
        

CCRCC

In [44]:
file_name = 'S044_CCRCC_observed_0920.tsv'
file_path = "../../../input/S044_CCRCC_observed_0920.tsv"

In [49]:
file_name = 'S044_CCRCC_imputed_0920.tsv'
file_path = "../../../input/S044_CCRCC_imputed_0920.tsv"

In [50]:
def load_prot():
    if file_name == "S044_CCRCC_observed_0920.tsv":
        df = pd.read_csv(file_path, sep = "\t", na_values = 'NA') # na_vals 'NA' already default for read_csv
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        df = df.sort_values(by=["Patient_ID"])
        
        
    if file_name == "S044_CCRCC_imputed_0920.tsv":
        df = pd.read_csv(file_path, sep = "\t") 
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        df = df.sort_values(by=["Patient_ID"])
    #self._data["somatic_mutation"] = df # Maps dataframe name to dataframe   
    return df 
        

In [68]:
df = load_prot()
len(df.index)
df

Name,A1BG,A1CF,A2M,A2ML1,AAAS,AACS,AADAC,AADAT,AAGAB,AAK1,...,ZSCAN29,ZSCAN32,ZSWIM8,ZW10,ZWILCH,ZWINT,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C01CO005,24.629943,22.917624,27.548108,24.719826,23.030635,23.055448,15.439666,,22.883801,22.369288,...,,,21.895927,23.443447,17.217686,,18.813000,26.512897,23.612940,
C01CO006,24.673760,21.999213,27.696759,24.868843,23.353499,22.858458,,,23.109965,22.458751,...,,,21.709141,23.318620,16.858139,,,26.517464,23.627020,
C01CO008,24.654947,22.401374,26.208268,23.173298,23.455240,22.707250,,,22.824523,22.081327,...,,,21.115609,23.525150,17.569299,,,26.339790,23.858366,
C01CO013,24.174454,22.712058,27.642087,23.468756,23.797973,23.067967,,14.301011,23.010640,22.249563,...,,,,23.658041,16.842165,,,26.082750,23.658261,18.694035
C01CO014,25.216378,22.441844,27.420127,25.151115,23.382811,23.193389,,,23.283321,22.356224,...,,,22.146598,23.617412,17.471794,,18.526062,26.276471,23.682356,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N21CO006,24.949547,22.980890,28.621647,25.077601,23.063305,23.070571,,,22.816231,22.442979,...,,,22.292356,23.219651,16.978031,,18.843028,26.845417,24.041718,
N21CO007,26.496402,22.380037,28.509560,25.849991,22.910929,22.738721,,,22.580697,22.677316,...,,,22.517807,23.263240,16.835348,,19.262815,28.514419,23.696605,
N22CO004,25.714305,22.839871,28.946658,25.251690,23.045450,23.046727,,17.454300,22.858693,22.405387,...,,,22.295576,23.198930,16.417946,,19.190131,27.058971,24.046540,
N22CO006,26.022231,21.818372,27.721179,25.182235,23.100097,22.177184,,,22.549597,22.764859,...,,,,23.194878,,,,28.118556,23.704297,


In [67]:
# Append ".N" to the patient IDs of normal samples
df.index = df.index.where(
            cond=(~df.index.str.contains('N')),
            other=df.index + ".N")
df.index = df.index.str.replace(r'^[NC]', '', regex=True)
df

Name,A1BG,A1CF,A2M,A2ML1,AAAS,AACS,AADAC,AADAT,AAGAB,AAK1,...,ZSCAN29,ZSCAN32,ZSWIM8,ZW10,ZWILCH,ZWINT,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01CO001.N.N,26.279454,21.596329,28.688227,26.214133,22.860268,22.307815,,,22.792996,22.635313,...,,,21.928481,23.178321,,,19.325204,28.527022,23.868817,
01CO005,24.629943,22.917624,27.548108,24.719826,23.030635,23.055448,15.439666,,22.883801,22.369288,...,,,21.895927,23.443447,17.217686,,18.813000,26.512897,23.612940,
01CO005.N.N,25.995046,23.140455,28.750963,25.525662,23.114175,22.961740,,,22.847687,22.372091,...,,,22.090648,23.234877,17.132385,,,26.288567,24.258361,
01CO006,24.673760,21.999213,27.696759,24.868843,23.353499,22.858458,,,23.109965,22.458751,...,,,21.709141,23.318620,16.858139,,,26.517464,23.627020,
01CO006.N.N,26.570903,22.385760,29.130111,25.998532,23.009533,22.236936,,,22.246516,22.507736,...,,,23.025203,23.020001,16.745852,,18.920495,27.690757,23.665908,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22CO006,26.053254,21.467717,27.998671,25.156273,23.500076,22.612589,,,22.665959,22.679167,...,,,21.696700,23.327120,17.303239,,,27.168228,23.412640,
22CO006.N.N,26.022231,21.818372,27.721179,25.182235,23.100097,22.177184,,,22.549597,22.764859,...,,,,23.194878,,,,28.118556,23.704297,
27CO004,24.619892,22.542480,27.543346,25.482953,23.348516,22.795207,,14.741214,22.972287,22.620274,...,,,,23.393558,17.006675,,,26.870429,23.614195,18.181941
27CO004.N.N,26.418830,22.222579,28.350512,26.417675,23.243703,22.313681,16.542370,,22.433907,22.436058,...,,,23.184281,23.128937,,,18.777770,27.461645,23.832992,


In [51]:
repl = lambda m: print(m.group(0))
pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)

foo
bar
baz


0     123
1        
2     NaN
dtype: object

In [None]:
# if has 

In [47]:
# check nan is float
df.dtypes.unique()

array([dtype('float64')], dtype=object)

In [14]:
import cptac as c
col = c.Colon()

                                          

In [37]:
clin = col.get_clinical()
clin.Sample_Tumor_Normal.value_counts()
clin

Name,Sample_Tumor_Normal,Age,CEA,Gender,Lymphatic_Invasion,Mucinous,Perineural_Invasion,Polyps_History,Polyps_Present,Stage,Subsite,Synchronous_Tumors,Tumor.Status,Vascular_Invasion,Vital.Status,pathalogy_N_stage,pathalogy_T_stage
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
01CO001,Tumor,729.0,4.9,Male,Yes,Mucinous,No,No,No,Stage III,Sigmoid Colon,No,Tumor free,Yes,Living,N2b,T4a
01CO005,Tumor,838.0,1.0,Female,No,Not Mucinous,No,Yes,Yes,Stage II,Sigmoid Colon,No,Tumor free,No,Deceased,N0,T3
01CO006,Tumor,904.0,,Female,Yes,Mucinous,Yes,No,No,Stage III,Ascending Colon,Yes,With tumor,Yes,Living,N2b,T4a
01CO008,Tumor,652.0,,Female,No,Mucinous,No,No,Yes,Stage II,Descending Colon,No,With tumor,No,Living,N0,T3
01CO013,Tumor,695.0,,Male,No,Not Mucinous,No,Yes,No,Stage I,Sigmoid Colon,No,Tumor free,No,Living,N0,T2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21CO006.N,Normal,,,,,,,,,,,,,,,,
21CO007.N,Normal,,,,,,,,,,,,,,,,
22CO004.N,Normal,,,,,,,,,,,,,,,,
22CO006.N,Normal,,,,,,,,,,,,,,,,


In [52]:
df.index.value_counts()

CPT0078530001    1
CPT0019990003    1
QC7              1
CPT0086820003    1
CPT0026410003    1
                ..
CPT0089480003    1
CPT0012290003    1
CPT0001220008    1
CPT0086870003    1
CPT0001230001    1
Name: Patient_ID, Length: 207, dtype: int64

In [34]:

df.loc[df.index[~df.index.str.contains('N')]]

Name,A1BG,A1CF,A2M,A2ML1,AAAS,AACS,AADAC,AADAT,AAGAB,AAK1,...,ZSCAN29,ZSCAN32,ZSWIM8,ZW10,ZWILCH,ZWINT,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C01CO005,24.629943,22.917624,27.548108,24.719826,23.030635,23.055448,15.439666,,22.883801,22.369288,...,,,21.895927,23.443447,17.217686,,18.813000,26.512897,23.612940,
C01CO006,24.673760,21.999213,27.696759,24.868843,23.353499,22.858458,,,23.109965,22.458751,...,,,21.709141,23.318620,16.858139,,,26.517464,23.627020,
C01CO008,24.654947,22.401374,26.208268,23.173298,23.455240,22.707250,,,22.824523,22.081327,...,,,21.115609,23.525150,17.569299,,,26.339790,23.858366,
C01CO013,24.174454,22.712058,27.642087,23.468756,23.797973,23.067967,,14.301011,23.010640,22.249563,...,,,,23.658041,16.842165,,,26.082750,23.658261,18.694035
C01CO014,25.216378,22.441844,27.420127,25.151115,23.382811,23.193389,,,23.283321,22.356224,...,,,22.146598,23.617412,17.471794,,18.526062,26.276471,23.682356,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C21CO007,24.803531,23.005107,26.969597,24.411148,23.534149,22.798532,,,22.636499,22.497522,...,15.100107,,21.698427,23.478344,,,,26.410791,23.482844,
C22CO004,24.603826,22.767064,27.236375,,23.619021,23.650281,,,23.067310,22.295823,...,,,19.640944,23.449360,17.197994,14.41988,18.428365,26.351963,23.841696,
C22CO006,26.053254,21.467717,27.998671,25.156273,23.500076,22.612589,,,22.665959,22.679167,...,,,21.696700,23.327120,17.303239,,,27.168228,23.412640,
C27CO004,24.619892,22.542480,27.543346,25.482953,23.348516,22.795207,,14.741214,22.972287,22.620274,...,,,,23.393558,17.006675,,,26.870429,23.614195,18.181941


In [20]:
df2 = df[df.index.str.contains('#')]

#id_list contains only patient_IDs of replicates (without #s)
pid = pd.Series(df2.index)
ids = pid.replace('#\d', '', regex=True)
id_list = list(set(ids))
len(id_list)

25

In [24]:
for patient_ID in id_list:
    id_df = df[df.index.str.contains(patient_ID)]
    vals = list(id_df.mean(axis=0))
    df.loc[patient_ID] = vals
len(df.index)

160

In [58]:
df3 = df[~ df.index.str.contains('#')] # drop replicate cols
df3

Name,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADAT,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001,29.119839,29.969561,23.911997,,24.241829,25.792271,,20.885407,23.867641,24.601799,...,15.939507,25.125830,22.080416,21.144878,16.800087,21.428527,22.021599,28.465878,24.101387,21.414583
01BR008,27.981792,29.463668,26.300232,,24.261375,25.121084,,,24.095237,24.420127,...,,25.347255,22.461650,21.558227,,21.111841,21.737231,28.004736,24.738737,21.676707
01BR009,28.621923,30.569770,26.808487,,24.000083,25.549725,,,23.929608,24.368006,...,,25.304699,21.924364,21.038085,,20.470053,21.991507,28.174315,24.671163,21.392982
01BR010,27.960852,28.818767,23.609111,,24.198715,27.424636,15.474977,23.165785,23.955289,24.497232,...,,25.811141,21.301399,20.757563,15.852175,19.881175,21.476538,28.695744,24.697041,21.488790
01BR015,28.667862,29.174431,24.006172,,24.480487,26.279127,,19.954898,24.177881,24.418099,...,15.982669,25.119131,21.840894,21.672479,16.374077,19.009123,21.845964,28.827827,24.293987,21.885117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11BR015,28.689033,29.251387,24.177059,,24.236137,26.513135,,17.602614,24.077406,24.175180,...,,24.933165,22.733512,20.946895,,20.540080,22.566275,28.119561,25.210009,21.319818
11BR030,28.334103,29.830102,24.684080,,24.003498,26.806280,,19.291722,24.986137,24.788362,...,17.022319,25.503920,22.510956,21.028470,16.303599,20.747265,21.915059,28.458876,25.054101,21.142319
11BR047,28.557662,28.930395,24.250058,,24.083496,26.668538,,,24.842353,24.889333,...,,25.203263,21.747280,20.896745,,19.961190,21.853275,28.545581,24.829232,21.411066
11BR011,27.987600,29.052691,23.398758,,23.814596,26.377069,12.863634,19.453466,24.989873,24.675188,...,17.688854,25.194212,21.742422,20.868857,16.245341,20.116137,21.722472,27.908621,24.803295,21.053509


In [None]:
test = id_list[3]
df[df.index.str.contains(test)]

In [None]:
#id_list contains only unique patient_IDs without #s
pid = pd.Series(df.index)
ids = pid.replace('#\d', '', regex=True)
id_list = list(set(ids))
len(id_list)

In [None]:
new_df = pd.DataFrame()

for patient_ID in id_list:
    df = df[df.index.str.contains(patient_ID)]
    vals = list(df.mean(axis=0))
    df.loc[patient_ID] = vals
    new_df = new_df.append(one_id)

In [None]:
new_df

In [None]:
def average_replicates(index_name, df, identifier_to_drop):
    index_name = re.sub(identifier_to_drop, '', index_name)
    df = df[df.index.str.contains(index_name)]
    vals = list(df.mean(axis=0))
    df.loc[index_name] = vals
    return df

In [None]:
#id_list contains only unique patient_IDs without #s
pid = pd.Series(test_df.index)
ids = pid.replace('#\d', '', regex=True)
id_list = list(set(ids))
len(id_list)

In [None]:
new_df = pd.DataFrame()

for patient_ID in id_list:
    one_id = average_replicates(patient_ID, test_df, '#\d')
    new_df = new_df.append(one_id)

In [None]:
df2 = average_replicates('11BR011', df)
df3 = df[~ df.index.str.contains('#')] # drop replicate cols
df3

In [None]:
# dup, average 
test = {1:[1,2,3], 2:[3,4,5]}
df = pd.DataFrame(test, index = ['a#1','c','b'])
df

In [None]:
n = df[df.index.str.contains('#')]
len(n)
n

In [None]:
pid = pd.Series(n.index)
ids = pid.replace('#\d', '', regex=True)
ids_list = list(ids)
ids_list # list of all Patient_IDs with # (in form without #)

# check if replicates also have a duplicate without a #
print(df.index[df.index.isin(ids_list)])
print('None')

In [None]:
df.index.duplicated().any()

In [None]:
test = df.index.to_series().replace('#\d', '', regex=True)

In [None]:
test