# Load Proteomics (prot names)

In [1]:
import pandas as pd
import numpy as np
import warnings
from functools import reduce
import re

In [2]:
import cptac
import cptac.dataframe_tools as dt

In [4]:
# average replicates
def average_replicates(df):
    replicate_df = df[df.index.str.contains('.')]
    patient_ids = pd.Series(replicate_df.index)
    ids = patient_ids.replace('.\d', '', regex=True)
    id_list = list(set(ids)) #id_list contains only patient_IDs of replicates (without #s)

    for patient_ID in id_list:
        id_df = df[df.index.str.contains(patient_ID)] # slice out replicates for a single patient
        vals = list(id_df.mean(axis=0)) 
        df.loc[patient_ID] = vals # add new row to original df with averages of replicates 

    df = df[~ df.index.str.contains('.')] # drop unaveraged replicate cols (averaged rows are kept)
    return df

# Proteomics (Genecode) parser

In [29]:
gbm_dif = ['RefInt_01Pool', 'RefInt_02Pool', 'RefInt_03Pool', 'RefInt_04Pool',
       'RefInt_05Pool', 'RefInt_06Pool', 'RefInt_07Pool', 'RefInt_08Pool',
       'RefInt_09Pool', 'RefInt_10Pool', 'RefInt_11Pool']

hnscc_dif = ['128C', 'QC2', 'QC3', 'QC4', '129N', 'LungTumor1', 'Pooled-sample14',
       'LungTumor2', 'QC6', 'LungTumor3', 'Pooled-sample17', 'QC7',
       'Pooled-sample19', 'QC9', 'RefInt_pool01', 'RefInt_pool02',
       'RefInt_pool03', 'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06',
       'RefInt_pool07', 'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10',
       'RefInt_pool11', 'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14',
       'RefInt_pool15', 'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18',
       'RefInt_pool19', 'RefInt_pool20'] # has -C ("C3L-00994-C")

luad_dif = ['TumorOnlyIR01', 'NormalOnlyIR02', 'TumorOnlyIR03', 'NormalOnlyIR04',
       'CPT0148080004.1','NormalOnlyIR', 'TumorOnlyIR14',
       'TaiwaneseIR19', 'TumorOnlyIR21', 'TaiwaneseIR22', 'CPT0146580004.1',
       'NormalOnlyIR25', 'RefInt_pool01', 'RefInt_pool02', 'RefInt_pool03',
       'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06', 'RefInt_pool07',
       'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10', 'RefInt_pool11',
       'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14', 'RefInt_pool15',
       'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18', 'RefInt_pool19',
       'RefInt_pool20', 'RefInt_pool21', 'RefInt_pool22', 'RefInt_pool23',
       'RefInt_pool24', 'RefInt_pool25']

lscc_dif = ['LUAD-Global-CR-pool1', 'LSCC-Tumor-ONLY-CR', 'JHU-HNSCC-CR',
       'LUAD-Global-CR-pool-1', 'LSCC-Tumor-ONLY-CR.1', 'JHU-HNSCC-CR.1',
       'LUAD-Global-CR-pool-2', 'JHU-HNSCC-CR.2', 'RefInt_LSCC-Global-CR',
       'RefInt_LSCC-Global-CR.1', 'RefInt_LSCC-Global-CR.2',
       'RefInt_LSCC-Global-CR.3', 'RefInt_LSCC-Global-CR.4',
       'RefInt_LSCC-Global-CR.5', 'RefInt_LSCC-Global-CR.6',
       'RefInt_LSCC-Global-CR.7', 'RefInt_LSCC-Global-CR.8',
       'RefInt_LSCC-Global-CR.9', 'RefInt_LSCC-Global-CR.10',
       'RefInt_LSCC-Global-CR.11', 'RefInt_LSCC-Global-CR.12',
       'RefInt_LSCC-Global-CR.13', 'RefInt_LSCC-Global-CR.14',
       'RefInt_LSCC-Global-CR.15', 'RefInt_LSCC-Global-CR.16',
       'RefInt_LSCC-Global-CR.17', 'RefInt_LSCC-Global-CR.18',
       'RefInt_LSCC-Global-CR.19', 'RefInt_LSCC-Global-CR.20',
       'RefInt_LSCC-Global-CR.21']

br_dif = ['RetroIR', 'CPT0018460005', 'CPT0008140004', 'RetroIR.1',
       'RefInt_Pool01', 'RefInt_Pool02', 'RefInt_Pool03', 'RefInt_Pool04',
       'RefInt_Pool05', 'RefInt_Pool06', 'RefInt_Pool07', 'RefInt_Pool08',
       'RefInt_Pool09', 'RefInt_Pool10', 'RefInt_Pool11', 'RefInt_Pool12',
       'RefInt_Pool13', 'RefInt_Pool14', 'RefInt_Pool15', 'RefInt_Pool16',
       'RefInt_Pool17']

ec_dif = ['NX1', 'NX2', 'NX3', 'NX4', 'NX5', 'NX6', 'NX7', 'NX8', 'NX9', 'NX12',
       'NX17', 'NX13', 'NX14', 'NX10', 'NX16', 'NX18', 'NX11', 'NX15',
       'RefInt_pool01', 'RefInt_pool02', 'RefInt_pool03', 'RefInt_pool04',
       'RefInt_pool05', 'RefInt_pool06', 'RefInt_pool07', 'RefInt_pool08',
       'RefInt_pool09', 'RefInt_pool10', 'RefInt_pool11', 'RefInt_pool12',
       'RefInt_pool13', 'RefInt_pool14', 'RefInt_pool15', 'RefInt_pool16',
       'RefInt_pool17']

co_dif = ['colonRef22-2', 'RefInt_ColonRef01', 'RefInt_ColonRef02',
       'RefInt_ColonRef03', 'RefInt_ColonRef04', 'RefInt_ColonRef05',
       'RefInt_ColonRef06', 'RefInt_ColonRef07', 'RefInt_ColonRef08',
       'RefInt_ColonRef09', 'RefInt_ColonRef10', 'RefInt_ColonRef11',
       'RefInt_ColonRef12', 'RefInt_ColonRef13', 'RefInt_ColonRef14',
       'RefInt_ColonRef15', 'RefInt_ColonRef16', 'RefInt_ColonRef17',
       'RefInt_ColonRef18', 'RefInt_ColonRef19', 'RefInt_ColonRef20',
       'RefInt_ColonRef21', 'RefInt_ColonRef22-1']
ov_dif = ['JHU-QC', 'JHU-QC.1', 'JHU-QC.2', 'JHU-QC.3', 'JHU-QC.4',
       'RefInt_PNNL-JHU-Ref-1', 'RefInt_PNNL-JHU-Ref-2',
       'RefInt_PNNL-JHU-Ref-3', 'RefInt_PNNL-JHU-Ref-4',
       'RefInt_PNNL-JHU-Ref-5', 'RefInt_PNNL-JHU-Ref-6',
       'RefInt_PNNL-JHU-Ref-7', 'RefInt_PNNL-JHU-Ref-8',
       'RefInt_PNNL-JHU-Ref-9', 'RefInt_PNNL-JHU-Ref-10',
       'RefInt_PNNL-JHU-Ref-11', 'RefInt_PNNL-JHU-Ref-12']

#hcc_dif = df.index[-33:] #(last 33)

ccrcc_dif = ['NCI7-1', 'QC1', 'QC2', 'QC3', 'NCI7-2', 'NCI7-3', 'QC4', 'NCI7-4',
       'NCI7-5', 'QC5', 'QC6', 'QC7', 'QC8', 'RefInt_pool01', 'RefInt_pool02',
       'RefInt_pool03', 'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06',
       'RefInt_pool07', 'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10',
       'RefInt_pool11', 'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14',
       'RefInt_pool15', 'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18',
       'RefInt_pool19', 'RefInt_pool20', 'RefInt_pool21', 'RefInt_pool22',
       'RefInt_pool23']

In [30]:
drop_cols = {'GBM':gbm_dif,'HNSCC':hnscc_dif, 'LSCC':lscc_dif,'LUAD':luad_dif,
             'BR':br_dif,'EC':ec_dif,'CO':co_dif,'OV':ov_dif,'ccRCC':ccrcc_dif} #add HCC, 'HNSCC':hnscc_dif,

In [31]:
cancer_names = ['GBM','LSCC','LUAD','BR','EC','CO','OV','ccRCC'] # HCC (not in drop-cols), HNSCC
#ca = cancer_names[0]
ca = 'HNSCC'
file_name = ca+"_Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv" 
file_path = "../../../proteomics/prot_names/"+file_name

# Get df that maps aliquot_IDs to Patient_IDs
mapping_df = pd.read_csv('../../../input/aliquot_to_patient_ID.tsv', delimiter = '\t', index_col = 0)

In [57]:
if file_name == file_name:
    df = pd.read_csv(file_path, sep = "\t") # CHECK na_vals 'NA' already default for read_csv
    df = df.drop(columns = ['MaxPepProb', 'NumberPSM']) #index is protein identifier (duplicate)
    df.Index = df.Index.apply(lambda x: x.split('|')[5]) # Get gene name from position in list of gene identifiers
    df = df.rename(columns = {'Index':'Proteins', 'Gene':'Database_ID'})
    df = df.set_index(['Proteins', 'Database_ID']) # set multiindex
    df = df.transpose()
    ref_intensities = df.loc["ReferenceIntensity"] # Get reference intensities to use to calculate ratios 
    df = df.subtract(ref_intensities, axis="columns") # Subtract reference intensities from all the values, to get ratios
    df = df.iloc[1:,:] # drop ReferenceIntensity row 
    df.index.name = 'Patient_ID'
    
    
    # Get Patient_IDs
    # slice mapping_df to include cancer specific aliquot_IDs 
    index_list = list(df.index)
    cancer_df = mapping_df.loc[mapping_df['aliquot_ID'].isin(index_list)]
    
    # Create dictionary with aliquot_ID as keys and patient_ID as values
    matched_ids = {}
    for i, row in cancer_df.iterrows():
        matched_ids[row['aliquot_ID']] = row['patient_ID']

    df = df.reset_index()
    df = df.replace(matched_ids) # replace aliquot_IDs with Patient_IDs
    df = df.set_index('Patient_ID')
    
    # If mapping is not necessary, tumor and normal are specified with -T or -N
    #df.index = df.index.str.replace('-T$','')
    #df.index = df.index.str.replace('-N$','.N')
    
    # Drop qauality control and ref intensity cols
    df = df.drop(drop_cols[ca], axis = 'index')
    
    # Sort values
    if ca == 'GBM':
        normal = df.loc[df.index.str.contains('^PT-')]
        normal = normal.sort_values(by=["Patient_ID"])
        tumor = df.loc[~ df.index.str.contains('^PT-')]
        tumor = tumor.sort_values(by=["Patient_ID"])
    else:
        normal = df.loc[df.index.str.contains('.N$')]
        normal = normal.sort_values(by=["Patient_ID"])
        tumor = df.loc[~ df.index.str.contains('.N$')]
        tumor = tumor.sort_values(by=["Patient_ID"])
    all_df = tumor.append(normal)

    

In [70]:
# average replicates
def average_replicates(df, cancer, omics_type= 'prot'):
    if omics_type == 'prot':
        dup_identifier = {'HNSCC':'-duplicate\d?'}
    replicate_df = df[df.index.str.contains(dup_identifier[cancer])]
    patient_ids = pd.Series(replicate_df.index)
    ids = patient_ids.replace(dup_identifier[cancer], '', regex=True)
    
    id_list = list(set(ids)) #id_list contains only patient_IDs of replicates (without #s)
    print(id_list)

    for patient_ID in id_list:
        id_df = df[df.index.str.contains(patient_ID)] # slice out replicates for a single patient
        vals = list(id_df.mean(axis=0)) 
        df.loc[patient_ID] = vals # replace row of index in original df with averages of replicates 

    new_df = df[~ df.index.str.contains(dup_identifier[cancer])] # drop unaveraged replicate cols (averaged rows are kept)
    return new_df

In [71]:
t = average_replicates(all_df, 'HNSCC')

['C3L-02617-N', 'C3L-00994-N', 'C3L-02617-T']


In [66]:
i = 'C3L-02617-T'
t.loc[[i]]

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,HS3ST1-201,SEMA3F-201,CFTR-201,CYP51A1-201,...,BTD-216,TNK2-222,ETNK1-207,MYO6-218,MPZ-208,EED-213,DDHD1-211,ZBTB3-204,WIZ-211,RFX7-204
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000002587.10,ENSG00000001617.12,ENSG00000001626.16,ENSG00000001630.17,...,ENSG00000169814.16,ENSG00000061938.19,ENSG00000139163.16,ENSG00000196586.16,ENSG00000158887.18,ENSG00000074266.21,ENSG00000100523.16,ENSG00000185670.9,ENSG00000011451.21,ENSG00000181827.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C3L-02617-T,0.065139,0.43202,-0.155147,0.443621,0.130909,-0.335009,0.283397,-0.379606,,-0.343839,...,-0.606679,-0.564172,0.016241,0.696018,-3.408587,0.608888,0.141493,,0.38858,-0.185216


In [61]:
all_df.loc[all_df.index.str.contains(i)]

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,HS3ST1-201,SEMA3F-201,CFTR-201,CYP51A1-201,...,BTD-216,TNK2-222,ETNK1-207,MYO6-218,MPZ-208,EED-213,DDHD1-211,ZBTB3-204,WIZ-211,RFX7-204
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000002587.10,ENSG00000001617.12,ENSG00000001626.16,ENSG00000001630.17,...,ENSG00000169814.16,ENSG00000061938.19,ENSG00000139163.16,ENSG00000196586.16,ENSG00000158887.18,ENSG00000074266.21,ENSG00000100523.16,ENSG00000185670.9,ENSG00000011451.21,ENSG00000181827.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C3L-02617-C,0.118072,-0.740248,0.14358,0.194698,-0.054436,-0.317078,,0.84298,,-0.694189,...,0.421275,-0.031932,-0.609972,0.298702,0.339297,-0.501359,-0.038101,,0.246113,-0.497083
C3L-02617-N-duplicate,0.488057,-0.026414,0.244815,-0.168083,-0.023441,0.599488,,0.710336,,0.145823,...,0.705619,-0.263757,0.221609,1.378241,0.4861,-0.38455,-0.195894,,-0.229596,-0.132535
C3L-02617-N-duplicate2,0.323258,0.298737,0.043573,0.651542,0.360623,0.174971,,-0.184014,,-0.109994,...,-0.695894,0.006175,0.596314,-0.686279,-2.202057,0.856671,0.026905,,0.462417,0.049286
C3L-02617-T,0.144595,0.310266,-0.083823,0.485506,0.180542,-0.132502,0.174613,-0.376189,,-0.256262,...,-0.69076,-0.564172,0.220322,0.534818,-2.964524,0.651728,0.317106,,0.367816,-0.185216
C3L-02617-T-duplicate,0.038653,0.472604,-0.178921,0.429659,0.114365,-0.402511,0.319659,-0.380745,,-0.373032,...,-0.578653,-0.564172,-0.051787,0.749752,-3.556608,0.594608,0.082956,,0.395501,
C3L-02617-N,0.455392,0.064951,0.277133,0.06919,0.079219,0.48023,1.141664,0.187654,,0.048889,...,0.222896,0.02195,0.091349,0.349869,-0.402991,0.010821,-0.373351,,-0.027829,-0.041624


In [41]:
all_df[all_df.index.str.contains('-duplicate\d?')]

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,HS3ST1-201,SEMA3F-201,CFTR-201,CYP51A1-201,...,BTD-216,TNK2-222,ETNK1-207,MYO6-218,MPZ-208,EED-213,DDHD1-211,ZBTB3-204,WIZ-211,RFX7-204
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000002587.10,ENSG00000001617.12,ENSG00000001626.16,ENSG00000001630.17,...,ENSG00000169814.16,ENSG00000061938.19,ENSG00000139163.16,ENSG00000196586.16,ENSG00000158887.18,ENSG00000074266.21,ENSG00000100523.16,ENSG00000185670.9,ENSG00000011451.21,ENSG00000181827.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C3L-00994-N-duplicate,0.021122,-0.469606,0.470709,-0.109351,0.671424,-0.329171,,0.545515,,-0.515633,...,1.051994,0.235913,-0.482369,-0.291613,0.407172,-0.403676,-0.359891,,-0.277898,0.309567
C3L-02617-N-duplicate,0.488057,-0.026414,0.244815,-0.168083,-0.023441,0.599488,,0.710336,,0.145823,...,0.705619,-0.263757,0.221609,1.378241,0.4861,-0.38455,-0.195894,,-0.229596,-0.132535
C3L-02617-N-duplicate2,0.323258,0.298737,0.043573,0.651542,0.360623,0.174971,,-0.184014,,-0.109994,...,-0.695894,0.006175,0.596314,-0.686279,-2.202057,0.856671,0.026905,,0.462417,0.049286
C3L-02617-T-duplicate,0.038653,0.472604,-0.178921,0.429659,0.114365,-0.402511,0.319659,-0.380745,,-0.373032,...,-0.578653,-0.564172,-0.051787,0.749752,-3.556608,0.594608,0.082956,,0.395501,
L-T-duplicate,,,,,,,,,,,...,,,,,,,,,,
L-N-duplicate,,,,,,,,,,,...,,,,,,,,,,


In [33]:
print(ca)
all_df.index.to_list()
#all_df.loc[~ all_df.index.str.contains('^C\d')]

HNSCC


['C3L-00977',
 'C3L-00987',
 'C3L-00994',
 'C3L-00994-C',
 'C3L-00994-N-duplicate',
 'C3L-00995',
 'C3L-00997',
 'C3L-00999',
 'C3L-01138',
 'C3L-01237',
 'C3L-02617',
 'C3L-02617-C',
 'C3L-02617-N-duplicate',
 'C3L-02617-N-duplicate2',
 'C3L-02617-T-duplicate',
 'C3L-02621',
 'C3L-02651',
 'C3L-03378',
 'C3L-04025',
 'C3L-04350-C',
 'C3L-04354',
 'C3L-04791',
 'C3L-04844',
 'C3L-04849',
 'C3L-05257-C',
 'C3N-00204',
 'C3N-00295',
 'C3N-00297',
 'C3N-00299',
 'C3N-00306',
 'C3N-00307',
 'C3N-00498',
 'C3N-00519',
 'C3N-00822',
 'C3N-00825',
 'C3N-00828',
 'C3N-00829',
 'C3N-00846',
 'C3N-00857',
 'C3N-00871',
 'C3N-01337',
 'C3N-01338',
 'C3N-01339',
 'C3N-01340',
 'C3N-01620',
 'C3N-01643',
 'C3N-01645',
 'C3N-01752',
 'C3N-01754',
 'C3N-01755',
 'C3N-01756',
 'C3N-01757',
 'C3N-01757-C',
 'C3N-01758',
 'C3N-01858',
 'C3N-01859',
 'C3N-01943',
 'C3N-01944',
 'C3N-01945',
 'C3N-01946',
 'C3N-01947',
 'C3N-01948',
 'C3N-02275',
 'C3N-02279',
 'C3N-02333',
 'C3N-02693',
 'C3N-02694',
 'C

In [None]:
dup = {'HNSCC': '-duplicate'}

In [96]:
replicate_df = all_df[all_df.index.str.contains('\.\d')]
replicate_df

Proteins,ARF5-201,M6PR-201,ESRRA-201,FKBP4-201,NDUFAF7-201,FUCA2-201,HS3ST1-201,SEMA3F-201,CFTR-201,CYP51A1-201,...,BTD-216,TNK2-222,ETNK1-207,MYO6-218,MPZ-208,EED-213,DDHD1-211,ZBTB3-204,WIZ-211,RFX7-204
Database_ID,ENSG00000004059.11,ENSG00000003056.8,ENSG00000173153.16,ENSG00000004478.8,ENSG00000003509.16,ENSG00000001036.14,ENSG00000002587.10,ENSG00000001617.12,ENSG00000001626.16,ENSG00000001630.17,...,ENSG00000169814.16,ENSG00000061938.19,ENSG00000139163.16,ENSG00000196586.16,ENSG00000158887.18,ENSG00000074266.21,ENSG00000100523.16,ENSG00000185670.9,ENSG00000011451.21,ENSG00000181827.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2


In [42]:
print(ca, len(all_df.index))
list(all_df.index)

BR 149


['01BR001',
 '01BR008',
 '01BR009',
 '01BR010',
 '01BR015',
 '01BR017',
 '01BR018',
 '01BR020',
 '01BR023',
 '01BR025',
 '01BR026',
 '01BR027',
 '01BR027.1',
 '01BR030',
 '01BR031',
 '01BR032',
 '01BR033',
 '01BR040',
 '01BR042',
 '01BR043',
 '03BR002',
 '03BR004',
 '03BR005',
 '03BR006',
 '03BR010',
 '03BR011',
 '03BR013',
 '05BR001',
 '05BR003',
 '05BR004',
 '05BR005',
 '05BR009',
 '05BR016',
 '05BR026',
 '05BR029',
 '05BR029.1',
 '05BR038',
 '05BR042',
 '05BR043',
 '05BR044',
 '05BR045',
 '06BR003',
 '06BR005',
 '06BR006',
 '06BR014',
 '09BR001',
 '09BR004',
 '09BR005',
 '09BR007',
 '11BR003',
 '11BR004',
 '11BR006',
 '11BR006.1',
 '11BR009',
 '11BR010',
 '11BR011',
 '11BR011.1',
 '11BR012',
 '11BR013',
 '11BR014',
 '11BR015',
 '11BR015.1',
 '11BR016',
 '11BR017',
 '11BR017.1',
 '11BR018',
 '11BR019',
 '11BR020',
 '11BR020.1',
 '11BR022',
 '11BR023',
 '11BR023.1',
 '11BR024',
 '11BR024.1',
 '11BR025',
 '11BR025.1',
 '11BR027',
 '11BR028',
 '11BR028.1',
 '11BR030',
 '11BR030.1',
 '11

In [None]:
# FIX gbm one value didn't map (CPT0206330003)

In [147]:
a = df.Patient_ID[df.Patient_ID.str.contains('CPT')]
print(ca)
print('Num of aliquot IDs that didnt map:', len(a))
list(a)

ccRCC
Num of aliquot IDs that didnt map: 0


[]

In [134]:
df.columns[df.columns.duplicated()]

Index([], dtype='object', name='Name')

In [135]:
df[df.columns[df.columns.str.contains('HSPA14')]]

Name,HSPA14-201
Patient_ID,Unnamed: 1_level_1
CPT0204410003,-0.361447
CPT0206670004,0.141333
CPT0089150003,0.140285
CPT0207030003,-0.00394528
CPT0190240004,-0.168603
...,...
RefInt_07Pool,0.118854
RefInt_08Pool,0.491971
RefInt_09Pool,-0.299196
RefInt_10Pool,-1.03101


In [87]:
df.apply(lambda x: x - x[0])

Proteins,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,BX276092.9,EXOC3L2,HSPA14,SCO2,AC073111.4,EEF1AKMT4,AL022312.1,AL034430.2,ASDURF,DERPC
ReferenceIntensity,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11BR047,0.535075,,0.217075,-0.187385,-0.750757,-0.5773,0.351452,-0.0345761,-0.373106,-0.427952,...,,0.383079,0.208105,-0.91557,-0.165403,-0.839767,-0.293004,-0.427757,0.0692465,-0.604685
11BR043,-1.34483,,-0.170901,0.161888,-0.0275996,-0.106093,-0.00657508,-0.953963,-0.170225,-0.0453135,...,,-0.481647,-0.00317177,0.288232,0.511197,0.174767,0.0581879,0.854134,0.251461,0.375694
11BR049,-0.522723,,0.159291,0.134471,0.450864,-0.21719,-0.0903709,-0.0547697,-0.0090229,0.0252724,...,,-0.512862,0.300357,-0.221562,-0.176503,-0.249309,-0.875836,-0.110516,0.134251,0.0152991
11BR023,-0.372009,,-0.599995,-0.675929,0.551381,0.096102,-0.141114,-0.342601,-0.0147516,0.00283796,...,,-0.692607,0.731301,-0.573085,0.0888565,0.0765025,-0.318681,0.587145,0.469055,0.011444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RefInt_Pool13,0.199324,,0.236029,1.6212,,,-0.923242,-0.0716306,-0.209051,-0.303849,...,,0.985352,,-0.119028,,,-1.04456,-2.29468,-0.298844,-0.313032
RefInt_Pool14,-1.16274,,-0.838634,-2.0985,,,1.38507,-0.481399,-0.397669,-1.49793,...,,-1.98226,,-1.24684,,,,-1.29706,,-0.760521
RefInt_Pool15,,,-0.135811,-2.91037,,-1.43839,1.41795,-0.134708,-0.22627,-0.312162,...,,,,-1.3704,,,-4.37477,0.961095,-0.271564,-0.242452
RefInt_Pool16,0.14194,,-0.17831,-1.09126,2.10639,0.855015,-1.1822,-0.844684,0.332908,0.81211,...,,-0.128845,2.78704,-0.616552,1.56413,-0.931867,0.0812339,0.940649,0.408005,-0.582739


CO

In [3]:
file_name = "S037_COAD_observed_0920.tsv"
file_path = "../../../input/S037_COAD_observed_0920.tsv"

In [40]:
file_name = "S037_COAD_imputed_0920.tsv"
file_path = "../../../input/S037_COAD_imputed_0920.tsv"

In [66]:
def load_prot():
    if file_name == "S037_COAD_observed_0920.tsv":
        df = pd.read_csv(file_path, sep = "\t", na_values = 'NA') # na_vals 'NA' already default for read_csv
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        '''
        # Append ".N" to the patient IDs of normal samples
        df.index = df.index.where(
                    cond=(~df.index.str.contains('N')),
                    other=df.index + ".N")
        df.index = df.index.str.replace(r'^[NC]', '', regex=True)'''
        dt.reformat_normal_patient_ids(data_dict, existing_identifier=None, existing_identifier_location=None):
        #df = average_replicates(df)
        df = df.sort_values(by=["Patient_ID"])
        
        
    if file_name == "S037_COAD_imputed_0920.tsv":
        print('imputed')
        df = pd.read_csv(file_path, sep = "\t") 
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        # Append ".N" to the patient IDs of normal samples
        df.index = df.index.where(
                    cond=(~df.index.str.contains('N')),
                    other=df.index + ".N")
        df.index = df.index.str.replace(r'^[NC]', '', regex=True)
        #df = average_replicates(df)
        df = df.sort_values(by=["Patient_ID"])
    #self._data["somatic_mutation"] = df # Maps dataframe name to dataframe   
    return df 
        

BR

In [4]:
file_name = "S039_BCprospective_observed_0920.tsv"
file_path = "../../../input/S039_BCprospective_observed_0920.tsv"

In [70]:
file_name = "S039_BCprospective_imputed_0920.tsv"
file_path = "../../../input/S039_BCprospective_imputed_0920.tsv"

In [5]:
def load_prot():
    if file_name == "S039_BCprospective_observed_0920.tsv":
        df = pd.read_csv(file_path, sep = "\t", na_values = 'NA') # na_vals 'NA' already default for read_csv
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        df = average_replicates(df)
        df = df.sort_values(by=["Patient_ID"])
        
        
    if file_name == "S039_BCprospective_imputed_0920.tsv":
        print('imputed')
        df = pd.read_csv(file_path, sep = "\t") 
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        df = average_replicates(df)
        df = df.sort_values(by=["Patient_ID"])
    #self._data["somatic_mutation"] = df # Maps dataframe name to dataframe   
    return df 
        

CCRCC

In [44]:
file_name = 'S044_CCRCC_observed_0920.tsv'
file_path = "../../../input/S044_CCRCC_observed_0920.tsv"

In [49]:
file_name = 'S044_CCRCC_imputed_0920.tsv'
file_path = "../../../input/S044_CCRCC_imputed_0920.tsv"

In [50]:
def load_prot():
    if file_name == "S044_CCRCC_observed_0920.tsv":
        df = pd.read_csv(file_path, sep = "\t", na_values = 'NA') # na_vals 'NA' already default for read_csv
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        df = df.sort_values(by=["Patient_ID"])
        
        
    if file_name == "S044_CCRCC_imputed_0920.tsv":
        df = pd.read_csv(file_path, sep = "\t") 
        df = df.transpose()
        df.index.name = 'Patient_ID'
        df.columns.name = 'Name'
        df = df.sort_values(by=["Patient_ID"])
    #self._data["somatic_mutation"] = df # Maps dataframe name to dataframe   
    return df 
        

In [68]:
df = load_prot()
len(df.index)
df

Name,A1BG,A1CF,A2M,A2ML1,AAAS,AACS,AADAC,AADAT,AAGAB,AAK1,...,ZSCAN29,ZSCAN32,ZSWIM8,ZW10,ZWILCH,ZWINT,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C01CO005,24.629943,22.917624,27.548108,24.719826,23.030635,23.055448,15.439666,,22.883801,22.369288,...,,,21.895927,23.443447,17.217686,,18.813000,26.512897,23.612940,
C01CO006,24.673760,21.999213,27.696759,24.868843,23.353499,22.858458,,,23.109965,22.458751,...,,,21.709141,23.318620,16.858139,,,26.517464,23.627020,
C01CO008,24.654947,22.401374,26.208268,23.173298,23.455240,22.707250,,,22.824523,22.081327,...,,,21.115609,23.525150,17.569299,,,26.339790,23.858366,
C01CO013,24.174454,22.712058,27.642087,23.468756,23.797973,23.067967,,14.301011,23.010640,22.249563,...,,,,23.658041,16.842165,,,26.082750,23.658261,18.694035
C01CO014,25.216378,22.441844,27.420127,25.151115,23.382811,23.193389,,,23.283321,22.356224,...,,,22.146598,23.617412,17.471794,,18.526062,26.276471,23.682356,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N21CO006,24.949547,22.980890,28.621647,25.077601,23.063305,23.070571,,,22.816231,22.442979,...,,,22.292356,23.219651,16.978031,,18.843028,26.845417,24.041718,
N21CO007,26.496402,22.380037,28.509560,25.849991,22.910929,22.738721,,,22.580697,22.677316,...,,,22.517807,23.263240,16.835348,,19.262815,28.514419,23.696605,
N22CO004,25.714305,22.839871,28.946658,25.251690,23.045450,23.046727,,17.454300,22.858693,22.405387,...,,,22.295576,23.198930,16.417946,,19.190131,27.058971,24.046540,
N22CO006,26.022231,21.818372,27.721179,25.182235,23.100097,22.177184,,,22.549597,22.764859,...,,,,23.194878,,,,28.118556,23.704297,


In [67]:
# Append ".N" to the patient IDs of normal samples
df.index = df.index.where(
            cond=(~df.index.str.contains('N')),
            other=df.index + ".N")
df.index = df.index.str.replace(r'^[NC]', '', regex=True)
df

Name,A1BG,A1CF,A2M,A2ML1,AAAS,AACS,AADAC,AADAT,AAGAB,AAK1,...,ZSCAN29,ZSCAN32,ZSWIM8,ZW10,ZWILCH,ZWINT,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01CO001.N.N,26.279454,21.596329,28.688227,26.214133,22.860268,22.307815,,,22.792996,22.635313,...,,,21.928481,23.178321,,,19.325204,28.527022,23.868817,
01CO005,24.629943,22.917624,27.548108,24.719826,23.030635,23.055448,15.439666,,22.883801,22.369288,...,,,21.895927,23.443447,17.217686,,18.813000,26.512897,23.612940,
01CO005.N.N,25.995046,23.140455,28.750963,25.525662,23.114175,22.961740,,,22.847687,22.372091,...,,,22.090648,23.234877,17.132385,,,26.288567,24.258361,
01CO006,24.673760,21.999213,27.696759,24.868843,23.353499,22.858458,,,23.109965,22.458751,...,,,21.709141,23.318620,16.858139,,,26.517464,23.627020,
01CO006.N.N,26.570903,22.385760,29.130111,25.998532,23.009533,22.236936,,,22.246516,22.507736,...,,,23.025203,23.020001,16.745852,,18.920495,27.690757,23.665908,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22CO006,26.053254,21.467717,27.998671,25.156273,23.500076,22.612589,,,22.665959,22.679167,...,,,21.696700,23.327120,17.303239,,,27.168228,23.412640,
22CO006.N.N,26.022231,21.818372,27.721179,25.182235,23.100097,22.177184,,,22.549597,22.764859,...,,,,23.194878,,,,28.118556,23.704297,
27CO004,24.619892,22.542480,27.543346,25.482953,23.348516,22.795207,,14.741214,22.972287,22.620274,...,,,,23.393558,17.006675,,,26.870429,23.614195,18.181941
27CO004.N.N,26.418830,22.222579,28.350512,26.417675,23.243703,22.313681,16.542370,,22.433907,22.436058,...,,,23.184281,23.128937,,,18.777770,27.461645,23.832992,


In [51]:
repl = lambda m: print(m.group(0))
pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)

foo
bar
baz


0     123
1        
2     NaN
dtype: object

In [None]:
# if has 

In [47]:
# check nan is float
df.dtypes.unique()

array([dtype('float64')], dtype=object)

In [14]:
import cptac as c
col = c.Colon()

                                          

In [37]:
clin = col.get_clinical()
clin.Sample_Tumor_Normal.value_counts()
clin

Name,Sample_Tumor_Normal,Age,CEA,Gender,Lymphatic_Invasion,Mucinous,Perineural_Invasion,Polyps_History,Polyps_Present,Stage,Subsite,Synchronous_Tumors,Tumor.Status,Vascular_Invasion,Vital.Status,pathalogy_N_stage,pathalogy_T_stage
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
01CO001,Tumor,729.0,4.9,Male,Yes,Mucinous,No,No,No,Stage III,Sigmoid Colon,No,Tumor free,Yes,Living,N2b,T4a
01CO005,Tumor,838.0,1.0,Female,No,Not Mucinous,No,Yes,Yes,Stage II,Sigmoid Colon,No,Tumor free,No,Deceased,N0,T3
01CO006,Tumor,904.0,,Female,Yes,Mucinous,Yes,No,No,Stage III,Ascending Colon,Yes,With tumor,Yes,Living,N2b,T4a
01CO008,Tumor,652.0,,Female,No,Mucinous,No,No,Yes,Stage II,Descending Colon,No,With tumor,No,Living,N0,T3
01CO013,Tumor,695.0,,Male,No,Not Mucinous,No,Yes,No,Stage I,Sigmoid Colon,No,Tumor free,No,Living,N0,T2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21CO006.N,Normal,,,,,,,,,,,,,,,,
21CO007.N,Normal,,,,,,,,,,,,,,,,
22CO004.N,Normal,,,,,,,,,,,,,,,,
22CO006.N,Normal,,,,,,,,,,,,,,,,


In [52]:
df.index.value_counts()

CPT0078530001    1
CPT0019990003    1
QC7              1
CPT0086820003    1
CPT0026410003    1
                ..
CPT0089480003    1
CPT0012290003    1
CPT0001220008    1
CPT0086870003    1
CPT0001230001    1
Name: Patient_ID, Length: 207, dtype: int64

In [34]:

df.loc[df.index[~df.index.str.contains('N')]]

Name,A1BG,A1CF,A2M,A2ML1,AAAS,AACS,AADAC,AADAT,AAGAB,AAK1,...,ZSCAN29,ZSCAN32,ZSWIM8,ZW10,ZWILCH,ZWINT,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C01CO005,24.629943,22.917624,27.548108,24.719826,23.030635,23.055448,15.439666,,22.883801,22.369288,...,,,21.895927,23.443447,17.217686,,18.813000,26.512897,23.612940,
C01CO006,24.673760,21.999213,27.696759,24.868843,23.353499,22.858458,,,23.109965,22.458751,...,,,21.709141,23.318620,16.858139,,,26.517464,23.627020,
C01CO008,24.654947,22.401374,26.208268,23.173298,23.455240,22.707250,,,22.824523,22.081327,...,,,21.115609,23.525150,17.569299,,,26.339790,23.858366,
C01CO013,24.174454,22.712058,27.642087,23.468756,23.797973,23.067967,,14.301011,23.010640,22.249563,...,,,,23.658041,16.842165,,,26.082750,23.658261,18.694035
C01CO014,25.216378,22.441844,27.420127,25.151115,23.382811,23.193389,,,23.283321,22.356224,...,,,22.146598,23.617412,17.471794,,18.526062,26.276471,23.682356,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C21CO007,24.803531,23.005107,26.969597,24.411148,23.534149,22.798532,,,22.636499,22.497522,...,15.100107,,21.698427,23.478344,,,,26.410791,23.482844,
C22CO004,24.603826,22.767064,27.236375,,23.619021,23.650281,,,23.067310,22.295823,...,,,19.640944,23.449360,17.197994,14.41988,18.428365,26.351963,23.841696,
C22CO006,26.053254,21.467717,27.998671,25.156273,23.500076,22.612589,,,22.665959,22.679167,...,,,21.696700,23.327120,17.303239,,,27.168228,23.412640,
C27CO004,24.619892,22.542480,27.543346,25.482953,23.348516,22.795207,,14.741214,22.972287,22.620274,...,,,,23.393558,17.006675,,,26.870429,23.614195,18.181941


In [20]:
df2 = df[df.index.str.contains('#')]

#id_list contains only patient_IDs of replicates (without #s)
pid = pd.Series(df2.index)
ids = pid.replace('#\d', '', regex=True)
id_list = list(set(ids))
len(id_list)

25

In [24]:
for patient_ID in id_list:
    id_df = df[df.index.str.contains(patient_ID)]
    vals = list(id_df.mean(axis=0))
    df.loc[patient_ID] = vals
len(df.index)

160

In [58]:
df3 = df[~ df.index.str.contains('#')] # drop replicate cols
df3

Name,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADAT,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001,29.119839,29.969561,23.911997,,24.241829,25.792271,,20.885407,23.867641,24.601799,...,15.939507,25.125830,22.080416,21.144878,16.800087,21.428527,22.021599,28.465878,24.101387,21.414583
01BR008,27.981792,29.463668,26.300232,,24.261375,25.121084,,,24.095237,24.420127,...,,25.347255,22.461650,21.558227,,21.111841,21.737231,28.004736,24.738737,21.676707
01BR009,28.621923,30.569770,26.808487,,24.000083,25.549725,,,23.929608,24.368006,...,,25.304699,21.924364,21.038085,,20.470053,21.991507,28.174315,24.671163,21.392982
01BR010,27.960852,28.818767,23.609111,,24.198715,27.424636,15.474977,23.165785,23.955289,24.497232,...,,25.811141,21.301399,20.757563,15.852175,19.881175,21.476538,28.695744,24.697041,21.488790
01BR015,28.667862,29.174431,24.006172,,24.480487,26.279127,,19.954898,24.177881,24.418099,...,15.982669,25.119131,21.840894,21.672479,16.374077,19.009123,21.845964,28.827827,24.293987,21.885117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11BR015,28.689033,29.251387,24.177059,,24.236137,26.513135,,17.602614,24.077406,24.175180,...,,24.933165,22.733512,20.946895,,20.540080,22.566275,28.119561,25.210009,21.319818
11BR030,28.334103,29.830102,24.684080,,24.003498,26.806280,,19.291722,24.986137,24.788362,...,17.022319,25.503920,22.510956,21.028470,16.303599,20.747265,21.915059,28.458876,25.054101,21.142319
11BR047,28.557662,28.930395,24.250058,,24.083496,26.668538,,,24.842353,24.889333,...,,25.203263,21.747280,20.896745,,19.961190,21.853275,28.545581,24.829232,21.411066
11BR011,27.987600,29.052691,23.398758,,23.814596,26.377069,12.863634,19.453466,24.989873,24.675188,...,17.688854,25.194212,21.742422,20.868857,16.245341,20.116137,21.722472,27.908621,24.803295,21.053509


In [None]:
test = id_list[3]
df[df.index.str.contains(test)]

In [None]:
#id_list contains only unique patient_IDs without #s
pid = pd.Series(df.index)
ids = pid.replace('#\d', '', regex=True)
id_list = list(set(ids))
len(id_list)

In [None]:
new_df = pd.DataFrame()

for patient_ID in id_list:
    df = df[df.index.str.contains(patient_ID)]
    vals = list(df.mean(axis=0))
    df.loc[patient_ID] = vals
    new_df = new_df.append(one_id)

In [None]:
new_df

In [None]:
def average_replicates(index_name, df, identifier_to_drop):
    index_name = re.sub(identifier_to_drop, '', index_name)
    df = df[df.index.str.contains(index_name)]
    vals = list(df.mean(axis=0))
    df.loc[index_name] = vals
    return df

In [None]:
#id_list contains only unique patient_IDs without #s
pid = pd.Series(test_df.index)
ids = pid.replace('#\d', '', regex=True)
id_list = list(set(ids))
len(id_list)

In [None]:
new_df = pd.DataFrame()

for patient_ID in id_list:
    one_id = average_replicates(patient_ID, test_df, '#\d')
    new_df = new_df.append(one_id)

In [None]:
df2 = average_replicates('11BR011', df)
df3 = df[~ df.index.str.contains('#')] # drop replicate cols
df3

In [None]:
# dup, average 
test = {1:[1,2,3], 2:[3,4,5]}
df = pd.DataFrame(test, index = ['a#1','c','b'])
df

In [None]:
n = df[df.index.str.contains('#')]
len(n)
n

In [None]:
pid = pd.Series(n.index)
ids = pid.replace('#\d', '', regex=True)
ids_list = list(ids)
ids_list # list of all Patient_IDs with # (in form without #)

# check if replicates also have a duplicate without a #
print(df.index[df.index.isin(ids_list)])
print('None')

In [None]:
df.index.duplicated().any()

In [None]:
test = df.index.to_series().replace('#\d', '', regex=True)

In [None]:
test