In [1]:
import pandas as pd
import numpy as np
import cptac
import cptac.dataframe_tools as dt

This notebook maps aliquot IDs (e.g., CPT0001580009) to sample IDs (e.g., C3L-00001-02), each of which is known to be either a tumor or a normal sample. Sample IDs are then mapped to case IDs (e.g., C3L-00001), which we call Patient_ID; normal samples get a ".N" suffix (e.g., C3N-05923.N).

# Get mapper for ids

Get dictionary with aliquot IDs as keys and sample IDs as values.

In [2]:
# Aliquot-level mapping file and its location relative to this notebook.
mapper_1 = "pdc_aliquot_2021-03-02_15_58.tsv.xlsx"
m_path = f"../../../input/{mapper_1}"

In [3]:
# Load the aliquot mapping workbook; the string 'NA' is treated as missing.
# `sep` is a delimiter option for text readers, not for read_excel, and newer
# pandas releases reject it as an unexpected keyword, so it is not passed here.
map_df = pd.read_excel(m_path, na_values='NA')

In [4]:
# 'samples.submitter_id#1' becomes sample_ID: it always has a value and represents
# the type of sample for any run in the aliquot (each row is either tumor or normal
# in pdc_aliquot_2021-03-02_15_58.tsv). The remaining submitter columns are
# shortened to '2'..'5'.
map_df = map_df.rename(
    columns={
        'submitter_id': 'aliquot_ID',
        'samples.submitter_id#1': 'sample_ID',
        'samples.submitter_id#2': '2',
        'samples.submitter_id#3': '3',
        'samples.submitter_id#4': '4',
        'samples.submitter_id#5': '5',
    }
)

# cols 1-5 are same id with "-##" which maps to tumor/normal
map_df_1 = map_df.loc[:, ['aliquot_ID', 'sample_ID', '2', '3', '4', '5']]
map_df_1

Unnamed: 0,aliquot_ID,sample_ID,2,3,4,5
0,CPT0053040004,9f905736-f662-41d6-b3ac-16758d,,,,
1,CPT0052940004,93e30fd5-e57e-4503-a175-863c7d,,,,
2,CPT0052170004,5a84eae1-197e-4463-ad65-59becc,,,,
3,CPT0051690004,2f2e5477-42a4-4906-a943-bf7f80,,,,
4,CPT0001580009,C3L-00001-02,,,,
...,...,...,...,...,...,...
3070,CPT032238 0003,C3N-05923-01,,,,
3071,CPT032239 0003,C3N-05923-09,,,,
3072,CPT032253 0003,C3N-05929-03,,,,
3073,CPT032254 0003,C3N-05929-05,,,,


In [5]:
# Optional: long-format table pairing each aliquot with every sample submitter ID.

t = map_df.loc[:, ['aliquot_ID', '2', '3', '4', '5']]

# only for protein, need to get files for phospho
tt = map_df.drop(columns=['type', 'project_id', 'analyte_type'])

# One row per (aliquot, original column) pair; the melted values land in 'sample_ID'.
a = pd.melt(tt, id_vars='aliquot_ID', value_name='sample_ID')

# Discard entries whose value is a single blank, then keep only the two ID columns.
b = a[a['sample_ID'].ne(' ')]
b = b.loc[:, ['aliquot_ID', 'sample_ID']]
b

Unnamed: 0,aliquot_ID,sample_ID
0,CPT0053040004,9f905736-f662-41d6-b3ac-16758d
1,CPT0052940004,93e30fd5-e57e-4503-a175-863c7d
2,CPT0052170004,5a84eae1-197e-4463-ad65-59becc
3,CPT0051690004,2f2e5477-42a4-4906-a943-bf7f80
4,CPT0001580009,C3L-00001-02
...,...,...
10498,CPT026563 0004,C3L-04090-04
11352,CPT011701 0004,C3N-01871-04
13296,CPT0192540004,C3L-02665-05
13429,CPT0190360004,C3L-03407-05


In [27]:
# aliquot_ID -> sample_ID lookup. Columns are selected by label instead of by
# position (row[1]), so the mapping does not depend on column order and avoids the
# deprecated integer-position access on a string-labelled Series.
# If an aliquot_ID repeats, the last row wins (same as the original loop).
aliquot_to_sample = dict(zip(map_df_1['aliquot_ID'], map_df_1['sample_ID']))

Get df using sample IDs as the index to merge with.

In [28]:
# Sample-level mapping file and its location relative to this notebook.
mapper_2 = 'pdc_sample_2021-03-05_16_43.tsv.txt'
m_path_2 = f"../../../input/{mapper_2}"

In [29]:
# Load the sample-level mapping (tab-separated; the string 'NA' is treated as missing).
map_df_2 = pd.read_csv(m_path_2, sep="\t", na_values='NA')  # type, project_id, and analyte_type have all same vals

# Make patient_ID vals (case_id + .N if normal sample).
# '<case>_Tumor' -> '<case>' and '<case>_Normal' -> '<case>.N'; any other tissue
# type keeps its '_<tissue_type>' suffix.
# regex=True is required because '$' is a regular-expression anchor and
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# leave every suffix in place.
map_df_2['patient_ID'] = map_df_2['cases.submitter_id'] + '_' + map_df_2['tissue_type']
map_df_2['patient_ID'] = map_df_2['patient_ID'].str.replace('_Tumor$', '', regex=True)
map_df_2['patient_ID'] = map_df_2['patient_ID'].str.replace('_Normal$', '.N', regex=True)

In [30]:
# Combine the aliquot table and the sample table on the sample ID.
# Both frames are re-indexed by their sample-ID column first. Because the names
# map_df_1 / map_df_2 are rebound here, this cell cannot be re-run on its own:
# a second set_index on the same column raises KeyError.
map_df_1, map_df_2 = (
    map_df_1.set_index('sample_ID'),
    map_df_2.set_index('submitter_id'),
)

# Outer join keeps sample IDs that appear in only one of the two tables.
all_df = map_df_1.join(map_df_2, how='outer')

In [32]:
# Keep only the aliquot ID (the identifier used in the proteomics data) and the
# newly created patient_ID.
all_df = all_df.loc[:, ['aliquot_ID', 'patient_ID']]
all_df

Unnamed: 0,aliquot_ID,patient_ID
2f2e5477-42a4-4906-a943-bf7f80,CPT0051690004,11LU035
5a84eae1-197e-4463-ad65-59becc,CPT0052170004,11LU022
93e30fd5-e57e-4503-a175-863c7d,CPT0052940004,11LU016
9f905736-f662-41d6-b3ac-16758d,CPT0053040004,11LU013
C3L-00001-02,CPT0001580009,C3L-00001
...,...,...
C3N-05923-01,CPT032238 0003,C3N-05923
C3N-05923-09,CPT032239 0003,C3N-05923.N
C3N-05929-03,CPT032253 0003,C3N-05929
C3N-05929-05,CPT032254 0003,C3N-05929.N


In [33]:
# Dictionary with aliquot_ID keys and patient_ID values.
# Columns are read by label rather than by position (row[-1]), so the result does
# not depend on column order. Rows without an aliquot_ID (possible after the outer
# join) are skipped so that NaN never becomes a dictionary key.
# If an aliquot_ID repeats, the last row wins (same as the original loop).
matched_ids = {
    aliquot: patient
    for aliquot, patient in zip(all_df['aliquot_ID'], all_df['patient_ID'])
    if pd.notna(aliquot)
}

Add GBM normal samples to matched_ids dictionary.

In [34]:
# GBM normal samples: mapping workbook and its location relative to this notebook.
gbm_file = 'GBM_normal_sample_mapping.xlsx'
g_path = f"../../../input/{gbm_file}"

In [35]:
# Load the GBM normal-sample workbook; the string 'NA' is treated as missing.
# `sep` is a delimiter option for text readers, not for read_excel, and newer
# pandas releases reject it as an unexpected keyword, so it is not passed here.
gbm_df = pd.read_excel(g_path, na_values='NA')

In [36]:
# Add the GBM entries: each BSIID maps to the value in the sheet's first column.
# .iloc makes the positional lookup explicit; plain row[0] on a string-labelled
# Series is deprecated integer-position access.
matched_ids.update(zip(gbm_df['BSIID'], gbm_df.iloc[:, 0]))

# Example of using the matched_ids dictionary.

In [39]:
class SliceableDict(dict):
    """A dict that can also be indexed with a list of keys.

    * ``d[[k1, k2, ...]]`` returns a plain ``dict`` containing only those listed
      keys that are present in ``d``; keys that are absent are silently omitted.
    * ``d[key]`` with any non-list key behaves like ``dict.get``: it returns the
      stored value, or ``default`` when the key is missing (no ``KeyError``).
    """

    # Value returned for a missing single key. Override on a subclass or an
    # instance to change the fallback; the default of None matches the previous
    # hard-coded behavior.
    default = None

    def __getitem__(self, key):
        if isinstance(key, list):
            # omits key if it does not exist
            return {k: dict.__getitem__(self, k) for k in key if k in self}
        return dict.get(self, key, self.default)

In [40]:
# LSCC gene-level abundance report and its location relative to this notebook.
file_name = "LSCC_Report_abundance_groupby=gene_protNorm=MD_gu=2.tsv"
file_path = f"../../../proteomics/{file_name}"

In [42]:
# Load the abundance report (tab-separated).
df = pd.read_csv(file_path, sep="\t")
df = df.drop(columns=['MaxPepProb', 'NumberPSM'])  # index is protein identifier (duplicate)

# Get gene name from its position (7th '|'-separated field) in the list of gene identifiers
df['Proteins'] = df['Proteins'].apply(lambda x: x.split('|')[6])
df = df.rename(columns={'Proteins': 'Name', 'Index': 'Database_ID'})
df = df.set_index(['Name', 'Database_ID'])  # set multiindex
df = df.transpose()  # rows: the report's remaining columns; columns: (Name, Database_ID)

# Subtract reference intensities from all the values, to get ratios
ref_intensities = df.loc["ReferenceIntensity"]
df = df.subtract(ref_intensities, axis="columns")
# Drop the ReferenceIntensity row by label, so the result does not depend on that
# row being the first one.
df = df.drop(index="ReferenceIntensity")
df.index.name = 'Patient_ID'

# Match ids to get Patient_ID
# Create cancer specific dict: only the aliquot IDs present in this table.
indices = list(df.index)
sliced = SliceableDict(matched_ids)  # initiate class SliceableDict
rn_list = sliced[indices]  # get smaller dict of aliquot_IDs of specific cancer (to save time)

# Relabel only the index. The earlier reset_index / replace / set_index round trip
# scanned every cell of the frame just to rename these labels.
df = df.rename(index=rn_list)

# Sort values: tumor samples first, then normal samples, each ordered by Patient_ID.
# Normal samples are the ones whose ID ends with the literal suffix '.N'. The old
# pattern '.N$' used an unescaped regex dot, which matches any character before the N.
# na=False sends rows with a missing ID to the tumor group instead of breaking the mask.
is_normal = df.index.str.endswith('.N', na=False)
normal = df.loc[is_normal].sort_index()
tumor = df.loc[~is_normal].sort_index()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
all_df = pd.concat([tumor, normal])


In [43]:
# Display the table assembled in the previous cell.
all_df

Name,TSPAN6,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,NIPAL3,...,C2orf81,SMIM39,AC073111.4,EEF1AKMT4,CCDC39,AL022312.1,H2BE1,AL034430.2,ASDURF,DERPC
Database_ID,ENSG00000000003.15,ENSG00000000419.12,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,ENSG00000001461.17,...,ENSG00000284308.1,ENSG00000284479.1,ENSG00000284691.1,ENSG00000284753.2,ENSG00000284862.3,ENSG00000285025.1,ENSG00000285480.1,ENSG00000285723.1,ENSG00000286053.1,ENSG00000286140.1
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C3L-00081,0.079518,-0.001039,-0.402590,0.132341,-1.085208,-1.089783,-1.067586,1.291952,-0.051636,-0.561498,...,,,-0.102676,0.373738,-0.605665,,,0.301907,0.185180,0.448790
C3L-00415,1.807494,-0.235698,-0.239740,0.034215,1.075563,-1.084766,0.606739,0.406536,-0.839830,-0.126418,...,,,,1.019424,1.281988,,,0.210129,0.299780,0.045820
C3L-00445,0.399779,0.153815,0.118345,0.742296,-0.938783,-0.316595,-0.120830,-0.401105,0.074109,-0.353499,...,,,-0.230684,0.762956,,,0.622607,0.703846,0.138316,0.038762
C3L-00568,0.288530,0.282189,0.033540,0.180126,0.014461,-0.418013,-0.212112,1.012072,-0.009210,,...,,,-0.191840,0.443221,,-0.006237,,0.302608,-0.113514,-0.193325
C3L-00603,0.092916,0.198139,-0.104727,0.073420,-1.029234,-0.767940,-0.290321,-0.777878,0.614973,-0.252299,...,,,-0.792325,-0.242456,-0.859595,0.471690,0.790296,-0.310493,0.452679,0.016756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03886.N,-0.591170,0.028150,-0.096358,,0.197189,0.971960,0.078514,-0.654055,0.153947,-0.010380,...,,,0.293390,-0.644190,0.356784,,,-0.305767,,0.019605
C3N-04124.N,-0.139427,0.119659,-0.088212,,0.143292,-0.107922,0.101382,-0.528720,-0.327322,0.209013,...,,,,-1.733177,0.519701,-0.495794,-0.352495,-0.213174,-0.441643,0.216280
C3N-04127.N,-0.310264,-0.139648,-0.226243,,0.217656,0.516384,0.092889,-1.030549,0.018584,-0.400037,...,,0.536276,0.060130,,0.673507,,,-0.849927,-0.179950,0.183629
C3N-04155.N,-0.497260,-0.224988,-0.129715,-0.430732,0.493728,0.825081,0.442165,-0.347043,-0.229325,,...,,,-0.039072,-0.373005,,0.051713,,-0.930856,-0.102371,0.333944
