In [1]:
import cptac, os
import re, datetime
import pandas as pd 
import numpy as np 
from cptac import utils as ut

In [2]:
def get_data_path(folders, fname):
    """Build the normalised path of an input file below the data root.

    Parameters
    ----------
    folders : iterable of str
        Sub-folder names, joined with '/' in the given order.
    fname : str
        File name appended after the folders.

    Returns
    -------
    str
        ``os.path.normpath`` of ``<DATA_PATH>/<folders...>/<fname>``.

    Raises
    ------
    KeyError
        If the ``DATA_PATH`` environment variable is not set.
    """
    # A named function (rather than a lambda bound to a name) shows up with a
    # meaningful name in tracebacks and can carry this docstring.
    relative_folders = '/'.join(folders)
    return os.path.normpath(os.environ['DATA_PATH'] + '/' + relative_folders + '/' + fname)
def get_tables_path(fname):
    """Return the normalised path of ``fname`` inside '../local_data/processed_data/'.

    Parameters
    ----------
    fname : str
        File name (or relative sub-path) of the processed table.

    Returns
    -------
    str
        ``os.path.normpath`` of '../local_data/processed_data/' + ``fname``.
    """
    # Defined with ``def`` instead of a name-bound lambda so it has a real
    # name in tracebacks and room for documentation.
    return os.path.normpath('../local_data/processed_data/' + fname)

In [3]:
# ---- Input files -----------------------------------------------------------
# Eight cohort tables share the 'proteomics_data' folder below the data root;
# only the file name differs between them.
_proteomics_folder = ['proteomics_data']

file_brca_2020_proteomics = get_data_path(_proteomics_folder, 'CPTAC_BRCA_2018_Proteome_gene_Median.cct')         # BrCa 2020
file_ovca_2020_proteomics = get_data_path(_proteomics_folder, 'CPTAC_OV_proteome_gene_tumor.cct')                 # OvCa 2020
file_ccrcc_proteomics = get_data_path(_proteomics_folder, 'CPTAC_CCRCC_proteome_Tumor.cct')                       # ccRCC
file_luad_proteomics = get_data_path(_proteomics_folder, 'CPTAC_LUAD_proteome_ratio_NArm_TUMOR.cct')              # LUAD
file_lscc_proteomics = get_data_path(_proteomics_folder, 'CPTAC_LSCC_2020_proteome_ratio_NArm_TUMOR.cct')         # LSCC
file_endo_proteomics = get_data_path(_proteomics_folder, 'CPTAC_UCEC_Proteomics_TMT_gene_level_Tumor.cct')        # Endometrium
file_gbm_proteomics = get_data_path(_proteomics_folder, 'CPTAC_GBM_proteome_mssm_per_gene.cct')                   # GBM
file_hnscc_proteomics = get_data_path(_proteomics_folder, 'CPTAC_HNSCC_Proteomics_TMT_Gene_level_Tumor.cct')      # HNSCC

# Study-specific tables stored under 'tumour_studies/<study>/...'.
file_pdac_proteomics = get_data_path(
    ['tumour_studies', 'pdac'],
    'proteomics_gene_level_MD_abundance_tumor.cct.txt',
)  # PDAC
file_brca_2016_proteomics = get_data_path(
    ['tumour_studies', 'brca', 'mertins_2016'],
    'CPTAC_BC_SupplementaryTable03.xlsx',
)  # Breast cancer 2016 (folder 'mertins_2016')
file_ovca_2016_proteomics = get_data_path(
    ['tumour_studies', 'ovca', 'zhang_2016'],
    'Table_S2.xlsx',
)  # Ovarian cancer 2016 (folder 'zhang_2016')

# Reference tables: the gene list used to filter rows and the sample
# annotation used to filter columns.
file_protein_coding_genes = get_data_path(['hgnc_22Feb2022'], 'approved_protein_coding_genes.txt')
file_cptac_samples_info = get_tables_path('CPTAC_sample_info.parquet')

#### Proteomics

In [4]:
# BrCa 2020: tab-separated table, first column used as the row index.
# Rows that contain no measurement in any sample are discarded.
brca_2020_proteomics = (
    pd.read_csv(file_brca_2020_proteomics, sep='\t', index_col=0)
    .dropna(how='all')
)
print(brca_2020_proteomics.shape)
brca_2020_proteomics.head(2)

(12594, 122)


Unnamed: 0_level_0,CPT000814,CPT001846,X01BR001,X01BR008,X01BR009,X01BR010,X01BR015,X01BR017,X01BR018,X01BR020,...,X20BR002,X20BR005,X20BR006,X20BR007,X20BR008,X21BR001,X21BR002,X21BR010,X22BR005,X22BR006
IDX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.6712,1.3964,2.0219,-0.529,1.2556,-0.3843,1.0394,1.1533,1.9579,-0.1637,...,1.8732,-0.4227,1.5862,-0.297,1.6767,-0.661,-1.3735,1.1583,0.4948,0.5049
A2M,-0.2075,1.3302,1.6269,0.3267,3.4489,-1.0239,-0.1915,2.5655,2.4185,-0.581,...,1.5261,-1.911,1.6519,1.3457,1.7907,-0.6402,0.4227,0.3329,-1.0986,-0.6582


In [5]:
# OvCa 2020: tab-separated table, first column used as the row index.
# Rows that contain no measurement in any sample are discarded.
ovca_2020_proteomics = (
    pd.read_csv(file_ovca_2020_proteomics, sep='\t', index_col=0)
    .dropna(how='all')
)
print(ovca_2020_proteomics.shape)
ovca_2020_proteomics.head(2)

(10078, 83)


Unnamed: 0_level_0,01OV007,01OV017,01OV018,01OV023,01OV026,01OV029,01OV030,01OV039,01OV041,01OV047,...,17OV036,17OV039,17OV040,18OV001,20OV005,26OV002,26OV008,26OV009,26OV011,26OV013
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,0.76512,-0.461906,0.065842,1.512082,0.343872,-0.312811,-0.338799,-0.492472,-0.523344,0.063341,...,1.221625,0.570312,-0.215113,-0.017322,-1.620263,2.028395,-1.723434,0.594745,-0.861639,-1.266903
ACADM,-0.228234,-0.16933,-0.05173,-0.294907,-0.306999,0.109829,-0.684895,-0.322297,-0.540556,-0.418267,...,-0.615414,-0.661785,0.229375,-0.565152,0.373479,0.334208,0.212168,-0.213024,-0.096838,0.301722


In [6]:
# ccRCC: tab-separated table, first column used as the row index.
# Rows that contain no measurement in any sample are discarded.
ccrcc_proteomics = (
    pd.read_csv(file_ccrcc_proteomics, sep='\t', index_col=0)
    .dropna(how='all')
)
print(ccrcc_proteomics.shape)
ccrcc_proteomics.head(2)

(11710, 110)


Unnamed: 0_level_0,C3L-00004,C3L-00010,C3L-00011,C3L-00026,C3L-00079,C3L-00088,C3L-00096,C3L-00097,C3L-00103,C3L-00183,...,C3N-01220,C3N-01261,C3N-01361,C3N-01522,C3N-01524,C3N-01646,C3N-01648,C3N-01649,C3N-01651,C3N-01808
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.304302,1.195915,-0.286155,0.13573,-0.123959,0.427542,-0.242107,0.506469,0.720836,0.082946,...,0.791576,0.31854,0.093607,-0.504522,0.788178,-0.173487,-0.350081,0.246378,-0.242872,0.171883
A1CF,0.641447,0.19462,-0.780455,0.404286,-0.677773,0.310249,-0.128732,-0.513243,-1.135859,-0.128068,...,-0.892166,-0.251923,-0.535844,0.087143,-0.12676,-0.686012,-0.699248,-0.847288,0.48695,0.364511


In [7]:
# LUAD: load the table, discard rows without any measurement, then replace
# every '.' in the column labels with '-' (presumably so the sample IDs take
# the same 'C3N-xxxxx' form as the other cohorts — TODO confirm against the file).
luad_proteomics = (
    pd.read_csv(file_luad_proteomics, sep='\t', index_col=0)
    .dropna(how='all')
    .rename(columns=lambda sample_id: sample_id.replace('.', '-'))
)
print(luad_proteomics.shape)
luad_proteomics.head(2)

(10316, 110)


Unnamed: 0_level_0,C3N-01799,C3L-01890,C3N-00572,C3N-02423,C3N-02729,C3L-00263,C3N-01410,C3N-00578,C3N-02587,C3L-00893,...,C3N-00579,C3N-02582,C3L-02350,C3N-00738,C3N-00959,C3N-02003,C3N-00175,C3N-01823,C3L-02549,C3L-02365
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-2.4302,1.1148,-0.6473,-1.8646,-0.7255,-1.1384,-0.4466,-1.2608,-1.13,-1.7255,...,-1.7548,-3.3718,-0.6153,1.1367,-1.2789,-1.6768,1.2371,-0.1132,-1.1933,-2.073
A2M,-3.2133,2.0689,-1.7723,-2.7064,-3.2235,-2.0141,-2.0563,-2.3294,-1.7842,-0.2812,...,-2.1857,-4.1354,-1.0176,0.0345,-1.1394,-3.1343,-0.4172,-2.0648,-2.4646,-2.4108


In [8]:
# LSCC: load the table, discard rows without any measurement, then replace
# every '.' in the column labels with '-' (presumably so the sample IDs take
# the same 'C3N-xxxxx' form as the other cohorts — TODO confirm against the file).
lscc_proteomics = (
    pd.read_csv(file_lscc_proteomics, sep='\t', index_col=0)
    .dropna(how='all')
    .rename(columns=lambda sample_id: sample_id.replace('.', '-'))
)
print(lscc_proteomics.shape)
lscc_proteomics.head(2)

(11111, 108)


Unnamed: 0_level_0,C3L-02665,C3L-01663,C3N-02575,C3L-02546,C3L-00965,C3L-02963,C3N-04162,C3L-02646,C3N-02285,C3N-03875,...,C3N-03425,C3L-02163,C3N-04124,C3L-00904,C3N-00211,C3N-01892,C3L-00603,C3L-03965,C3N-03851,C3L-02349
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-1.8124,-0.5483,-1.8476,-0.3368,-0.0216,-1.5651,-2.7905,-1.0626,-2.2041,-3.4221,...,-1.5538,-1.4692,-3.9762,-1.8146,-1.6747,-3.2918,-1.7088,-0.3769,-1.6203,-1.4065
A2M,-3.0441,-2.6252,-4.1494,-2.842,-0.7407,-2.3016,-3.7248,-2.5141,-1.3281,-4.5664,...,-2.262,-3.088,-6.3093,-2.2917,-4.8246,-4.5314,-1.6799,-2.8511,-2.9245,-2.6932


In [9]:
# Endometrium (UCEC): tab-separated table, first column used as the row index.
# Rows that contain no measurement in any sample are discarded.
endo_proteomics = (
    pd.read_csv(file_endo_proteomics, sep='\t', index_col=0)
    .dropna(how='all')
)
print(endo_proteomics.shape)
endo_proteomics.head(2)

(10993, 95)


Unnamed: 0_level_0,C3L-00006,C3L-00008,C3L-00032,C3L-00090,C3L-00098,C3L-00136,C3L-00137,C3L-00139,C3L-00143,C3L-00145,...,C3N-01219,C3N-01267,C3N-01346,C3N-01349,C3N-01510,C3N-01520,C3N-01521,C3N-01537,C3N-01802,C3N-01825
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-1.18,-0.685,-0.528,-1.67,-0.374,-1.08,-1.32,-0.467,-1.12,-0.716,...,-0.295,-1.3,-0.67,0.687,-0.269,-1.07,-1.28,-0.29,0.266,0.692
A2M,-0.863,-1.07,-1.32,-1.19,-0.0206,-0.708,-0.708,0.37,-1.31,-0.885,...,-0.0589,-1.29,-1.11,1.44,0.944,-0.712,-0.736,-0.32,1.39,0.589


In [10]:
# GBM: tab-separated table, first column used as the row index.
# Rows that contain no measurement in any sample are discarded.
gbm_proteomics = (
    pd.read_csv(file_gbm_proteomics, sep='\t', index_col=0)
    .dropna(how='all')
)
print(gbm_proteomics.shape)
gbm_proteomics.head(2)

(11141, 99)


Unnamed: 0_level_0,C3L-00104,C3L-00365,C3L-00674,C3L-00677,C3L-01040,C3L-01043,C3L-01045,C3L-01046,C3L-01048,C3L-01049,...,C3N-02788,C3N-03070,C3N-03088,C3N-03180,C3N-03182,C3N-03183,C3N-03184,C3N-03186,C3N-03188,C3N-03473
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.07763,-0.145975,0.821991,-0.064567,-0.763691,1.094879,-0.027903,-0.375754,-0.394736,-0.025968,...,-0.327487,1.942106,0.27851,1.04588,-0.424647,0.230843,-0.635316,0.61664,-0.059547,-0.899255
A2M,0.487228,0.798796,1.09647,0.129385,-1.031834,0.769231,-0.735991,-0.037553,-0.485108,-0.310086,...,-0.340301,1.657565,0.8366,1.151704,-0.733923,0.426624,-0.478657,0.767029,-0.526563,-0.333312


In [11]:
# HNSCC: tab-separated table, first column used as the row index.
# Rows that contain no measurement in any sample are discarded.
hnscc_proteomics = (
    pd.read_csv(file_hnscc_proteomics, sep='\t', index_col=0)
    .dropna(how='all')
)
print(hnscc_proteomics.shape)
hnscc_proteomics.head(2)

(9666, 109)


Unnamed: 0_level_0,C3L-00997,C3N-03849,C3N-03487,C3N-01858,C3L-04791,C3N-04273,C3N-03664,C3N-03781,C3L-00999,C3N-01859,...,C3N-00828,C3N-00297,C3N-00295,C3N-00829,C3N-00306,C3N-00846,C3L-02617,C3N-00498,C3N-00299,C3N-00307
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,27.735214,27.388831,27.792176,27.553753,27.495686,27.351122,28.044127,27.762844,27.949122,27.088057,...,27.730027,27.584042,27.956513,27.748227,27.65942,27.72451,27.452281,27.550777,27.943674,27.59844
A2M,28.724642,28.33688,28.697741,29.045309,28.431308,28.415198,29.13276,29.125029,29.352481,27.945027,...,29.352794,28.292153,29.66099,28.847142,28.891933,28.872764,28.290845,28.306769,29.419415,29.642949


In [12]:
# PDAC: tab-separated table, first column used as the row index.
# Apply the same filter as every other cohort load in this notebook: discard
# rows with no measurement in any sample (how='all' is equivalent to the
# thresh=1 used elsewhere), and report the resulting shape.
pdac_proteomics = (
    pd.read_csv(file_pdac_proteomics, sep='\t', index_col=0)
    .dropna(how='all')
)
print(pdac_proteomics.shape)
pdac_proteomics.head(2)

Unnamed: 0,C3L-03394,C3N-03428,C3L-02112,C3N-01719,C3N-03670,C3N-02998,C3N-03211,C3L-01052,C3N-00512,C3N-02295,...,C3L-01031,C3N-01167,C3L-01971,C3L-02604,C3L-00277,C3L-01328,C3L-03639,C3N-04282,C3L-04072,C3L-02613
A1BG,28.674211,28.329656,28.398472,28.69733,28.586148,29.80304,28.384676,28.866906,28.524196,29.083227,...,28.269028,28.578423,28.12797,28.474689,30.154566,28.368512,28.840795,28.845881,27.416247,27.93686
A1CF,24.020346,22.16588,24.021796,23.194085,23.732852,23.3031,23.151656,24.221218,23.011826,22.915697,...,23.077334,24.207681,23.878779,22.441706,23.340567,22.702197,22.604344,22.797497,22.142592,21.469524


#### Pancreatic Ductal Adenocarcinoma

#### TCGA Studies

In [13]:
# Read the raw "FullOvarianProteomeData" sheet of the OvCa 2016 workbook;
# the table is cleaned in the following cell.
ovca_2016_proteomics = pd.read_excel(
    file_ovca_2016_proteomics,
    sheet_name="FullOvarianProteomeData",
    engine='openpyxl',
)



In [14]:
# Index the table by gene symbol and discard the RefSeq peptide column.
ovca_2016_proteomics = ovca_2016_proteomics.set_index('hgnc_symbol').drop(columns='refseq_peptide')

# Keep only the part of each column label after its first '-'
# (labels without a '-' are left unchanged).
ovca_2016_proteomics.columns = [label[label.find('-') + 1:] for label in ovca_2016_proteomics.columns]

# Drop rows whose index label is a datetime rather than a string — presumably
# gene symbols that the spreadsheet converted to dates (TODO confirm).
# isinstance() also matches datetime subclasses such as pandas.Timestamp,
# which an exact type comparison would miss. Doing this before the
# aggregation keeps mixed-type labels out of the groupby keys.
keep_row = [not isinstance(label, datetime.datetime) for label in ovca_2016_proteomics.index]
ovca_2016_proteomics = ovca_2016_proteomics.loc[keep_row]

# Mean over replicate columns (columns sharing a label after the renaming above).
# DataFrame.groupby(axis=1) is deprecated, so group the transposed frame instead.
ovca_2016_proteomics = ovca_2016_proteomics.T.groupby(level=0).mean().T

# Mean over isoforms (rows sharing a gene symbol).
ovca_2016_proteomics = ovca_2016_proteomics.groupby(level=0).mean()
ovca_2016_proteomics.head(2)

Unnamed: 0_level_0,TCGA-09-1664,TCGA-09-2056,TCGA-13-1404,TCGA-13-1409,TCGA-13-1410,TCGA-13-1482,TCGA-13-1483,TCGA-13-1484,TCGA-13-1485,TCGA-13-1487,...,TCGA-61-1919,TCGA-61-1995,TCGA-61-2008,TCGA-61-2087,TCGA-61-2088,TCGA-61-2094,TCGA-61-2095,TCGA-61-2096,TCGA-61-2612,TCGA-61-2613
hgnc_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.31685,-0.6323,-1.0193,-0.6973,-0.429,-0.427,-0.402,-1.06,-0.2143,-0.6443,...,-0.00193,-0.12235,0.63485,-1.1153,-0.8383,-0.0673,0.635,1.91,0.0689,0.9097
A2M,-0.138175,0.00675,-0.86525,-0.98125,-0.00425,-0.46865,-1.12625,-0.688,-0.58425,-1.38325,...,-0.0036,-0.706125,-0.549775,-1.02225,-1.17225,-0.13225,0.91,0.353,0.35,1.16575


In [15]:
# Read the raw "Global-Proteome-G3" sheet of the BrCa 2016 supplementary workbook.
brca_2016_proteomics = pd.read_excel(
    file_brca_2016_proteomics,
    sheet_name="Global-Proteome-G3",
    engine='openpyxl',
)
brca_2016_proteomics.head(2)

Unnamed: 0,accession_number,numITRAQExperimentsProteinObserved,numSpectraProteinObserved,protein_mw,species,accession_numbers,numPepsUnique,scoreUnique,total percent Coverage across all experiments,subgroupNum,...,AO-A12B.34TCGA,A2-A0SW.35TCGA,AO-A0JL.35TCGA,BH-A0BV.35TCGA,A2-A0YM.36TCGA,BH-A0C7.36TCGA,A2-A0SX.36TCGA,263d3f-I.CPTAC,blcdb9-I.CPTAC,c4155b-C.CPTAC
0,NP_958782,37,43425,533778.0,Human,NP_958782,678,11158.42,89.1,1.1,...,-0.963904,-0.487772,-0.10668,-0.065838,0.65585,-0.552212,-0.39856,0.598585,-0.191285,0.566975
1,NP_958785,37,42994,518637.5,Human,NP_958785,670,11042.11,90.0,1.2,...,-0.93821,-0.487772,-0.10668,-0.055893,0.658143,-0.547749,-0.392601,0.606697,-0.183918,0.578702


In [16]:
# Build a grouping key from the protein name: the text before the first
# occurrence of 'isoform', so that records of the same protein share a key.
brca_2016_proteomics['modifiedName'] = brca_2016_proteomics['Name'].str.split('isoform').str[0]

# Within each key, fill missing gene names from neighbouring records
# (forward fill first, then backward fill).
brca_2016_proteomics['geneName'] = (
    brca_2016_proteomics.groupby(['modifiedName'], group_keys=False)['geneName']
                        .apply(lambda names: names.ffill().bfill())
)

# Records that still have no gene name after filling are dropped.
brca_2016_proteomics = brca_2016_proteomics.dropna(subset=['geneName'])

# Collapse to one row per gene by averaging the numeric columns.
brca_2016_proteomics = brca_2016_proteomics.groupby(['geneName']).mean(numeric_only=True)

# Rewrite the sample labels: '.' -> '-', keep only labels containing 'TCGA',
# then rebuild each as 'TCGA-' + the first 7 characters of the label.
brca_2016_proteomics.columns = [label.replace('.', '-') for label in brca_2016_proteomics.columns]
brca_2016_proteomics = brca_2016_proteomics.loc[:, brca_2016_proteomics.columns.str.contains('TCGA')]
brca_2016_proteomics = brca_2016_proteomics.rename(columns=lambda label: 'TCGA-' + str(label)[:7])

# Average columns that now share a label. DataFrame.groupby(axis=1) is
# deprecated, so the aggregation is done on the transposed frame.
brca_2016_proteomics = brca_2016_proteomics.T.groupby(level=0).mean(numeric_only=True).T
brca_2016_proteomics.head(2)

Unnamed: 0_level_0,TCGA-A2-A0CM,TCGA-A2-A0D2,TCGA-A2-A0EQ,TCGA-A2-A0EV,TCGA-A2-A0EX,TCGA-A2-A0EY,TCGA-A2-A0SW,TCGA-A2-A0SX,TCGA-A2-A0T3,TCGA-A2-A0T6,...,TCGA-C8-A12Z,TCGA-C8-A130,TCGA-C8-A131,TCGA-C8-A134,TCGA-C8-A135,TCGA-C8-A138,TCGA-D8-A142,TCGA-E2-A154,TCGA-E2-A158,TCGA-E2-A15A
geneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.899882,-0.1054,-0.380704,1.699253,-0.390673,1.849433,1.560915,2.369127,-0.00341,2.519514,...,1.775668,0.850687,-0.294419,0.320314,2.185114,-2.018457,1.539879,1.856193,-1.987341,-1.277926
A2BP1,0.911162,0.882545,-0.874399,-2.084358,1.796662,-0.578121,1.50624,-0.520707,0.120195,-1.671573,...,-1.859222,1.00812,0.737591,1.945039,-2.473849,0.329426,0.957906,0.520191,0.291579,-2.101058


In [17]:
# Every cohort table must have unique column labels before the tables are
# joined column-wise. Each entry pairs a table with its failure message;
# the checks run in the listed order.
_column_uniqueness_checks = [
    (ccrcc_proteomics, "ccRCC columns contain duplicates"),
    # (coad_proteomics, "Colon columns contain duplicates"),
    (endo_proteomics, "UCEC columns contain duplicates"),
    (luad_proteomics, "LUAD columns contain duplicates"),
    (brca_2020_proteomics, "BrCa columns contain duplicates"),
    (gbm_proteomics, "GBM columns contain duplicates"),
    (hnscc_proteomics, "HNSCC columns contain duplicates"),
    (lscc_proteomics, "LSCC columns contain duplicates"),
    (ovca_2020_proteomics, "OvCa 2020 columns contain duplicates"),
    (ovca_2016_proteomics, "OvCa 2016 columns contain duplicates"),
    (brca_2016_proteomics, "BrCa 2016 columns contain duplicates"),
    (pdac_proteomics, "Pdac columns contain duplicates"),
]
for _cohort_table, _failure_message in _column_uniqueness_checks:
    assert not _cohort_table.columns.duplicated().any(), _failure_message

In [18]:
# Join all cohort tables side by side; rows are aligned on the index labels,
# and labels missing from a cohort yield NaN in that cohort's columns.
cohort_tables = [
    ccrcc_proteomics,
    endo_proteomics,
    luad_proteomics,
    brca_2020_proteomics,
    gbm_proteomics,
    hnscc_proteomics,
    ovca_2020_proteomics,
    lscc_proteomics,
    pdac_proteomics,
    brca_2016_proteomics,
    ovca_2016_proteomics,
]
combined_proteomics = pd.concat(cohort_tables, axis=1)

print("Dimensions: ", combined_proteomics.shape)
combined_proteomics.head(2)

Dimensions:  (16854, 1227)


Unnamed: 0,C3L-00004,C3L-00010,C3L-00011,C3L-00026,C3L-00079,C3L-00088,C3L-00096,C3L-00097,C3L-00103,C3L-00183,...,TCGA-61-1919,TCGA-61-1995,TCGA-61-2008,TCGA-61-2087,TCGA-61-2088,TCGA-61-2094,TCGA-61-2095,TCGA-61-2096,TCGA-61-2612,TCGA-61-2613
A1BG,-0.304302,1.195915,-0.286155,0.13573,-0.123959,0.427542,-0.242107,0.506469,0.720836,0.082946,...,-0.00193,-0.12235,0.63485,-1.1153,-0.8383,-0.0673,0.635,1.91,0.0689,0.9097
A1CF,0.641447,0.19462,-0.780455,0.404286,-0.677773,0.310249,-0.128732,-0.513243,-1.135859,-0.128068,...,,,,,,,,,,


In [19]:
# Remove rows whose index label is missing. Filtering with notna() (instead of
# dropping the NaN label by name) does not raise when no such label exists, so
# the cell can be re-run safely.
combined_proteomics = combined_proteomics.loc[combined_proteomics.index.notna()]

In [20]:
# Show row labels that occur more than once (an empty Index means none do).
repeated_row_mask = combined_proteomics.index.duplicated()
combined_proteomics.index[repeated_row_mask]

Index([], dtype='object')

In [21]:
# Show column labels that occur more than once (an empty Index means none do).
repeated_column_mask = combined_proteomics.columns.duplicated()
combined_proteomics.columns[repeated_column_mask]

Index([], dtype='object')

#### Filtering the proteins and samples

In [22]:
# Gene symbols used to restrict the data to protein-coding genes: entries whose
# status is 'Approved' and whose locus type is 'Gene with protein product'.
hgnc_table = pd.read_csv(file_protein_coding_genes, sep='\t')
is_approved = hgnc_table['status'] == 'Approved'
has_protein_product = hgnc_table['Locus type'] == 'Gene with protein product'
protein_coding_genes = hgnc_table.loc[is_approved & has_protein_product, 'Symbol']
protein_coding_genes.head(3)

0    A1BG
1    A1CF
2     A2M
Name: Symbol, dtype: object

In [23]:
# Sample annotation table, indexed by sample ID.
cptac_samples = pd.read_parquet(file_cptac_samples_info)
cptac_samples.head(2)

Unnamed: 0,Gender,Age,Study
C3L-00004,male,72.0,ccRCC
C3L-00010,male,30.0,ccRCC


In [24]:
# Rows to keep: labels that are also in the protein-coding gene list.
# Columns to keep: samples that are also in the sample annotation table.
# np.intersect1d returns sorted unique values, so the result is ordered by label.
common_protein_coding_genes = np.intersect1d(combined_proteomics.index, protein_coding_genes.tolist())
common_samples = np.intersect1d(combined_proteomics.columns, cptac_samples.index)
n_samples, n_proteins = len(common_samples), len(common_protein_coding_genes)
print("Common samples = {0} and common proteins = {1}".format(n_samples, n_proteins))

combined_proteomics = combined_proteomics.reindex(index=common_protein_coding_genes, columns=common_samples)
print("Dimensions of combined CPTAC proteomics database: ", combined_proteomics.shape)
combined_proteomics.head(5)

Common samples = 1227 and common proteins = 14792
Dimensions of combined CPTAC proteomics database:  (14792, 1227)


Unnamed: 0,01OV007,01OV017,01OV018,01OV023,01OV026,01OV029,01OV030,01OV039,01OV041,01OV047,...,X20BR002,X20BR005,X20BR006,X20BR007,X20BR008,X21BR001,X21BR002,X21BR010,X22BR005,X22BR006
A1BG,0.133634,-0.432786,-0.013943,1.234203,-0.482872,-0.956518,-0.102617,-1.318564,-0.343398,0.336833,...,1.8732,-0.4227,1.5862,-0.297,1.6767,-0.661,-1.3735,1.1583,0.4948,0.5049
A1CF,,,,,,,,,,,...,,,,,,,,,,
A2M,0.76512,-0.461906,0.065842,1.512082,0.343872,-0.312811,-0.338799,-0.492472,-0.523344,0.063341,...,1.5261,-1.911,1.6519,1.3457,1.7907,-0.6402,0.4227,0.3329,-1.0986,-0.6582
A2ML1,-0.778849,-0.533497,,-0.70305,-1.150936,-0.556404,-0.052664,,-1.139941,0.623607,...,4.7528,4.0898,2.4992,3.0502,-2.9086,-4.8578,-4.9553,-5.7358,-8.8314,-7.4699
A4GALT,,,,,,,,,,,...,,,,,,,,,,


In [25]:
# Persist the filtered, combined matrix to the processed-data folder.
combined_output_file = get_tables_path('CPTAC_proteomics_linkedomics.parquet')
combined_proteomics.to_parquet(combined_output_file)