In [1]:
import os
import cptac
import numpy as np
import pandas as pd 

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_data_path(folders, fname):
    """Return the normalised path of `fname` below $DATA_PATH/<folders...>.

    `folders` is an iterable of sub-directory names. A KeyError is raised when
    the DATA_PATH environment variable is not set.
    """
    raw_path = '/'.join((os.environ['DATA_PATH'], '/'.join(folders), fname))
    return os.path.normpath(raw_path)


def get_tables_path(fname):
    """Return the normalised path of `fname` inside ../local_data/processed_data."""
    return os.path.normpath('../local_data/processed_data/' + fname)


# Clinical tables of the two 2016 studies (only the paths are built here)
file_brca_2016_clinical = get_data_path(
    ['tumour_studies', 'brca', 'mertins_2016'],
    'combined_study_clinical_data.tsv',
)
file_ovca_2016_clinical = get_data_path(
    ['tumour_studies', 'ovca', 'zhang_2016'],
    'Table_S1.xlsx',
)

In [3]:
# Download a pinned version of every CPTAC dataset used in this notebook.
cptac.download('Ccrcc', version='0.1.1')
cptac.download('Endometrial', version='2.1.1')
cptac.download('Luad', version='3.1.1')
cptac.download('Brca', version='5.4')
cptac.download('Gbm', version='3.0')
cptac.download('Hnscc', version='2.0')
cptac.download('Lscc', version='3.3')
# NOTE(review): this notebook treats the 'Ovarian' dataset as the 2020 confirmatory study of
# HGSC ovarian cancer, but the publication link cptac lists for it points to the 2016 study.
cptac.download('Ovarian', version='0.0.1')
# Last expression of the cell: its return value is what the cell displays.
cptac.download('Pdac', version='1.0')

                                                

True

In [4]:
# Show the datasets cptac knows about, with their reuse status and publication link.
cptac.list_datasets()

Unnamed: 0_level_0,Description,Data reuse status,Publication link
Dataset name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brca,breast cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33212010/
Ccrcc,clear cell renal cell carcinoma (kidney),no restrictions,https://pubmed.ncbi.nlm.nih.gov/31675502/
Colon,colorectal cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/31031003/
Endometrial,endometrial carcinoma (uterine),no restrictions,https://pubmed.ncbi.nlm.nih.gov/32059776/
Gbm,glioblastoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33577785/
Hnscc,head and neck squamous cell carcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33417831/
Lscc,lung squamous cell carcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/34358469/
Luad,lung adenocarcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/32649874/
Ovarian,high grade serous ovarian cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/27372738/
Pdac,pancreatic ductal adenocarcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/34534465/


In [5]:
# Instantiate every dataset at the same pinned version that was downloaded.
ccrcc = cptac.Ccrcc(version='0.1.1')
endo = cptac.Endometrial(version='2.1.1')
luad = cptac.Luad(version='3.1.1')
brca = cptac.Brca(version='5.4')
gbm = cptac.Gbm(version='3.0')
hnscc = cptac.Hnscc(version='2.0')
lscc = cptac.Lscc(version='3.3')
# NOTE(review): named "2020", but cptac links the 'Ovarian' dataset to the 2016 publication — confirm.
ovca_2020 = cptac.Ovarian(version='0.0.1')
pdac = cptac.Pdac(version='1.0')

                                                

In [6]:
# Collect the clinical covariates (gender, age) of the tumour samples of one study
def get_sample_info(data, study):
    """Return gender, age and study label for the tumour samples of one study.

    Parameters
    ----------
    data : object
        Dataset exposing ``get_clinical(tissue_type='tumor')`` that returns a
        pandas DataFrame of clinical annotations.
    study : str
        Study label. 'ccRCC', 'GBM', 'HNSCC', 'Pdac', 'BrCa2020' and 'OvCa2020'
        select study-specific source columns; any other label falls back to
        the columns 'Gender' and 'Age'.

    Returns
    -------
    pandas.DataFrame
        Indexed like the clinical table, with the columns 'Gender'
        (lower-cased), 'Age' (as reported, no unit conversion) and 'Study',
        always in that order.

    Raises
    ------
    KeyError
        If the clinical table lacks the columns expected for `study`.
    """
    # (gender column, age column) of each study whose naming differs from the default
    study_columns = {
        'ccRCC': ('gender', 'age'),
        'GBM': ('gender', 'age'),
        'HNSCC': ('gender', 'age'),
        'Pdac': ('sex', 'age'),
        'BrCa2020': ('Gender', 'Age.in.Month'),
        'OvCa2020': ('Participant_Gender', 'Participant_Procurement_Age'),
    }
    gender_col, age_col = study_columns.get(study, ('Gender', 'Age'))

    clinical = data.get_clinical(tissue_type='tumor')
    # Selecting gender first for every study keeps the column order identical across studies
    sample_info = clinical[[gender_col, age_col]].rename(
        columns={gender_col: 'Gender', age_col: 'Age'})
    sample_info['Gender'] = sample_info['Gender'].str.lower()
    sample_info['Study'] = study
    return sample_info

In [7]:
# Gender/age table of the tumour samples of every CPTAC study
ccrcc_sample_info = get_sample_info(ccrcc, 'ccRCC')
endo_sample_info = get_sample_info(endo, 'UCEC')
brca_2020_sample_info = get_sample_info(brca, 'BrCa2020')
gbm_sample_info = get_sample_info(gbm, 'GBM')
hnscc_sample_info = get_sample_info(hnscc, 'HNSCC')
lscc_sample_info = get_sample_info(lscc, 'LSCC')
luad_sample_info = get_sample_info(luad, 'LUAD')
ovca_2020_sample_info = get_sample_info(ovca_2020, 'OvCa2020')
pdac_sample_info = get_sample_info(pdac, 'Pdac')
# The OvCa2020 and BrCa2020 ages are treated as months, so convert them to years like the other studies
# NOTE(review): 'Age.in.Month' (BrCa2020) states its unit; 'Participant_Procurement_Age' (OvCa2020)
# does not — confirm that it really is in months.
ovca_2020_sample_info[['Age']] /= 12
brca_2020_sample_info[['Age']] /= 12
# ccRCC ages can be the string '>=90'; store those as the number 90
ccrcc_sample_info[['Age']] = ccrcc_sample_info[['Age']].replace('>=90', 90)
# Drop control samples from HNSCC. regex=False makes ".C" a literal marker: read as a regular
# expression the dot matches any character, so any ID with a "C" after its first character
# would be dropped as well.
hnscc_sample_info = hnscc_sample_info[~hnscc_sample_info.index.str.contains(".C", regex=False)]

In [8]:
# 2016 ovarian study: patient barcode (index) and age at diagnosis from the 'KeyClinicalData' sheet
ovca_2016_sample_info = (
    pd.read_excel(
        file_ovca_2016_clinical,
        sheet_name='KeyClinicalData',
        index_col=0,
        usecols=['bcr_patient_barcode', 'age_at_diagnosis'],
    )
    .rename(columns={'age_at_diagnosis': 'Age'})
    # no gender column is read from the sheet; every sample is labelled female
    .assign(Gender='female', Study='OvCa2016')
)
ovca_2016_sample_info.head(3)

Unnamed: 0_level_0,Age,Gender,Study
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-09-1664,37,female,OvCa2016
TCGA-09-2056,62,female,OvCa2016
TCGA-13-1404,48,female,OvCa2016


In [9]:
# 2016 breast-cancer study: patient ID (index), age and sex
brca_2016_clinical = pd.read_csv(file_brca_2016_clinical, sep='\t', index_col=0,
                                 usecols=['Patient ID', 'Age', 'Sex'])
# Restrict to TCGA samples only. A boolean mask is used instead of reindexing on a filtered
# label list, so a missing patient ID (na=False) or a duplicated one no longer raises.
is_tcga = brca_2016_clinical.index.str.contains('TCGA', regex=False, na=False)
brca_2016_sample_info = (
    brca_2016_clinical.loc[is_tcga]
    .rename(columns={'Sex': 'Gender'})
    .assign(Gender=lambda frame: frame['Gender'].str.lower(), Study='BrCa2016')
)
brca_2016_sample_info[:2]

Unnamed: 0_level_0,Age,Gender,Study
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-A1-A0SB,70.0,female,BrCa2016
TCGA-A1-A0SD,59.0,female,BrCa2016


In [10]:
# Stack the per-study tables into one frame (row order follows the list below)
combined_sample = pd.concat(objs=[
    ccrcc_sample_info,
    endo_sample_info,
    brca_2020_sample_info,
    gbm_sample_info,
    hnscc_sample_info,
    lscc_sample_info,
    luad_sample_info,
    ovca_2020_sample_info,
    pdac_sample_info,
    brca_2016_sample_info,
    ovca_2016_sample_info,
])
# Number of samples per gender (missing values are not counted)
combined_sample.loc[:, 'Gender'].value_counts()

Gender
female    1510
male       471
Name: count, dtype: int64

In [11]:
# Number of rows contributed by each study.
combined_sample['Study'].value_counts()

Study
BrCa2016    825
OvCa2016    174
Pdac        145
BrCa2020    122
LSCC        116
HNSCC       111
OvCa2020    111
ccRCC       110
LUAD        110
GBM         105
UCEC         95
Name: count, dtype: int64

In [12]:
# Number of missing values per column.
combined_sample.isnull().sum()

Gender    43
Age       43
Study      0
dtype: int64

In [13]:
# Labels of every row that lacks a gender or an age
isnull_indices = combined_sample.index[
    combined_sample[['Gender', 'Age']].isnull().any(axis=1).to_numpy()
]
combined_sample.loc[isnull_indices]

Unnamed: 0,Gender,Age,Study
CPT000814,,,BrCa2020
CPT001846,,,BrCa2020
X01BR008,,,BrCa2020
X01BR009,,,BrCa2020
X01BR010,,,BrCa2020
X01BR020,,,BrCa2020
X01BR023,,,BrCa2020
X01BR026,,,BrCa2020
X01BR040,,,BrCa2020
X01BR042,,,BrCa2020


In [14]:
# Remove the three 'Samples-*' rows (presumably summary rows rather than samples — verify against
# the source table). Rebinding with errors='ignore' keeps this cell re-runnable: the in-place
# drop raised a KeyError on a second run because the labels were already gone.
combined_sample = combined_sample.drop(index=['Samples-altered', 'Samples-amp', 'Samples-del'],
                                       errors='ignore')

In [15]:
def group_fill_value(values):
    """Return the value used to fill the gaps of one column within one study.

    Numeric columns are filled with their median, every other column with its
    most frequent value. NaN is returned when a non-numeric column has no
    observed value at all, so such a group simply stays missing.
    """
    if pd.api.types.is_numeric_dtype(values.dtype):
        return values.median()
    most_frequent = values.mode()
    return most_frequent.iloc[0] if not most_frequent.empty else np.nan


f = group_fill_value  # the helper's original name, kept for any cell that still refers to it

# impute with the median value if the column type is number, else impute with mode.
# Age is cast to float first: an object-typed Age column would be classified as non-numeric
# and silently filled with the per-study mode instead of the median.
combined_sample_numeric_age = combined_sample.astype({'Age': 'float64'})
combined_sample_imputed = combined_sample_numeric_age.fillna(
    combined_sample_numeric_age.groupby('Study').transform(group_fill_value))

In [16]:
# Show the rows that had missing values, skipping labels that contain 'Sample'
combined_sample_imputed.loc[
    list(filter(lambda label: 'Sample' not in label, isnull_indices))
]

Unnamed: 0,Gender,Age,Study
CPT000814,female,50.0,BrCa2020
CPT001846,female,50.0,BrCa2020
X01BR008,female,50.0,BrCa2020
X01BR009,female,50.0,BrCa2020
X01BR010,female,50.0,BrCa2020
X01BR020,female,50.0,BrCa2020
X01BR023,female,50.0,BrCa2020
X01BR026,female,50.0,BrCa2020
X01BR040,female,50.0,BrCa2020
X01BR042,female,50.0,BrCa2020


In [17]:
# Cast Age to float64 so the column has a single numeric dtype.
combined_sample_imputed = combined_sample_imputed.astype({'Age':'float64'})

In [18]:
# Write the imputed table to ../local_data/processed_data/CPTAC_sample_info.parquet.
combined_sample_imputed.to_parquet(get_tables_path('CPTAC_sample_info.parquet'))