In [1]:
import numpy as np
import pandas as pd
import glob
from sklearn.preprocessing import LabelEncoder
import os
import shutil

In [2]:
# Directory locations used by this notebook; all are relative paths.
strClinicalInformationPath = "./clinical_information/"
strEmbeddingPath = "./../nas/TCGA/CHIEF_features/"
strDataPath = "./../nas/TCGA/Preprocessing/"

# Sensitive attribute name mapped to the list of group labels of interest.
sensitive = dict(race=["white", "black or african american"])

In [3]:
def fClinicalInformation(cancer):
    """Load one cohort's clinical table and attach survival and age fields.

    Reads ``clinical_information.pkl`` and ``clinical.tsv`` from the directory
    ``cancer`` and left-joins the TSV columns ``days_to_death``,
    ``days_to_last_follow_up`` and ``age_at_index`` onto the pickled table
    using ``case_submitter_id`` as the key.

    Parameters
    ----------
    cancer : str
        Path of the cohort directory. It is used as a glob pattern prefix, and
        the first match for each file name is read.

    Returns
    -------
    pandas.DataFrame
        Every row of the pickled table (left join) with the three TSV columns
        appended. A case that appears on several TSV rows yields several
        output rows; the caller is responsible for de-duplicating.

    Raises
    ------
    FileNotFoundError
        If either input file cannot be found under ``cancer``.
    """
    def _first_match(filename):
        # Resolve the file through glob, and fail with a message that names
        # the missing file instead of an opaque IndexError.
        pattern = f"{cancer}/{filename}"
        matches = glob.glob(pattern)
        if not matches:
            raise FileNotFoundError(f"no file matches {pattern!r}")
        return matches[0]

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    pickled = pd.read_pickle(_first_match("clinical_information.pkl"))
    tsv = pd.read_csv(_first_match("clinical.tsv"), sep="\t")

    # pd.merge already returns a fresh frame with a default RangeIndex, so no
    # further concatenation or re-indexing is needed.
    return pd.merge(
        pickled,
        tsv[["case_submitter_id", "days_to_death", "days_to_last_follow_up", "age_at_index"]],
        on="case_submitter_id",
        how="left",
    )

In [4]:
def fReduceDataFrame(df):
    """Derive survival columns ``event`` and ``T`` and drop unusable rows.

    A value counts as missing when it is the placeholder string ``'--`` or is
    NaN. NaN appears when a case had no matching row in the left merge that
    produced ``df``; treating it as a recorded death would mislabel the case.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``days_to_death`` and ``days_to_last_follow_up``. The
        input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, containing every input column
        plus:

        * ``event`` -- 1 if ``days_to_death`` is present (death), else 0 (alive).
        * ``T`` -- ``days_to_death`` when present, otherwise
          ``days_to_last_follow_up``. Values keep their original type.

        Rows where both time fields are missing are removed. No column
        projection or renaming is applied.
    """
    placeholder = "'--"

    def _is_missing(series):
        # Missing means either the placeholder string or a real NaN/None.
        return series.isna() | (series == placeholder)

    both_missing = _is_missing(df['days_to_death']) & _is_missing(df['days_to_last_follow_up'])
    dfClinicalInformation = df.copy()[~both_missing].reset_index(drop=True)

    has_death = ~_is_missing(dfClinicalInformation['days_to_death'])
    dfClinicalInformation['event'] = has_death.astype(int)  # 1: death 0: alive
    dfClinicalInformation['T'] = dfClinicalInformation['days_to_death'].where(
        has_death, dfClinicalInformation['days_to_last_follow_up']
    )
    return dfClinicalInformation

In [5]:
def fTransSensitive(sensitive, df):
    """Keep the rows whose ``sensitive`` value contains one of the group labels.

    Parameters
    ----------
    sensitive : dict
        Maps an attribute name to a list of label substrings. Only the first
        entry of the dict is used.
    df : pandas.DataFrame
        Must contain a ``sensitive`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (original index preserved) whose ``sensitive``
        value is a string containing at least one of the labels. Non-string
        values such as NaN never match.
    """
    substrings = list(sensitive.values())[0]
    flags = [
        # Guard against NaN/None: `label in value` raises TypeError on them.
        isinstance(value, str) and any(label in value for label in substrings)
        for value in df["sensitive"].tolist()
    ]
    # Use an explicit boolean array with .loc so an empty frame yields zero
    # rows; `df[[]]` would be read as "select zero columns" instead.
    return df.loc[np.asarray(flags, dtype=bool)]

In [11]:
def getCancerCalculation(path, cancerType, sensitive):
    """Join one cohort's clinical data to its embedding files and summarise it.

    Parameters
    ----------
    path : str
        Cohort directory passed to ``fClinicalInformation``.
    cancerType : str
        Cohort name. Embedding files are searched with the pattern
        ``{strEmbeddingPath}/*{cancerType}/*.pt``.
    sensitive : dict
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        * The clinical table, de-duplicated by ``case_submitter_id`` and
          reduced by ``fReduceDataFrame``, then inner-joined to the embedding
          files. It gains ``folder_id`` (file name without extension) and
          ``stage`` (0 for Stage I/IA/IB, 1 for Stage II/IIA/IIB/IIC, NaN
          otherwise).
        * A dict of summary statistics: age mean and standard deviation, and
          counts by gender, race and stage. Each ``*_NA`` entry is the number
          of rows not covered by the two named categories.

    Notes
    -----
    NOTE(review): the statistics are computed on the joined table, which has
    one row per matching ``.pt`` file. A case with several files is therefore
    counted several times -- confirm this is intended.
    """
    dfClinicalInformation = fClinicalInformation(path)
    dfClinicalInformation = fReduceDataFrame(
        dfClinicalInformation.drop_duplicates(subset='case_submitter_id', ignore_index=True)
    )

    lsDownloadPath = glob.glob(f'{strEmbeddingPath}/*{cancerType}/*.pt')
    # Strip directory and extension with os.path so parsing does not depend
    # on the platform's path separator.
    lsDownloadFoldID = [os.path.splitext(os.path.basename(s))[0] for s in lsDownloadPath]
    # The first 12 characters of the file stem are used as the case id.
    lsDownloadCaseSubmitterId = [s[:12] for s in lsDownloadFoldID]
    dfDownload = pd.DataFrame({
        'case_submitter_id': lsDownloadCaseSubmitterId,
        'folder_id': lsDownloadFoldID
    })
    dfClinicalInformation = pd.merge(dfClinicalInformation, dfDownload, on="case_submitter_id")

    stage_map = {
        'Stage I': 0,
        'Stage IA': 0,
        'Stage IB': 0,
        'Stage II': 1,
        'Stage IIA': 1,
        'Stage IIB': 1,
        'Stage IIC': 1
    }
    # Stages absent from the map become NaN and are counted in stage_NA.
    dfClinicalInformation['stage'] = dfClinicalInformation['ajcc_pathologic_stage'].map(stage_map)

    total = len(dfClinicalInformation)

    def _count_equal(column, value):
        # Number of rows whose `column` equals `value`, as a plain int.
        return int((dfClinicalInformation[column] == value).sum())

    female_count = _count_equal('gender', 'female')
    male_count = _count_equal('gender', 'male')
    white_count = _count_equal('race', 'white')
    black_count = _count_equal('race', 'black or african american')
    stage_I_count = _count_equal('stage', 0)
    stage_II_count = _count_equal('stage', 1)

    # Non-numeric ages (placeholders, blanks) are coerced to NaN and ignored.
    valid_age = pd.to_numeric(dfClinicalInformation['age_at_index'], errors='coerce').dropna()
    age_mean = valid_age.mean()
    age_std = valid_age.std()

    count = {
        "cancer": cancerType,
        "age_mean": round(age_mean, 2) if not np.isnan(age_mean) else 'N/A',
        "age_std": round(age_std, 2) if not np.isnan(age_std) else 'N/A',
        "female": female_count,
        "male": male_count,
        "gender_NA": total - female_count - male_count,
        "white": white_count,
        "black": black_count,
        "race_NA": total - white_count - black_count,
        "stage_I": stage_I_count,
        "stage_II": stage_II_count,
        "stage_NA": total - stage_I_count - stage_II_count
    }

    return dfClinicalInformation, count

In [13]:
# Build one summary row per cohort directory and write the table to CSV.
# The search root comes from the configuration cell's strDataPath.
cancers = glob.glob(f"{strDataPath}*")
records = []
for cancer in cancers:
    # The cohort name is the last "_"-separated token of the directory name.
    name = os.path.basename(cancer).split('_')[-1]
    print(name)
    try:
        dfClinicalInformation, count = getCancerCalculation(cancer, name, sensitive)
    except Exception as exc:
        # Best effort: keep processing the other cohorts, but report why this
        # one was skipped instead of silently dropping it from the table.
        print(f'skipped {name}: {type(exc).__name__}: {exc}')
        continue
    if len(dfClinicalInformation) < 1:
        print('empty')
        continue
    records.append(count)
    print(len(dfClinicalInformation))
data = pd.DataFrame(records)
print(data)
data.to_csv('cancer_info.csv', index = False)

BRCA
3043
GBM
2008
OV
1379
LUAD
1574
UCEC
1356
KIRC
2160
HNSC
1197
LGG
1560
THCA
1158
LUSC
1597
PRAD
1164
SKCM
931
COAD
1414
STAD
1140
BLCA
924
LIHC
760
CESC
604
KIRP
765
SARC
890
LAML
ESCA
396
PAAD
466
PCPG
385
READ
519
TGCT
353
THYM
315
KICH
322
ACC
323
MESO
173
UVM
150
DLBC
103
UCS
150
CHOL
107
   cancer  age_mean  age_std  female  male  gender_NA  white  black  race_NA  \
0    BRCA     58.23    13.33    3008    35          0   2196    436      411   
1     GBM     57.35    14.17     760  1248          0   1741    158      109   
2      OV     59.45    11.34    1379     0          0   1185     77      117   
3    LUAD     65.21    10.13     875   699          0   1239    160      175   
4    UCEC     63.77    11.06    1356     0          0    923    275      158   
5    KIRC     60.77    12.07     761  1399          0   1935    161       64   
6    HNSC     61.26    11.92     326   871          0   1027    113       57   
7     LGG     42.89    13.59     668   892          0   1422 