In [1]:
import numpy as np
import pandas as pd
import glob
from sklearn.preprocessing import LabelEncoder
import os
import shutil

In [2]:
# --- Notebook configuration -------------------------------------------------
# Folder that receives the per-project copies of clinical_information.pkl.
strClinicalInformationPath = './clinical_information/'
# Folder that is scanned for the '*.pt' embedding files.
strEmbeddingPath = './../Fairness-Foundation/AI_fairness/'
# Root folder holding one sub-folder per project, each with a clinical_information.pkl.
strDataPath = './../nas/TCGA/Preprocessing/'

# Sensitive attribute under study: maps the clinical column name to the list of
# values (matched as substrings by fTransSensitive) that are kept.
sensitive = {
    'gender': ['female', 'male'],
}

In [3]:
def fClinicalInformation(cancer):
    """Load and stack the clinical-information tables of several projects.

    For every entry ``c`` of ``cancer`` the file
    ``{strDataPath}/*{c}/clinical_information.pkl`` is located with glob and
    read with ``pd.read_pickle``; all tables are concatenated row-wise with a
    fresh 0..n-1 index.

    Parameters
    ----------
    cancer : iterable of str
        Project suffixes used in the glob pattern (e.g. ``["LUAD", "LUSC"]``).

    Returns
    -------
    pandas.DataFrame
        The stacked tables, or an empty DataFrame when ``cancer`` is empty.

    Raises
    ------
    FileNotFoundError
        If no pickle matches the pattern for one of the entries.
    """
    parts = []
    for c in cancer:
        pattern = f'{strDataPath}/*{c}/clinical_information.pkl'
        # glob returns matches in arbitrary order; sort so the choice is
        # reproducible when more than one folder ends with the same suffix.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(f'no clinical information pickle matches {pattern!r}')
        # NOTE: unpickling executes arbitrary code - only read files you trust.
        parts.append(pd.read_pickle(matches[0]))

    if not parts:
        return pd.DataFrame({})
    # Concatenate once instead of re-copying the accumulated frame per project.
    return pd.concat(parts, ignore_index = True)

In [4]:
def fReduceDataFrame(cancer, df, sensitive):
    """Project the clinical table onto the four columns used downstream.

    The returned frame has the columns
    ``['case_submitter_id', 'sensitive', 'label', 'project_id']``:

    * ``sensitive`` is the column named by the first key of ``sensitive``;
    * ``label`` is ``primary_diagnosis`` when exactly one project is given,
      otherwise it is a second copy of ``project_id``.

    Parameters
    ----------
    cancer : sequence
        Projects being processed; only its length is used.
    df : pandas.DataFrame
        Table containing the columns selected above.
    sensitive : dict
        Configuration whose first key names the sensitive column.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns; ``df`` itself is left unchanged.
    """
    strSensitiveColumn = list(sensitive.keys())[0]
    strLabelColumn = 'primary_diagnosis' if len(cancer) == 1 else 'project_id'

    dfReduced = df[['case_submitter_id', strSensitiveColumn, strLabelColumn, 'project_id']]
    dfReduced.columns = ['case_submitter_id', 'sensitive', 'label', 'project_id']
    return dfReduced

In [5]:
def fTransSensitive(sensitive, df):
    """Keep only the rows whose 'sensitive' value matches a configured group.

    A row is kept when its ``sensitive`` value is a string that contains at
    least one of the values listed under the first key of ``sensitive``.
    Non-string values (NaN / None) never match instead of raising TypeError.

    NOTE: matching is by substring, so ``"male"`` also matches ``"female"``;
    list every group you want to keep and do not rely on this function to
    exclude a group whose name contains another group's name.

    Parameters
    ----------
    sensitive : dict
        Configuration; the value list of its first key holds the substrings.
    df : pandas.DataFrame
        Frame with a ``sensitive`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index and all columns.
    """
    substrings = sensitive[list(sensitive.keys())[0]]
    mask = [
        isinstance(value, str) and any(sub in value for sub in substrings)
        for value in df['sensitive'].tolist()
    ]
    # A typed boolean array keeps this a row filter even when the frame is
    # empty (an empty plain list would be read as "select no columns").
    return df[np.asarray(mask, dtype = bool)]

In [6]:
# Root folder of the preprocessed data (one sub-folder per project); also used
# by the export loop further down to locate each clinical_information.pkl.
data_path = './../nas/TCGA/Preprocessing/'
# glob yields entries in arbitrary, filesystem-dependent order; sort them so
# the listing is reproducible between runs and machines.
cancers = sorted(glob.glob(f'{data_path}/*'))
print(cancers)

['./../nas/TCGA/Preprocessing/01_BRCA', './../nas/TCGA/Preprocessing/02_GBM', './../nas/TCGA/Preprocessing/03_OV', './../nas/TCGA/Preprocessing/04_LUAD', './../nas/TCGA/Preprocessing/05_UCEC', './../nas/TCGA/Preprocessing/06_KIRC', './../nas/TCGA/Preprocessing/07_HNSC', './../nas/TCGA/Preprocessing/08_LGG', './../nas/TCGA/Preprocessing/09_THCA', './../nas/TCGA/Preprocessing/10_LUSC', './../nas/TCGA/Preprocessing/11_PRAD', './../nas/TCGA/Preprocessing/12_SKCM', './../nas/TCGA/Preprocessing/13_COAD', './../nas/TCGA/Preprocessing/14_STAD', './../nas/TCGA/Preprocessing/15_BLCA', './../nas/TCGA/Preprocessing/16_LIHC', './../nas/TCGA/Preprocessing/17_CESC', './../nas/TCGA/Preprocessing/18_KIRP', './../nas/TCGA/Preprocessing/19_SARC', './../nas/TCGA/Preprocessing/20_LAML', './../nas/TCGA/Preprocessing/21_ESCA', './../nas/TCGA/Preprocessing/22_PAAD', './../nas/TCGA/Preprocessing/23_PCPG', './../nas/TCGA/Preprocessing/24_READ', './../nas/TCGA/Preprocessing/25_TGCT', './../nas/TCGA/Preprocessing

In [7]:
def getCancerClassification(cancerType, sensitive, intTumor = 0, intDiagnosticSlide = 0):
    """Build the encoded case table for a cancer-classification task.

    Pipeline:

    1. Load the clinical tables of every project in ``cancerType``, keep one
       row per ``case_submitter_id`` and reduce them with ``fReduceDataFrame``.
    2. List the ``*.pt`` files under ``strEmbeddingPath``; the file stem is the
       ``folder_id`` and its first 12 characters are the ``case_submitter_id``.
    3. Inner-join clinical rows and files on ``case_submitter_id``.
    4. Filter the files on characters 20:22 of ``folder_id`` (``'DX'``).
    5. If more than one row is left, encode ``label`` and ``sensitive`` as
       integers.

    Label encoding:

    * one project: ``primary_diagnosis`` is matched by substring against two
      lists of diagnoses (0 = first list, 1 = second list); rows matching
      neither are dropped.
      NOTE(review): the diagnosis lists are hard-coded duct/lobular carcinoma
      names - confirm they are meant to apply to every single-project call.
    * several projects: ``project_id`` is encoded with ``LabelEncoder``
      (classes in sorted order).

    Parameters
    ----------
    cancerType : list of str
        Project suffixes, e.g. ``["LUAD", "LUSC"]``.
    sensitive : dict
        Sensitive-attribute configuration (first key = column, value = groups).
    intTumor : int, default 0
        0 keeps files whose stem has ``'0'`` at index 13, 1 keeps the others,
        any other value disables the filter. (Presumably index 13 is the
        sample-type digit of the barcode - TODO confirm.)
    intDiagnosticSlide : int, default 0
        0 keeps files whose stem has ``'DX'`` at 20:22, 1 keeps the others,
        any other value disables the filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``case_submitter_id, sensitive, label, project_id, folder_id``.
        When at most one row survives the filters the frame is returned
        without integer encoding.
    """
    dfClinicalInformation = fClinicalInformation(cancerType)
    dfClinicalInformation = fReduceDataFrame(
        cancerType,
        dfClinicalInformation.drop_duplicates(subset = 'case_submitter_id', ignore_index = True),
        sensitive,
    )

    # File stem without the '.pt' extension, independent of the path separator.
    lsDownloadPath = glob.glob(f'{strEmbeddingPath}/*.pt')
    lsDownloadFoldID = [os.path.splitext(os.path.basename(p))[0] for p in lsDownloadPath]

    # Slicing (s[13:14]) instead of indexing so a short file name is simply
    # filtered out rather than raising IndexError.
    if intTumor == 0:
        lsDownloadFoldID = [s for s in lsDownloadFoldID if s[13:14] == '0']
    elif intTumor == 1:
        lsDownloadFoldID = [s for s in lsDownloadFoldID if s[13:14] not in ('', '0')]

    dfDownload = pd.DataFrame({
        'case_submitter_id': [s[:12] for s in lsDownloadFoldID],
        'folder_id': lsDownloadFoldID,
    })
    dfClinicalInformation = pd.merge(dfClinicalInformation, dfDownload, on = "case_submitter_id")

    if intDiagnosticSlide in (0, 1):       # 0 for formalin
        # Typed boolean array: stays a row filter even when the merge is empty.
        isDX = np.asarray(
            [s[20:22] == 'DX' for s in dfClinicalInformation['folder_id'].tolist()],
            dtype = bool,
        )
        keep = isDX if intDiagnosticSlide == 0 else ~isDX
        dfClinicalInformation = dfClinicalInformation[keep].reset_index(drop = True)

    # NOTE(review): the threshold is "> 1", so a single surviving row is
    # returned un-encoded - confirm this is intended rather than "> 0".
    if len(dfClinicalInformation) > 1:
        if len(cancerType) == 1:
            positive = ['Infiltrating duct and lobular carcinoma', 'Infiltrating duct carcinoma', 'Infiltrating duct mixed with other types of carcinoma']
            negative = ['Infiltrating lobular mixed with other types of carcinoma', 'Lobular carcinoma']
            bags = [positive, negative]
            unmatched = len(bags)
            dfClinicalInformation['label'] = [
                _fMatchLabelBag(lb, bags) for lb in dfClinicalInformation['label'].tolist()
            ]
            dfClinicalInformation = dfClinicalInformation[dfClinicalInformation['label'] != unmatched]
        else:
            dfClinicalInformation['label'] = LabelEncoder().fit_transform(dfClinicalInformation['label'].values)

        dfClinicalInformation = fTransSensitive(sensitive, dfClinicalInformation).reset_index(drop = True)
        # Nothing to encode when every row was filtered out.
        if len(dfClinicalInformation) > 0:
            dfClinicalInformation['sensitive'] = LabelEncoder().fit_transform(dfClinicalInformation['sensitive'].values)
    return dfClinicalInformation


def _fMatchLabelBag(label, bags):
    """Return the index of the first bag with an entry contained in ``label``.

    ``len(bags)`` is returned when nothing matches or ``label`` is not a string.
    """
    if isinstance(label, str):
        for idx, conds in enumerate(bags):
            if any(cond in label for cond in conds):
                return idx
    return len(bags)

In [21]:
# Project groups to turn into classification datasets. Each group is processed
# independently; its encoded table is written to CSV and summarised in `data`.
cancers = [["LUAD", "LUSC"], ["KIRP", "KIRC", "KICH"], ["KIRP", "KIRC"], ["KIRP", "KICH"], ["KIRC", "KICH"], ["COAD", "READ"], ["GBM", "LGG"]]
# One tuple per processed group:
# (cancer, <count per (label, sensitive) pair>, <count per label>,
#  sensitive_0, sensitive_1, total)
data = []

# Create the output folders up front so shutil.copy / to_csv cannot fail on a
# fresh checkout.
os.makedirs('./clinical_information/', exist_ok = True)
os.makedirs('./datasets/gender/cancerClassification/', exist_ok = True)

for cancer in cancers:
    print(cancer)
    dfClinicalInformationOthers = getCancerClassification(cancer, sensitive)
    if len(dfClinicalInformationOthers) < 1:
        continue

    # Up to two projects are summarised as labels 0/1, larger groups as 0/1/2.
    n_labels = 2 if len(cancer) <= 2 else 3
    label_counts = [
        dfClinicalInformationOthers[dfClinicalInformationOthers['label'] == k].shape[0]
        for k in range(n_labels)
    ]
    sensitive_0 = dfClinicalInformationOthers[dfClinicalInformationOthers['sensitive'] == 0].shape[0]
    sensitive_1 = dfClinicalInformationOthers[dfClinicalInformationOthers['sensitive'] == 1].shape[0]

    # Message text is kept exactly as before so the recorded outputs stay comparable.
    if n_labels == 2:
        print(f'Label 0 count: {label_counts[0]}, Label 1 count: {label_counts[1]}')
        print(f'Sensitive female count: {sensitive_0}, Sensitive male: {sensitive_1}')
    else:
        print(f'Label 0 count: {label_counts[0]}, Label 1 count: {label_counts[1]}', f'Label 2 count: {label_counts[2]}')
        print(f'Sensitive female count: {sensitive_0}, Sensitive male count: {sensitive_1}')

    groups = dfClinicalInformationOthers.groupby(['label', 'sensitive'])
    tmp = {}
    for group, df in groups:
        print(f'({group[0]}, {group[1]}), count={df.shape[0]}')
        tmp[f'({group[0]}, {group[1]})'] = df.shape[0]

    # A (label, sensitive) combination without rows does not appear in the
    # groupby result; report it as 0 instead of raising KeyError.
    group_counts = [tmp.get(f'({k}, {s})', 0) for k in range(n_labels) for s in (0, 1)]
    data.append((cancer, *group_counts, *label_counts, sensitive_0, sensitive_1, len(dfClinicalInformationOthers)))

    print(f'total: {len(dfClinicalInformationOthers)}')

    # Keep a local copy of each project's raw clinical table next to the CSV.
    for c in cancer:
        ori_path = glob.glob(f'{data_path}/*{c}/clinical_information.pkl')[0]
        shutil.copy(ori_path, f'./clinical_information/{c}_clinical_information.pkl')
    new_path_name = "_".join(cancer)
    dfClinicalInformationOthers.to_csv(f'./datasets/gender/cancerClassification/{new_path_name}_cancerClassification.csv', index=False)

['LUAD', 'LUSC']
Label 0 count: 531, Label 1 count: 512
Sensitive female count: 421, Sensitive male: 622
(0, 0), count=296
(0, 1), count=235
(1, 0), count=125
(1, 1), count=387
total: 1043
['KIRP', 'KIRC', 'KICH']
Label 0 count: 109, Label 1 count: 519 Label 2 count: 297
Sensitive female count: 309, Sensitive male count: 616
(0, 0), count=50
(0, 1), count=59
(1, 0), count=184
(1, 1), count=335
(2, 0), count=75
(2, 1), count=222
total: 925
['KIRP', 'KIRC']
Label 0 count: 519, Label 1 count: 297
Sensitive female count: 259, Sensitive male: 557
(0, 0), count=184
(0, 1), count=335
(1, 0), count=75
(1, 1), count=222
total: 816
['KIRP', 'KICH']
Label 0 count: 109, Label 1 count: 297
Sensitive female count: 125, Sensitive male: 281
(0, 0), count=50
(0, 1), count=59
(1, 0), count=75
(1, 1), count=222
total: 406
['KIRC', 'KICH']
Label 0 count: 109, Label 1 count: 519
Sensitive female count: 234, Sensitive male: 394
(0, 0), count=50
(0, 1), count=59
(1, 0), count=184
(1, 1), count=335
total: 628

In [22]:
# Sanity check: reload the LUAD clinical table copied by the export loop and
# show it. NOTE: unpickling executes arbitrary code - only read trusted files.
strLuadPicklePath = './clinical_information/LUAD_clinical_information.pkl'
df_test = pd.read_pickle(strLuadPicklePath)
print(df_test)

     case_submitter_id project_id  gender   race    primary_diagnosis  \
0         TCGA-50-5051  TCGA-LUAD  female  white  Adenocarcinoma, NOS   
1         TCGA-50-5051  TCGA-LUAD  female  white  Adenocarcinoma, NOS   
2         TCGA-50-5051  TCGA-LUAD  female  white  Adenocarcinoma, NOS   
3         TCGA-50-5051  TCGA-LUAD  female  white  Adenocarcinoma, NOS   
4         TCGA-62-A471  TCGA-LUAD    male  white  Adenocarcinoma, NOS   
...                ...        ...     ...    ...                  ...   
3211      TCGA-55-7574  TCGA-LUAD  female  white  Adenocarcinoma, NOS   
3212      TCGA-55-7574  TCGA-LUAD  female  white  Adenocarcinoma, NOS   
3213      TCGA-55-7574  TCGA-LUAD  female  white  Adenocarcinoma, NOS   
3214      TCGA-55-7574  TCGA-LUAD  female  white  Adenocarcinoma, NOS   
3215      TCGA-55-7574  TCGA-LUAD  female  white  Adenocarcinoma, NOS   

     ajcc_pathologic_stage                                          folder_id  \
0               Stage IIIA  TCGA-50-5051-0