In [1]:
import numpy as np
import pandas as pd
import glob
from sklearn.preprocessing import LabelEncoder
import os
import shutil

In [2]:
# Directory holding `{cancer}_clinical_information.pkl` and `msi.pkl`
# (both are read by fClinicalInformation).
strClinicalInformationPath = "./clinical_information/"
# Directory that getGeneticClassification globs for `*.pt` files.
strEmbeddingPath = "./../Fairness-Foundation/AI_fairness/"
# Only referenced from a commented-out copy step in the visible cells.
strDataPath = "./../nas/TCGA/Preprocessing/"

# Single-key dict: sensitive attribute column -> group-name substrings to keep.
# The driver cells further down reassign this before each run.
sensitive = {"race": ["white", "black or african american"]}

In [3]:
def fClinicalInformation(cancer):
    """Load one cohort's clinical table and attach the labels from ``msi.pkl``.

    Parameters
    ----------
    cancer : str
        Cohort prefix; the table is read from
        ``{strClinicalInformationPath}/{cancer}_clinical_information.pkl``.

    Returns
    -------
    pandas.DataFrame
        Clinical rows joined with the label table on ``case_submitter_id``
        (``pd.merge`` default, i.e. an inner join), with the label stored in
        a ``label`` column. Cases that have no label entry are therefore
        dropped.

    Raises
    ------
    FileNotFoundError
        If no file matches either of the two expected pickle paths.
    """
    def _first_match(pattern):
        # Fail with the offending pattern instead of a bare IndexError on [0].
        matches = glob.glob(pattern)
        if not matches:
            raise FileNotFoundError(f'No file matches {pattern!r}')
        return matches[0]

    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    clinical = pd.read_pickle(
        _first_match(f'{strClinicalInformationPath}/{cancer}_clinical_information.pkl')
    )
    # The label pickle is consumed through .items(), i.e. it is expected to be
    # a mapping of case_submitter_id -> label.
    label = pd.read_pickle(_first_match(f'{strClinicalInformationPath}/msi.pkl'))
    label_df = pd.DataFrame(list(label.items()), columns=['case_submitter_id', 'label'])

    # The clinical table is merged directly: concatenating it onto an empty
    # DataFrame first added nothing and triggers pandas' empty-entry
    # FutureWarning on recent versions.
    return pd.merge(clinical, label_df, on='case_submitter_id')

In [4]:
def fReduceDataFrame(cancer, df, sensitive):
    """Project the clinical table down to case id, sensitive attribute and label.

    Parameters
    ----------
    cancer : str
        Accepted for call-site symmetry; not used by this function.
    df : pandas.DataFrame
        Must contain ``case_submitter_id``, ``label`` and the column named by
        the first key of ``sensitive``.
    sensitive : dict
        Its first key names the sensitive-attribute column to keep.

    Returns
    -------
    pandas.DataFrame
        Three columns, in order: ``case_submitter_id``, ``sensitive``, ``label``.
    """
    attribute_column = list(sensitive)[0]
    kept_columns = ['case_submitter_id', attribute_column, 'label']
    reduced = df[kept_columns]
    # Columns are renamed by position so the attribute column always becomes
    # 'sensitive', whatever it was called in the source table.
    reduced.columns = ['case_submitter_id', 'sensitive', 'label']
    return reduced

In [5]:
def fTransSensitive(sensitive, df):
    """Keep only the rows whose ``sensitive`` value contains a configured group name.

    Parameters
    ----------
    sensitive : dict
        Single-key dict; the value under its first key is the list of
        substrings to look for.
    df : pandas.DataFrame
        Must contain a ``sensitive`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (original index kept) whose ``sensitive`` value is
        a string containing at least one of the substrings. Missing or
        non-string values never match.

    Notes
    -----
    Matching is by substring, not equality: ``"male"`` also matches
    ``"female"``.
    """
    substrings = sensitive[list(sensitive.keys())[0]]
    # A typed boolean array is used so that an empty frame yields an empty
    # *row* mask; a plain empty list would be read by pandas as "select zero
    # columns" and drop every column. Non-string cells (e.g. NaN) are treated
    # as non-matching instead of raising TypeError on `in`.
    mask = np.array(
        [
            isinstance(value, str) and any(sub in value for sub in substrings)
            for value in df['sensitive'].tolist()
        ],
        dtype=bool,
    )
    return df[mask]

In [6]:
def getGeneticClassification(cancerType, sensitive):
    """Build the per-file classification table for one cohort.

    Steps, in order:
      1. Load the clinical table with labels, keep one row per
         ``case_submitter_id`` and reduce it to id / sensitive / label.
      2. List the ``*.pt`` files under ``strEmbeddingPath``; each file stem
         becomes a ``folder_id`` and its first 12 characters are taken as the
         ``case_submitter_id``.
      3. Join both tables on ``case_submitter_id`` (one row per matching file).
      4. Integer-encode ``label``, filter rows with ``fTransSensitive`` and
         integer-encode ``sensitive``.

    Parameters
    ----------
    cancerType : str
        Cohort prefix passed to ``fClinicalInformation``.
    sensitive : dict
        Single-key dict: sensitive attribute column -> group substrings.

    Returns
    -------
    pandas.DataFrame
        Columns ``case_submitter_id``, ``sensitive``, ``label``, ``folder_id``.
        ``label`` and ``sensitive`` hold LabelEncoder codes (assigned in
        sorted order of the values present). If the join leaves fewer than
        two rows, the joined frame is returned untouched: not encoded and not
        filtered.
    """
    dfClinicalInformation = fClinicalInformation(cancerType)
    dfClinicalInformation = fReduceDataFrame(
        cancerType,
        dfClinicalInformation.drop_duplicates(subset='case_submitter_id', ignore_index=True),
        sensitive,
    )

    lsDownloadPath = glob.glob(f'{strEmbeddingPath}/*.pt')
    # os.path handles the platform's separator; splitting on '/' by hand leaves
    # a stray separator in the name wherever glob joins with a backslash.
    lsDownloadFoldID = [os.path.splitext(os.path.basename(s))[0] for s in lsDownloadPath]
    # Optional sample-type filter (disabled):
    # intTumor = 0
    # if(intTumor == 0):
    #     lsDownloadFoldID = np.array(lsDownloadFoldID)[[s[13] == '0' for s in lsDownloadFoldID]].tolist()
    # elif(intTumor == 1):
    #     lsDownloadFoldID = np.array(lsDownloadFoldID)[[s[13] != '0' for s in lsDownloadFoldID]].tolist()

    lsDownloadCaseSubmitterId = [s[:12] for s in lsDownloadFoldID]
    dfDownload = pd.DataFrame({
        'case_submitter_id': lsDownloadCaseSubmitterId,
        'folder_id': lsDownloadFoldID,
    })
    dfClinicalInformation = pd.merge(dfClinicalInformation, dfDownload, on="case_submitter_id")

    # Optional slide-type filter (disabled):
    # intDiagnosticSlide = 0       # 0 for formalin
    # if(intDiagnosticSlide == 0):
    #     dfClinicalInformation = dfClinicalInformation[['DX' in s[20:22] for s in dfClinicalInformation['folder_id'].tolist()]].reset_index(drop = True)
    # elif(intDiagnosticSlide == 1):
    #     dfClinicalInformation = dfClinicalInformation[['DX' not in s[20:22] for s in dfClinicalInformation['folder_id'].tolist()]].reset_index(drop = True)

    # NOTE(review): with exactly one row the frame is returned un-encoded and
    # unfiltered, while the callers in this notebook only skip empty frames.
    # Confirm whether the threshold was meant to be `> 0`.
    if len(dfClinicalInformation) <= 1:
        return dfClinicalInformation

    # Labels are encoded before the sensitive filter, so the label codes are
    # derived from every joined row, not only from the rows that are kept.
    dfClinicalInformation['label'] = LabelEncoder().fit_transform(dfClinicalInformation['label'].values)

    dfClinicalInformation = fTransSensitive(sensitive, dfClinicalInformation).reset_index(drop=True)
    dfClinicalInformation['sensitive'] = LabelEncoder().fit_transform(dfClinicalInformation['sensitive'].values)
    return dfClinicalInformation

In [7]:
# Build, summarise and export the gender-stratified table for each cohort.
# `label` / `sensitive` values 0 and 1 are the LabelEncoder codes produced by
# getGeneticClassification.
sensitive = {"gender": ["female", "male"]}
cancers = ["COAD", "READ"]
data = []  # one summary tuple per cohort: (cancer, four group counts, label counts, sensitive counts, total)
strOutputDir = './datasets/gender/geneticClassification'
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(strOutputDir, exist_ok=True)
for cancer in cancers:
    print(cancer)
    dfClinicalInformationOthers = getGeneticClassification(cancer, sensitive)
    if len(dfClinicalInformationOthers) < 1:
        # Nothing matched for this cohort: no summary row, no CSV.
        continue

    groups = dfClinicalInformationOthers.groupby(['label', 'sensitive'])
    count_0 = dfClinicalInformationOthers[dfClinicalInformationOthers['label'] == 0].shape[0]
    count_1 = dfClinicalInformationOthers[dfClinicalInformationOthers['label'] == 1].shape[0]
    sensitive_0 = dfClinicalInformationOthers[dfClinicalInformationOthers['sensitive'] == 0].shape[0]
    sensitive_1 = dfClinicalInformationOthers[dfClinicalInformationOthers['sensitive'] == 1].shape[0]
    print(f'Label 0 count: {count_0}, Label 1 count: {count_1}')
    print(f'Sensitive female count: {sensitive_0}, Sensitive male: {sensitive_1}')
    # Pre-seeded with zeros so a (label, sensitive) combination that does not
    # occur still reports a count of 0.
    tmp = {'(0, 0)': 0, '(0, 1)': 0, '(1, 0)': 0, '(1, 1)': 0}
    for group, df in groups:
        print(f'({group[0]}, {group[1]}), count={df.shape[0]}')
        tmp[f'({group[0]}, {group[1]})'] = df.shape[0]
    data.append((cancer, tmp['(0, 0)'], tmp['(0, 1)'], tmp['(1, 0)'], tmp['(1, 1)'], count_0, count_1, sensitive_0, sensitive_1, len(dfClinicalInformationOthers)))

    print(f'total: {len(dfClinicalInformationOthers)}')
    # One-off staging step (disabled):
    # for c in cancer:
    #     ori_path = glob.glob(f'{strDataPath}/*{c}/clinical_information.pkl')[0]
    #     shutil.copy(ori_path, f'./clinical_information/{c}_clinical_information.pkl')
    dfClinicalInformationOthers.to_csv(f'{strOutputDir}/{cancer}_geneticClassification.csv', index=False)

COAD
Label 0 count: 808, Label 1 count: 179
Sensitive female count: 494, Sensitive male: 493
(0, 0), count=390
(0, 1), count=418
(1, 0), count=104
(1, 1), count=75
total: 987
READ
Label 0 count: 335, Label 1 count: 13
Sensitive female count: 173, Sensitive male: 175
(0, 0), count=171
(0, 1), count=164
(1, 0), count=2
(1, 1), count=11
total: 348


In [8]:
# Build, summarise and export the race-stratified table for each cohort.
# `label` / `sensitive` values 0 and 1 are the LabelEncoder codes produced by
# getGeneticClassification.
sensitive = {"race": ["white", "black or african american"]}
cancers = ["COAD", "READ"]
data = []  # one summary tuple per cohort: (cancer, four group counts, label counts, sensitive counts, total)
strOutputDir = './datasets/race/geneticClassification'
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(strOutputDir, exist_ok=True)
for cancer in cancers:
    print(cancer)
    dfClinicalInformationOthers = getGeneticClassification(cancer, sensitive)
    if len(dfClinicalInformationOthers) < 1:
        # Nothing matched for this cohort: no summary row, no CSV.
        continue

    groups = dfClinicalInformationOthers.groupby(['label', 'sensitive'])
    count_0 = dfClinicalInformationOthers[dfClinicalInformationOthers['label'] == 0].shape[0]
    count_1 = dfClinicalInformationOthers[dfClinicalInformationOthers['label'] == 1].shape[0]
    sensitive_0 = dfClinicalInformationOthers[dfClinicalInformationOthers['sensitive'] == 0].shape[0]
    sensitive_1 = dfClinicalInformationOthers[dfClinicalInformationOthers['sensitive'] == 1].shape[0]
    print(f'Label 0 count: {count_0}, Label 1 count: {count_1}')
    print(f'Sensitive black count: {sensitive_0}, Sensitive white: {sensitive_1}')
    # Pre-seeded with zeros so a (label, sensitive) combination that does not
    # occur still reports a count of 0.
    tmp = {'(0, 0)': 0, '(0, 1)': 0, '(1, 0)': 0, '(1, 1)': 0}
    for group, df in groups:
        print(f'({group[0]}, {group[1]}), count={df.shape[0]}')
        tmp[f'({group[0]}, {group[1]})'] = df.shape[0]
    data.append((cancer, tmp['(0, 0)'], tmp['(0, 1)'], tmp['(1, 0)'], tmp['(1, 1)'], count_0, count_1, sensitive_0, sensitive_1, len(dfClinicalInformationOthers)))

    print(f'total: {len(dfClinicalInformationOthers)}')
    # One-off staging step (disabled):
    # for c in cancer:
    #     ori_path = glob.glob(f'{strDataPath}/*{c}/clinical_information.pkl')[0]
    #     shutil.copy(ori_path, f'./clinical_information/{c}_clinical_information.pkl')
    dfClinicalInformationOthers.to_csv(f'{strOutputDir}/{cancer}_geneticClassification.csv', index=False)

COAD
Label 0 count: 542, Label 1 count: 124
Sensitive black count: 134, Sensitive white: 532
(0, 0), count=114
(0, 1), count=428
(1, 0), count=20
(1, 1), count=104
total: 666
READ
Label 0 count: 190, Label 1 count: 8
Sensitive black count: 12, Sensitive white: 186
(0, 0), count=12
(0, 1), count=178
(1, 1), count=8
total: 198
