In [2]:
import numpy as np
import pandas as pd
import glob
from sklearn.preprocessing import LabelEncoder
import os
import shutil

In [4]:
# Root folder that holds one "<cancer>_tcga_pan_can_atlas_2018" directory per cohort.
strClinicalInformationPath = "./tcga_pan_cancer/"
# Folder that is searched for "*.pt" embedding files.
strEmbeddingPath = "./../Fairness-Foundation/AI_fairness/"
# NOTE(review): not referenced by any cell visible here; confirm it is still needed.
strDataPath = "./../nas/TCGA/Preprocessing/"

# Sensitive attribute: clinical column name -> category values to keep.
sensitive = {
    "Race Category": [
        "White",
        "Black or African American",
    ],
}

In [3]:
def fClinicalInformation(cancer, geneType, geneName):
    """Load one cohort's clinical table and attach the alteration label of one gene.

    Parameters
    ----------
    cancer : str
        Cohort prefix; the data is read from
        ``<strClinicalInformationPath><cancer>_tcga_pan_can_atlas_2018/``.
    geneType, geneName : str
        Together they select the label folder: the first CSV found under
        ``<cohort>/*/<geneType>_<geneName>*/`` is used.

    Returns
    -------
    pandas.DataFrame
        Clinical rows inner-joined with the label file on ``'Patient ID'``.
        ``'Patient ID'`` is renamed to ``'case_submitter_id'`` and ``'Altered'``
        to ``'label'``.

    Raises
    ------
    FileNotFoundError
        If the clinical TSV or the label CSV cannot be found.
    """
    cohortDir = f'{strClinicalInformationPath}{cancer}_tcga_pan_can_atlas_2018'

    def _firstMatch(pattern):
        # glob returns matches in arbitrary order; sorting makes the pick reproducible,
        # and an explicit error beats the bare IndexError of ``glob(...)[0]``.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(f'No file matches pattern: {pattern}')
        return matches[0]

    # Read the clinical table directly; concatenating it onto an empty frame added
    # nothing and relies on concat behaviour that pandas has deprecated.
    clinical = pd.read_csv(_firstMatch(f'{cohortDir}/clinical_data.tsv'), sep='\t')
    label = pd.read_csv(_firstMatch(f'{cohortDir}/*/{geneType}_{geneName}*/*.csv'))

    merged = pd.merge(clinical, label[['Patient ID', 'Altered']], on='Patient ID')
    return merged.rename(columns={'Patient ID': 'case_submitter_id', 'Altered': 'label'})

In [4]:
# Quick check of the loader on one cohort/gene pair: list the race categories present.
dfClinicalInformation = fClinicalInformation("acc", "Common Genes", "CTNNB1")
print(pd.unique(dfClinicalInformation['Race Category']))

['White' 'Black or African American' nan 'Asian']


In [44]:
def fReduceDataFrame(cancer, df, sensitive):
    """Keep only the id, sensitive-attribute and label columns.

    The sensitive column is the first key of ``sensitive``; it is renamed to
    ``'sensitive'`` and rows where it is missing are dropped. ``cancer`` is
    accepted for interface compatibility but not used.
    """
    sensitiveColumn = [*sensitive][0]
    reduced = df.loc[:, ['case_submitter_id', sensitiveColumn, 'label']]
    reduced = reduced.set_axis(['case_submitter_id', 'sensitive', 'label'], axis=1)
    return reduced.dropna(subset=['sensitive'])

In [41]:
def fTransSensitive(sensitive, df):
    """Keep the rows whose ``'sensitive'`` value contains one of the wanted strings.

    Parameters
    ----------
    sensitive : dict
        The value list of its first key holds the strings to look for.
    df : pandas.DataFrame
        Must have a ``'sensitive'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index and all columns.

    Notes
    -----
    Matching is by substring (``wanted in value``), not equality, so for
    example ``"Male"`` also matches ``"Female"``.
    """
    substrings = sensitive[list(sensitive.keys())[0]]
    # A typed boolean array keeps this a row filter even for an empty frame; a plain
    # empty list would be read by ``df[...]`` as "select no columns".
    mask = np.array(
        [any(wanted in value for wanted in substrings) for value in df['sensitive'].tolist()],
        dtype=bool,
    )
    return df.loc[mask]

In [46]:
def getGeneticClassification(cancerType, sensitive, geneType, geneName):
    """Build the per-embedding table of label and sensitive codes for one gene.

    Steps: load the clinical/label table, keep one row per patient, reduce it to
    id / sensitive / label, join it with the ``*.pt`` files found under
    ``strEmbeddingPath``, then integer-encode ``label`` and ``sensitive``.

    Parameters
    ----------
    cancerType : str
        Cohort prefix passed on to ``fClinicalInformation``.
    sensitive : dict
        Single-entry mapping: clinical column name -> values to keep.
    geneType, geneName : str
        Select the label file (see ``fClinicalInformation``).

    Returns
    -------
    pandas.DataFrame
        Columns ``case_submitter_id``, ``sensitive``, ``label``, ``folder_id``;
        one row per matching embedding file. It is empty when no embedding
        matches a patient or no row has a wanted sensitive value.
    """
    clinical = fClinicalInformation(cancerType, geneType, geneName)
    clinical = clinical.drop_duplicates(subset='case_submitter_id', ignore_index=True)
    clinical = fReduceDataFrame(cancerType, clinical, sensitive)

    # File stem = folder id; its first 12 characters are used as the patient id
    # (presumably the TCGA patient barcode; confirm against the file naming).
    lsDownloadPath = glob.glob(os.path.join(strEmbeddingPath, '*.pt'))
    lsDownloadFoldID = [os.path.splitext(os.path.basename(path))[0] for path in lsDownloadPath]
    dfDownload = pd.DataFrame({
        'case_submitter_id': [foldId[:12] for foldId in lsDownloadFoldID],
        'folder_id': lsDownloadFoldID,
    })
    dfMerged = pd.merge(clinical, dfDownload, on='case_submitter_id')

    # Every non-empty result is filtered and encoded. The former ``len(...) > 1`` check
    # let a single-row result through unfiltered and with raw string values.
    if dfMerged.empty:
        return dfMerged

    # Labels are encoded before the sensitive filter, as before, so the codes are
    # derived from all rows that have an embedding.
    dfMerged['label'] = LabelEncoder().fit_transform(dfMerged['label'].values)

    dfMerged = fTransSensitive(sensitive, dfMerged).reset_index(drop=True)
    if not dfMerged.empty:
        # LabelEncoder orders classes by sorted value, so code 0 is the first value
        # in sort order among those that remain.
        dfMerged['sensitive'] = LabelEncoder().fit_transform(dfMerged['sensitive'].values)

    return dfMerged

In [51]:
sensitive = {"Race Category": ["White", "Black or African American"]}
strOutputDir = './datasets/race/geneticClassification/'
# DataFrame.to_csv does not create missing parent folders, so make sure the target exists.
os.makedirs(strOutputDir, exist_ok=True)
# Sub-folder name inside a cohort directory -> gene-type prefix passed to getGeneticClassification.
dictGeneTypeByFolder = {'Common Genes': 'Common Genes', 'Targeted Drugs for Genes': 'Mutated Genes'}

for cancers in os.listdir(strClinicalInformationPath):
    cancer = cancers.split('_')[0]
    strCohortDir = f'{strClinicalInformationPath}{cancer}_tcga_pan_can_atlas_2018/'
    if not os.path.isdir(strCohortDir):
        continue
    for types in os.listdir(strCohortDir):
        strTypeDir = f'{strCohortDir}{types}/'
        # Only the two known gene-type folders are processed. Any other entry used to
        # inherit geneType from a previous iteration (or hit a NameError on the first one).
        if types not in dictGeneTypeByFolder or not os.path.isdir(strTypeDir):
            continue
        geneType = dictGeneTypeByFolder[types]
        for name in os.listdir(strTypeDir):
            # Gene folders are named "<geneType>_<geneName>..."; skip anything else.
            lsNameParts = name.split('-')[0].split('_')
            if len(lsNameParts) < 2 or not os.path.isdir(f'{strTypeDir}{name}'):
                continue
            geneName = lsNameParts[1]
            dfClinicalInformationOthers = getGeneticClassification(cancer, sensitive, geneType, geneName)
            if len(dfClinicalInformationOthers) < 1:
                continue

            print(f'cancer: {cancer}, geneType: {geneType}, geneName: {geneName}')
            count_0 = int((dfClinicalInformationOthers['label'] == 0).sum())
            count_1 = int((dfClinicalInformationOthers['label'] == 1).sum())
            sensitive_0 = int((dfClinicalInformationOthers['sensitive'] == 0).sum())
            sensitive_1 = int((dfClinicalInformationOthers['sensitive'] == 1).sum())
            print(f'Label 0 count: {count_0}, Label 1 count: {count_1}')
            print(f'Sensitive 0 count: {sensitive_0}, Sensitive 1: {sensitive_1}')
            # One line per (label, sensitive) combination that actually occurs.
            for (labelCode, sensitiveCode), dfGroup in dfClinicalInformationOthers.groupby(['label', 'sensitive']):
                print(f'({labelCode}, {sensitiveCode}), count={dfGroup.shape[0]}')

            print(f'total: {len(dfClinicalInformationOthers)}')
            dfClinicalInformationOthers.to_csv(f'{strOutputDir}{cancer}_{geneType}_{geneName}.csv', index=False)

cancer: lusc, geneType: Mutated Genes, geneName: RET
Label 0 count: 1093, Label 1 count: 36
Sensitive 0 count: 89, Sensitive 1: 1040
(0, 0), count=89
(0, 1), count=1004
(1, 1), count=36
total: 1129
cancer: lusc, geneType: Mutated Genes, geneName: BRAF
Label 0 count: 1089, Label 1 count: 40
Sensitive 0 count: 89, Sensitive 1: 1040
(0, 0), count=86
(0, 1), count=1003
(1, 0), count=3
(1, 1), count=37
total: 1129
cancer: lusc, geneType: Mutated Genes, geneName: ALK
Label 0 count: 1087, Label 1 count: 42
Sensitive 0 count: 89, Sensitive 1: 1040
(0, 0), count=83
(0, 1), count=1004
(1, 0), count=6
(1, 1), count=36
total: 1129
cancer: lusc, geneType: Mutated Genes, geneName: MET
Label 0 count: 1105, Label 1 count: 24
Sensitive 0 count: 89, Sensitive 1: 1040
(0, 0), count=89
(0, 1), count=1016
(1, 1), count=24
total: 1129
cancer: lusc, geneType: Mutated Genes, geneName: EGFR
Label 0 count: 1031, Label 1 count: 98
Sensitive 0 count: 89, Sensitive 1: 1040
(0, 0), count=80
(0, 1), count=951
(1, 0)

In [53]:
sensitive = {"Sex": ["Female", "Male"]}
strOutputDir = './datasets/gender/geneticClassification/'
# DataFrame.to_csv does not create missing parent folders, so make sure the target exists.
os.makedirs(strOutputDir, exist_ok=True)
# Sub-folder name inside a cohort directory -> gene-type prefix passed to getGeneticClassification.
dictGeneTypeByFolder = {'Common Genes': 'Common Genes', 'Targeted Drugs for Genes': 'Mutated Genes'}

for cancers in os.listdir(strClinicalInformationPath):
    cancer = cancers.split('_')[0]
    strCohortDir = f'{strClinicalInformationPath}{cancer}_tcga_pan_can_atlas_2018/'
    if not os.path.isdir(strCohortDir):
        continue
    for types in os.listdir(strCohortDir):
        strTypeDir = f'{strCohortDir}{types}/'
        # Only the two known gene-type folders are processed. Any other entry used to
        # inherit geneType from a previous iteration (or hit a NameError on the first one).
        if types not in dictGeneTypeByFolder or not os.path.isdir(strTypeDir):
            continue
        geneType = dictGeneTypeByFolder[types]
        for name in os.listdir(strTypeDir):
            # Gene folders are named "<geneType>_<geneName>..."; skip anything else.
            lsNameParts = name.split('-')[0].split('_')
            if len(lsNameParts) < 2 or not os.path.isdir(f'{strTypeDir}{name}'):
                continue
            geneName = lsNameParts[1]
            dfClinicalInformationOthers = getGeneticClassification(cancer, sensitive, geneType, geneName)
            if len(dfClinicalInformationOthers) < 1:
                continue

            print(f'cancer: {cancer}, geneType: {geneType}, geneName: {geneName}')
            count_0 = int((dfClinicalInformationOthers['label'] == 0).sum())
            count_1 = int((dfClinicalInformationOthers['label'] == 1).sum())
            sensitive_0 = int((dfClinicalInformationOthers['sensitive'] == 0).sum())
            sensitive_1 = int((dfClinicalInformationOthers['sensitive'] == 1).sum())
            print(f'Label 0 count: {count_0}, Label 1 count: {count_1}')
            print(f'Sensitive 0 count: {sensitive_0}, Sensitive 1: {sensitive_1}')
            # One line per (label, sensitive) combination that actually occurs.
            for (labelCode, sensitiveCode), dfGroup in dfClinicalInformationOthers.groupby(['label', 'sensitive']):
                print(f'({labelCode}, {sensitiveCode}), count={dfGroup.shape[0]}')

            print(f'total: {len(dfClinicalInformationOthers)}')
            dfClinicalInformationOthers.to_csv(f'{strOutputDir}{cancer}_{geneType}_{geneName}.csv', index=False)

cancer: lusc, geneType: Mutated Genes, geneName: RET
Label 0 count: 1496, Label 1 count: 54
Sensitive 0 count: 406, Sensitive 1: 1144
(0, 0), count=396
(0, 1), count=1100
(1, 0), count=10
(1, 1), count=44
total: 1550
cancer: lusc, geneType: Mutated Genes, geneName: BRAF
Label 0 count: 1497, Label 1 count: 53
Sensitive 0 count: 406, Sensitive 1: 1144
(0, 0), count=384
(0, 1), count=1113
(1, 0), count=22
(1, 1), count=31
total: 1550
cancer: lusc, geneType: Mutated Genes, geneName: ALK
Label 0 count: 1478, Label 1 count: 72
Sensitive 0 count: 406, Sensitive 1: 1144
(0, 0), count=391
(0, 1), count=1087
(1, 0), count=15
(1, 1), count=57
total: 1550
cancer: lusc, geneType: Mutated Genes, geneName: MET
Label 0 count: 1492, Label 1 count: 58
Sensitive 0 count: 406, Sensitive 1: 1144
(0, 0), count=384
(0, 1), count=1108
(1, 0), count=22
(1, 1), count=36
total: 1550
cancer: lusc, geneType: Mutated Genes, geneName: EGFR
Label 0 count: 1405, Label 1 count: 145
Sensitive 0 count: 406, Sensitive 1: 

In [7]:
# NOTE(review): fTransSensitive keeps rows whose value CONTAINS one of these strings. None of
# the race categories printed earlier in this notebook contains "non-White", so presumably only
# "White" rows survive. Confirm whether all other categories were meant to be grouped together.
sensitive = {"Race Category": ["White", "non-White"]}
# NOTE(review): same folder and file names as the White / Black or African American export,
# so running this cell overwrites those CSVs. Confirm that this is intended.
strOutputDir = './datasets/race/geneticClassification/'
# DataFrame.to_csv does not create missing parent folders, so make sure the target exists.
os.makedirs(strOutputDir, exist_ok=True)
# Sub-folder name inside a cohort directory -> gene-type prefix passed to getGeneticClassification.
dictGeneTypeByFolder = {'Common Genes': 'Common Genes', 'Targeted Drugs for Genes': 'Mutated Genes'}

for cancers in os.listdir(strClinicalInformationPath):
    cancer = cancers.split('_')[0]
    strCohortDir = f'{strClinicalInformationPath}{cancer}_tcga_pan_can_atlas_2018/'
    if not os.path.isdir(strCohortDir):
        continue
    for types in os.listdir(strCohortDir):
        strTypeDir = f'{strCohortDir}{types}/'
        # Only the two known gene-type folders are processed. Any other entry used to
        # inherit geneType from a previous iteration (or hit a NameError on the first one).
        if types not in dictGeneTypeByFolder or not os.path.isdir(strTypeDir):
            continue
        geneType = dictGeneTypeByFolder[types]
        for name in os.listdir(strTypeDir):
            # Gene folders are named "<geneType>_<geneName>..."; skip anything else.
            lsNameParts = name.split('-')[0].split('_')
            if len(lsNameParts) < 2 or not os.path.isdir(f'{strTypeDir}{name}'):
                continue
            geneName = lsNameParts[1]
            dfClinicalInformationOthers = getGeneticClassification(cancer, sensitive, geneType, geneName)
            if len(dfClinicalInformationOthers) < 1:
                continue

            print(f'cancer: {cancer}, geneType: {geneType}, geneName: {geneName}')
            count_0 = int((dfClinicalInformationOthers['label'] == 0).sum())
            count_1 = int((dfClinicalInformationOthers['label'] == 1).sum())
            sensitive_0 = int((dfClinicalInformationOthers['sensitive'] == 0).sum())
            sensitive_1 = int((dfClinicalInformationOthers['sensitive'] == 1).sum())
            print(f'Label 0 count: {count_0}, Label 1 count: {count_1}')
            print(f'Sensitive 0 count: {sensitive_0}, Sensitive 1: {sensitive_1}')
            # One line per (label, sensitive) combination that actually occurs.
            for (labelCode, sensitiveCode), dfGroup in dfClinicalInformationOthers.groupby(['label', 'sensitive']):
                print(f'({labelCode}, {sensitiveCode}), count={dfGroup.shape[0]}')

            print(f'total: {len(dfClinicalInformationOthers)}')
            dfClinicalInformationOthers.to_csv(f'{strOutputDir}{cancer}_{geneType}_{geneName}.csv', index=False)

AttributeError: 'str' object has no attribute 'shape'