# Set up

In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf

# Seed every random source used in this notebook with one shared value.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
# so assigning it here probably only affects child processes — confirm.
seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

# Chi Square test

## 2D CXR

In [None]:
# Finding names; each one is used as a feature key when reading the records.
Labels_diseases = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Enlarged Cardiomediastinum',
    'Fracture',
    'Lung Lesion',
    'Lung Opacity',
    'No Finding',
    'Pleural Effusion',
    'Pleural Other',
    'Pneumonia',
    'Pneumothorax',
    'Support Devices',
]

def get_data_chi2test(diseases, types):
    """Count records per (attribute group, label) for a chi-square test.

    Reads the train, test and validation TFRecord files and builds a
    contingency table whose rows are the groups of the chosen attribute and
    whose two columns are [label != 1, label == 1] counts for `diseases`.

    Args:
        diseases: Feature key of the finding to tabulate; its first
            `float_list` value is compared against 1.
        types: One of 'race' (3 rows), 'gender' (2 rows) or 'age' (4 rows).

    Returns:
        An integer `np.ndarray` of shape (rows, 2), or `None` (after printing
        a message) when `types` is not a supported category.
    """
    n_groups = {'race': 3, 'gender': 2, 'age': 4}.get(types)
    if n_groups is None:
        print('No match category')
        return

    # One row per attribute group; columns are [negative, positive].
    df = [[0, 0] for _ in range(n_groups)]

    filename = ['data/mimic_train.tfrecords', 'data/mimic_test.tfrecords', 'data/mimic_val.tfrecords']

#     filename = ['data/Chexpert_train.tfrecords', 'data/Chexpert_test.tfrecords', 'data/Chexpert_val.tfrecords']

    raw_dataset = tf.data.TFRecordDataset(filename)
    for raw_record in raw_dataset:

        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())
        feature = example.features.feature

        # Only an exact 1 counts as positive; every other value is negative.
        label = 1 if feature[diseases].float_list.value[0] == 1 else 0

        if (types == 'race'):
            race = feature['race'].int64_list.value[0]
            if (race == 4):
                # Race code 4 is stored in the third row.
                race = 2
            elif (race not in (0, 1)):
                # Skip any other code, as get_data_permutaitontest does.
                # Previously code 2 was pooled with code 4 and larger codes
                # raised IndexError.
                continue
            df[race][label] += 1
        elif (types == 'gender'):
            gender = feature['gender'].int64_list.value[0]
            df[gender][label] += 1
        else:
            age = feature['age'].int64_list.value[0]
            # Codes above 1 shift down by one, so codes 1 and 2 share a row.
            # NOTE(review): get_data_permutaitontest pools codes 0 and 1
            # instead — confirm which binning is intended.
            if (age > 1):
                age -= 1
            df[age][label] += 1

    return np.array(df)

In [None]:
from scipy.stats import chi2_contingency, chisquare

# Chi-square test of independence between age group and each finding.
for i in Labels_diseases:
    obs = get_data_chi2test(i, 'age')

    # chi2_contingency already returns the p-value. The former
    # `(chi2, p) == chisquare(...)` statement only evaluated a comparison
    # whose result was discarded, so it has been dropped.
    chi2, p, dof, ex = chi2_contingency(obs)
    print(i, p)

In [None]:
from scipy.stats import chi2_contingency, chisquare

# Chi-square test of independence between gender and each finding.
for i in Labels_diseases:
    obs = get_data_chi2test(i, 'gender')

    # chi2_contingency already returns the p-value. The former
    # `(chi2, p) == chisquare(...)` statement only evaluated a comparison
    # whose result was discarded, so it has been dropped.
    chi2, p, dof, ex = chi2_contingency(obs)
    print(i, p)

In [None]:
from scipy.stats import chi2_contingency, chisquare

# Chi-square test of independence between race group and each finding.
for i in Labels_diseases:
    obs = get_data_chi2test(i, 'race')

    # chi2_contingency already returns the p-value. The former
    # `(chi2, p) == chisquare(...)` statement only evaluated a comparison
    # whose result was discarded, so it has been dropped.
    chi2, p, dof, ex = chi2_contingency(obs)
    print(i, p)

## 3D MRI

In [None]:
# Load the MRI metadata table and binarise the columns used below.
df = pd.read_csv('data_new.csv')
data_path = '../../../mnt/usb/kuopc/ADNI_B1/MPR__GradWarp__B1_Correction_crop/'

# Drop MCI rows. The explicit copy makes the column assignments below write
# to an independent frame instead of a slice of the original
# (avoids pandas' SettingWithCopyWarning).
df = df.loc[df['Group'] != 'MCI'].copy()

df['Group'] = df['Group'].replace(['CN', 'AD'], [0, 1])  # CN -> 0, AD -> 1
df['Sex'] = df['Sex'].replace(['F', 'M'], [0, 1])  # F -> 0, M -> 1
df['Age'] = np.where(df['Age'] <= 75, 0, 1)  # 0 when age <= 75, otherwise 1
df['Race'] = np.where(df['Race'] < 1, 0, 1)  # 0 when the code is below 1, otherwise 1

In [None]:
from scipy.stats import chi2_contingency, chisquare

# Sex-by-Group contingency table (rows: Sex 0 and 1).
# value_counts() orders its output by frequency, so building each row from
# `.value_counts().values` could put the Group counts in a different column
# order per row, or give rows of unequal length when a class is absent.
# crosstab keeps the columns aligned on the Group value for both rows.
obs = pd.crosstab(df['Sex'], df['Group']).reindex(index=[0, 1], fill_value=0).values

# chi2_contingency already returns the p-value. The former
# `(chi2, p) == chisquare(...)` statement only evaluated a comparison whose
# result was discarded, so it has been dropped.
chi2, p, dof, ex = chi2_contingency(obs)
print(p)

In [None]:
from scipy.stats import chi2_contingency, chisquare

# Age-by-Group contingency table (rows: Age 0 and 1).
# value_counts() orders its output by frequency, so building each row from
# `.value_counts().values` could put the Group counts in a different column
# order per row, or give rows of unequal length when a class is absent.
# crosstab keeps the columns aligned on the Group value for both rows.
obs = pd.crosstab(df['Age'], df['Group']).reindex(index=[0, 1], fill_value=0).values

# chi2_contingency already returns the p-value. The former
# `(chi2, p) == chisquare(...)` statement only evaluated a comparison whose
# result was discarded, so it has been dropped.
chi2, p, dof, ex = chi2_contingency(obs)
print(p)

# Permutation test

## 2D CXR

In [None]:
def get_data_permutaitontest(data='test', types='race'):
    """Load one-hot attribute vectors and binary finding labels.

    Args:
        data: 'train' reads the training TFRecord file; any other value
            reads the test file.
        types: Attribute to encode: 'race', 'age' or 'gender'. Any other
            value causes every record to be skipped.

    Returns:
        A tuple `(x, y)` of NumPy arrays. `x` holds one one-hot row per kept
        record and `y` the matching row of ten 0/1 labels. Records whose
        attribute code has no encoding are left out of both arrays.
    """
    # Chexpert equivalents: 'data/Chexpert_train.tfrecords' and
    # 'data/Chexpert_test.tfrecords'.
    if data == 'train':
        filename = ['data/mimic_train.tfrecords']
    else:
        filename = ['data/mimic_test.tfrecords']

    # Findings used as targets, in output column order. 'Fracture',
    # 'Lung Lesion', 'Pleural Other' and 'Support Devices' are not included.
    target_names = (
        'Atelectasis',
        'Cardiomegaly',
        'Consolidation',
        'Edema',
        'Enlarged Cardiomediastinum',
        'Lung Opacity',
        'No Finding',
        'Pleural Effusion',
        'Pneumonia',
        'Pneumothorax',
    )

    # Attribute code -> one-hot row; codes missing from a table are skipped.
    encodings = {
        'race': {0: [1, 0, 0], 1: [0, 1, 0], 4: [0, 0, 1]},
        # Age codes 0 and 1 share the first slot.
        'age': {
            0: [1, 0, 0, 0],
            1: [1, 0, 0, 0],
            2: [0, 1, 0, 0],
            3: [0, 0, 1, 0],
            4: [0, 0, 0, 1],
        },
        'gender': {0: [1, 0], 1: [0, 1]},
    }

    x = []
    y = []
    for raw_record in tf.data.TFRecordDataset(filename):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())
        feature = example.features.feature

        attributes = {
            'race': feature['race'].int64_list.value[0],
            'age': feature['age'].int64_list.value[0],
            'gender': feature['gender'].int64_list.value[0],
        }

        # Only an exact 1 counts as positive; every other value becomes 0.
        label = [
            1 if feature[name].float_list.value[0] == 1 else 0
            for name in target_names
        ]

        if types not in encodings:
            continue
        one_hot = encodings[types].get(attributes[types])
        if one_hot is None:
            continue

        x.append(one_hot)
        y.append(label)

    return np.array(x), np.array(y)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

Labels_diseases = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pneumonia', 'Pneumothorax']

# Fit one logistic regression per finding using only the one-hot race vector.
x_train, y_train = get_data_permutaitontest('train', 'race')

clf = OneVsRestClassifier(LogisticRegression(random_state=2021)).fit(x_train, y_train)

x_test, y_test = get_data_permutaitontest('test', 'race')

preds = clf.predict_proba(x_test)

# Per-label threshold: the median of the distinct predicted probabilities.
# Thresholds are computed once per label and applied to the whole column,
# instead of calling np.median for every sample/label pair, and the label
# count comes from the prediction array rather than a hard-coded 10.
num = [np.unique(preds[:, i]) for i in range(preds.shape[1])]
thresholds = np.array([np.median(values) for values in num])
preds = (preds >= thresholds).astype(preds.dtype)

result_race = classification_report(y_test, preds, output_dict=True)

print(classification_report(y_test, preds))

In [None]:
# NOTE(review): this 14-name list is not used in this cell, and the label
# arrays returned by get_data_permutaitontest have 10 columns — confirm
# whether the reassignment is still wanted.
Labels_diseases = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']

# Fit one logistic regression per finding using only the one-hot age vector.
x_train, y_train = get_data_permutaitontest('train', 'age')

clf = OneVsRestClassifier(LogisticRegression(random_state=2021)).fit(x_train, y_train)

x_test, y_test = get_data_permutaitontest('test', 'age')

preds = clf.predict_proba(x_test)

# Per-label threshold: the median of the distinct predicted probabilities.
# Thresholds are computed once per label and applied to the whole column,
# instead of calling np.median for every sample/label pair, and the label
# count comes from the prediction array rather than a hard-coded 10.
num = [np.unique(preds[:, i]) for i in range(preds.shape[1])]
thresholds = np.array([np.median(values) for values in num])
preds = (preds >= thresholds).astype(preds.dtype)

result_age = classification_report(y_test, preds, output_dict=True)

print(classification_report(y_test, preds))

In [None]:
# NOTE(review): this 14-name list is not used in this cell, and the label
# arrays returned by get_data_permutaitontest have 10 columns — confirm
# whether the reassignment is still wanted.
Labels_diseases = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']

# Fit one logistic regression per finding using only the one-hot gender vector.
x_train, y_train = get_data_permutaitontest('train', 'gender')

clf = OneVsRestClassifier(LogisticRegression(random_state=2021)).fit(x_train, y_train)

x_test, y_test = get_data_permutaitontest('test', 'gender')

preds = clf.predict_proba(x_test)

# Per-label threshold: the median of the distinct predicted probabilities.
# Thresholds are computed once per label and applied to the whole column,
# instead of calling np.median for every sample/label pair, and the label
# count comes from the prediction array rather than a hard-coded 10.
num = [np.unique(preds[:, i]) for i in range(preds.shape[1])]
thresholds = np.array([np.median(values) for values in num])
preds = (preds >= thresholds).astype(preds.dtype)

result_gender = classification_report(y_test, preds, output_dict=True)

print(classification_report(y_test, preds))

In [None]:
# Build the permutation null distribution for the CXR test labels.
from sklearn.metrics import f1_score
# Re-seed so the sequence of shuffles below is reproducible.
np.random.seed(2021)
# Called with the default types='race', so only records whose race code has
# an encoding are kept.
# NOTE(review): the age and gender baselines are scored on differently
# filtered test sets — confirm this null distribution is meant for all three.
x_test, y_test = get_data_permutaitontest('test')
length = len(y_test)
# One classification report (as a dict) per permutation.
a = []
for i in range(100000):
    # Shuffle a copy so y_test itself stays in its original order;
    # np.random.shuffle permutes along the first axis, i.e. whole label rows.
    guess = np.copy(y_test)
    np.random.shuffle(guess)

    # Score the shuffled labels against the true ones as if they were predictions.
    a.append(classification_report(y_test, guess, output_dict=True))

In [None]:
# Imported here because this cell runs before the later cell that imports
# matplotlib; without it `plt` is undefined when the file runs top to bottom.
import matplotlib.pyplot as plt

# Histogram of weighted-average F1 over the permutation reports, with the
# 95th percentile and the three attribute baselines marked.
plt.figure(figsize=(6, 4), dpi=250)

b = [report['weighted avg']['f1-score'] for report in a]
c = plt.hist(b, bins=100, color='limegreen')
plt.axvline(np.percentile(b, 95), color='r', linestyle='dashed', linewidth=1, label='95 percentile')
plt.axvline(result_race['weighted avg']['f1-score'], color='g', linestyle='dashed', linewidth=1, label='Race')
plt.axvline(result_age['weighted avg']['f1-score'], color='b', linestyle='dashed', linewidth=1, label='Age')
plt.axvline(result_gender['weighted avg']['f1-score'], color='y', linestyle='dashed', linewidth=1, label='Gender')
plt.legend(bbox_to_anchor=(1, 0.7))
plt.show()

In [None]:
# Imported here because this cell runs before the later cell that imports
# matplotlib; without it `plt` is undefined when the file runs top to bottom.
import matplotlib.pyplot as plt

# Histogram of weighted-average F1 over the permutation reports, with the
# 95th percentile and the three attribute baselines marked.
plt.figure(figsize=(6, 4), dpi=250)

b = [report['weighted avg']['f1-score'] for report in a]
c = plt.hist(b, bins=100, color='limegreen')
plt.axvline(np.percentile(b, 95), color='r', linestyle='dashed', linewidth=1, label='95 percentile')
plt.axvline(result_race['weighted avg']['f1-score'], color='g', linestyle='dashed', linewidth=1, label='Race')
plt.axvline(result_age['weighted avg']['f1-score'], color='b', linestyle='dashed', linewidth=1, label='Age')
plt.axvline(result_gender['weighted avg']['f1-score'], color='y', linestyle='dashed', linewidth=1, label='Gender')
plt.legend(bbox_to_anchor=(1, 0.7))
plt.show()

In [None]:
import matplotlib.pyplot as plt

# One histogram of permuted F1 scores per entry of the classification report.
# NOTE(review): the reference lines always use the baselines' 'weighted avg'
# F1, even when the histogram shows another report entry — confirm intent.
for i in list(a[0]):
    print(i)
    b = [report[i]['f1-score'] for report in a]
    c = plt.hist(b, bins=100, color='limegreen')
    plt.axvline(np.percentile(b, 95), color='r', linestyle='dashed', linewidth=1)
    plt.axvline(result_race['weighted avg']['f1-score'], color='g', linestyle='dashed', linewidth=1)
    plt.axvline(result_age['weighted avg']['f1-score'], color='b', linestyle='dashed', linewidth=1)
    plt.axvline(result_gender['weighted avg']['f1-score'], color='y', linestyle='dashed', linewidth=1)
    plt.show()

## 3D MRI

In [None]:
# Load the MRI metadata table and binarise the columns used below.
df = pd.read_csv('data_new.csv')
data_path = '../../../mnt/usb/kuopc/ADNI_B1/MPR__GradWarp__B1_Correction_crop/'

# Drop MCI rows. The explicit copy makes the column assignments below write
# to an independent frame instead of a slice of the original
# (avoids pandas' SettingWithCopyWarning).
df = df.loc[df['Group'] != 'MCI'].copy()

df['Group'] = df['Group'].replace(['CN', 'AD'], [0, 1])  # CN -> 0, AD -> 1
df['Sex'] = df['Sex'].replace(['F', 'M'], [0, 1])  # F -> 0, M -> 1
df['Age'] = np.where(df['Age'] <= 75, 0, 1)  # 0 when age <= 75, otherwise 1
df['Race'] = np.where(df['Race'] < 1, 0, 1)  # 0 when the code is below 1, otherwise 1

In [None]:
from sklearn.metrics import classification_report

# Split the metadata by the 'Split' column.
df_train = df[df['Split'] == 'train']
df_test = df[df['Split'] == 'test']

# The binarised Sex column is used directly as the "prediction" of Group.
preds = df_test['Sex'].to_numpy()

result_sex = classification_report(df_test['Group'].to_numpy(), preds, output_dict=True)

print(classification_report(df_test['Group'].to_numpy(), preds))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Split the metadata by the 'Split' column.
df_train = df[df['Split'] == 'train']
df_test = df[df['Split'] == 'test']

# The binarised Age column is used directly as the "prediction" of Group.
preds = df_test['Age'].to_numpy()

result_age = classification_report(df_test['Group'].to_numpy(), preds, output_dict=True)

print(classification_report(df_test['Group'].to_numpy(), preds))

In [None]:
# Build the permutation null distribution for the MRI test labels.
from sklearn.metrics import f1_score
# Re-seed so the sequence of shuffles below is reproducible.
np.random.seed(2021)

# One classification report (as a dict) per permutation.
a = []
for i in range(100000):
    # Shuffle a copy of the Group labels so df_test itself is left untouched.
    guess = np.copy(df_test['Group'].values)
    np.random.shuffle(guess)

    # Score the shuffled labels against the true ones as if they were predictions.
    a.append(classification_report(df_test['Group'].values, guess, output_dict=True))
    
    

In [None]:
import matplotlib.pyplot as plt

# One histogram of permuted F1 scores per entry of the classification report.
# NOTE(review): the reference lines always use the baselines' 'weighted avg'
# F1, even when the histogram shows another report entry — confirm intent.
for i in list(a[0]):
    # 'accuracy' is skipped; presumably it has no 'f1-score' field — confirm.
    if i != 'accuracy':
        print(i)
        b = [report[i]['f1-score'] for report in a]
        c = plt.hist(b, bins=100, color='limegreen')
        plt.axvline(np.percentile(b, 95), color='r', linestyle='dashed', linewidth=1)
        plt.axvline(result_age['weighted avg']['f1-score'], color='b', linestyle='dashed', linewidth=1)
        plt.axvline(result_sex['weighted avg']['f1-score'], color='y', linestyle='dashed', linewidth=1)
        plt.show()