# Device Dataset

In this notebook we construct the device dataset from the CheXpert dataset.

First you should download the dataset from <a href="https://www.kaggle.com/datasets/willarevalo/chexpert-v10-small">here</a>.

In [1]:
import os
import sys
# Make the repository root (one level above this notebook) importable.
project_root = '../'
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import numpy as np
import pandas as pd
import random
import os
import shutil
import csv
from sklearn.model_selection import train_test_split
from PIL import Image
import matplotlib.pyplot as plt

In [4]:
# Load the CheXpert training metadata, keep only rows whose view is
# 'Frontal', substitute missing values with 0, and renumber the rows from 0.
train_df = (
    pd.read_csv('/usr/local/faststorage/datasets/chexpert/train.csv')
    .loc[lambda frame: frame['Frontal/Lateral'] == 'Frontal']
    .fillna(0)
    .reset_index(drop=True)
)

Before constructing the dataset, we check the sizes of the four subgroups.

In [7]:
# The two labels that define the classes of the device dataset.
disease = 'Pleural Effusion'
disease2 = 'No Finding'

# One subgroup per (label, support devices) combination:
#   g0: disease  with devices      g1: disease2 with devices
#   g2: disease  without devices   g3: disease2 without devices
g0 = train_df[(train_df[disease] == 1) & (train_df['Support Devices'] == 1)]
g1 = train_df[(train_df[disease2] == 1) & (train_df['Support Devices'] == 1)]
g2 = train_df[(train_df[disease] == 1) & (train_df['Support Devices'] == 0)]
g3 = train_df[(train_df[disease2] == 1) & (train_df['Support Devices'] == 0)]

# Print the four subgroup sizes on a single line.
print(*map(len, (g0, g1, g2, g3)))

49917 7421 26643 9474


Next we keep only the rows labelled 'No Finding' or 'Pleural Effusion', add a helper 'Healthy/Unhealthy' column, and save the result to a CSV file.

In [9]:
import pandas as pd

# Location of the metadata CSV that the following cells create and update.
md_path = '../datasets/device/info_md.csv'
# Create parent directories for the file if they do not exist
os.makedirs(os.path.dirname(md_path), exist_ok=True)
# Work on an independent copy: columns added to `chexpert_data` must not
# leak back into `train_df`, which earlier cells have already inspected.
chexpert_data = train_df.copy()

# Define a function to determine the health status with the updated criteria
def determine_health_status_final(row):
    """Map one metadata row to its health label.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'No Finding', 'Pleural Effusion' and 'Frontal/Lateral'.

    Returns:
        1 (healthy) for a frontal row with 'No Finding' == 1.0,
        -1 (unhealthy) for a frontal row with 'Pleural Effusion' == 1.0,
        None otherwise. 'No Finding' is checked first, so it wins when
        both columns are 1.0.
    """
    # (column, label) pairs in priority order.
    for column, label in (('No Finding', 1), ('Pleural Effusion', -1)):
        if row[column] == 1.0 and row['Frontal/Lateral'] == 'Frontal':
            return label
    return None


# Label every row as healthy (1), unhealthy (-1) or unlabelled (missing).
chexpert_data['Healthy/Unhealthy'] = chexpert_data.apply(determine_health_status_final, axis=1)

# Keep only the labelled rows and the metadata columns of interest.
filtered_data_final = chexpert_data.loc[
    chexpert_data['Healthy/Unhealthy'].notna(),
    ['Path', 'Sex', 'Age', 'Frontal/Lateral', 'AP/PA', 'Support Devices', 'Healthy/Unhealthy'],
]

# Write the filtered metadata to disk and report where it went.
output_file_path_final = md_path
filtered_data_final.to_csv(output_file_path_final, index=False)
print("CSV file has been created at:", output_file_path_final)

CSV file has been created at: ../datasets/device/info_md.csv


# Grouping

Here we define our groupings:

- group 0: This group denotes the subjects with Pleural Effusion and support devices. (Contains 90% of the unhealthy subjects)
- group 1: This group denotes the subjects with No Finding and support devices. (Contains 10% of the healthy subjects)
- group 2: This group denotes the subjects with Pleural Effusion and No support devices. (Contains 10% of the unhealthy subjects)
- group 3: This group denotes the subjects with No Finding and No support devices. (Contains 90% of the healthy subjects)

In [10]:
import pandas as pd
import numpy as np

# Load the filtered metadata CSV
file_path = md_path  # Replace with the path to your filtered CSV file
data = pd.read_csv(file_path)

# Boolean masks for the two attributes that define a group.
unhealthy = data['Healthy/Unhealthy'] == -1
healthy = data['Healthy/Unhealthy'] == 1
with_devices = data['Support Devices'] == 1
without_devices = data['Support Devices'] == 0

# Assign the group id in one vectorized step (the masks are mutually
# exclusive, so their order does not matter):
#   0: unhealthy with support devices
#   1: healthy with support devices
#   2: unhealthy without support devices
#   3: healthy without support devices
# Rows matching none of the combinations (e.g. a 'Support Devices' value
# other than 0 or 1) get NaN; the NaN default also keeps the column float.
data['group'] = np.select(
    [
        unhealthy & with_devices,
        healthy & with_devices,
        unhealthy & without_devices,
        healthy & without_devices,
    ],
    [0, 1, 2, 3],
    default=np.nan,
)

# Drop the rows that belong to no group. This is done once, after the
# assignment, instead of removing rows from the frame while iterating over it.
data = data.dropna(subset=['group'])

# Save the updated DataFrame back to the metadata CSV
updated_csv_path = md_path  # Replace with your desired output file path
data.to_csv(updated_csv_path, index=False)

print("Updated CSV file saved to:", updated_csv_path)


Updated CSV file saved to: ../datasets/device/info_md.csv


Here we modify the path column. There is no specific reason for this; it is just a personal preference of the author.

In [11]:
import pandas as pd

# Reload the metadata CSV stored at md_path so this cell starts from the file on disk
file_path = md_path # Replace with the path to your CSV file
data = pd.read_csv(file_path)

# Function to modify the 'Path' values
def modify_path(path):
    """Flatten a '/'-separated path into a single '_'-joined name.

    The first component (usually the dataset name) is discarded and the
    remaining components are joined with underscores. A path without any
    '/' therefore yields an empty string.

    Args:
        path: The '/'-separated path string.

    Returns:
        The flattened name as a string.
    """
    # Everything after the first '/' is the part to keep.
    _, _, remainder = path.partition('/')
    return remainder.replace('/', '_')

# Rewrite every entry of the 'Path' column with its flattened form
data['Path'] = data['Path'].map(modify_path)

# Write the updated DataFrame back to the metadata CSV
updated_csv_path = md_path  # Replace with your desired output file path
data.to_csv(updated_csv_path, index=False)

print("Updated CSV file saved to:", updated_csv_path)


Updated CSV file saved to: ../datasets/device/info_md.csv


In [12]:
import pandas as pd
import numpy as np

# Reload the metadata CSV
csv_file_path = md_path  # Replace with your CSV file path
data = pd.read_csv(csv_file_path)

# Seed NumPy's global generator so the shuffle below is reproducible
np.random.seed(0)

# Fractions for train and validation; the test split takes whatever remains
train_size = 0.7
val_size = 0.15

# Shuffle all rows and renumber them from 0
data = data.sample(frac=1).reset_index(drop=True)

# Turn the fractions into row counts (rounded down)
num_samples = len(data)
num_train = int(train_size * num_samples)
num_val = int(val_size * num_samples)

# Partition code by row position in the shuffled frame:
#   positions below num_train               -> 0 (train)
#   the following num_val positions         -> 1 (validation)
#   every remaining position                -> 2 (test)
data['partition'] = np.digitize(
    np.arange(num_samples), [num_train, num_train + num_val]
)

# Write the updated DataFrame back to the metadata CSV
updated_csv_path = md_path   # Replace with your desired output file path
data.to_csv(updated_csv_path, index=False)

print("Updated CSV file with partitions saved to:", updated_csv_path)


Updated CSV file with partitions saved to: ../datasets/device/info_md.csv


# Statistics

Calculate the number of samples in each group.

In [13]:
# calculate number of images in each group
import pandas as pd
import numpy as np

# Reload the metadata CSV
csv_file_path = md_path  # Replace with your CSV file path
data = pd.read_csv(csv_file_path)

# Overall size and per-class sizes (1 = healthy, -1 = unhealthy)
num_samples = data.shape[0]
num_healthy = int((data['Healthy/Unhealthy'] == 1).sum())
num_unhealthy = int((data['Healthy/Unhealthy'] == -1).sum())

print(f"Total number of samples: {num_samples}")
print(f"Number of healthy samples: {num_healthy}")
print(f"Number of unhealthy samples: {num_unhealthy}")

# Size of each of the four groups, counted from the `group` column
group_0, group_1, group_2, group_3 = (
    int((data['group'] == group_id).sum()) for group_id in range(4)
)

print(f"Number of samples in group 0: {group_0}")
print(f"Number of samples in group 1: {group_1}")
print(f"Number of samples in group 2: {group_2}")
print(f"Number of samples in group 3: {group_3}")




Total number of samples: 93455
Number of healthy samples: 16895
Number of unhealthy samples: 76560
Number of samples in group 0: 49917
Number of samples in group 1: 7421
Number of samples in group 2: 26643
Number of samples in group 3: 9474


# Note:

The number of samples in the CSV file has not been controlled yet. We enforce the class imbalance in the dataset later.