In [1]:
import pandas as pd

# Location of the preprocessed MIMIC-CXR metadata table
file_path = 'Z:/tale2/Shared/Mohammod/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata_preprocessed.csv'

# Load the full metadata table
df = pd.read_csv(file_path)

# Build one boolean mask per criterion, then keep only rows that satisfy both:
# patient orientation 'Erect' and view position 'PA'
is_erect = df['PatientOrientationCodeSequence_CodeMeaning'] == 'Erect'
is_pa = df['ViewPosition'] == 'PA'
df = df[is_erect & is_pa]

# Write the filtered subset next to the source file (row index is not written)
df.to_csv('Z:/tale2/Shared/Mohammod/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic_cxr_metadata.csv', index=False)


In [32]:
import pandas as pd

# Inputs: the CXR and ECG metadata tables from which missing-file rows were already removed
cxr_file_path = 'Z:/tale2/Shared/Mohammod/physionet.org/files/mimic-cxr-jpg/2.0.0/filtered_mimic_cxr_metadata_with_no_missing_files.csv'
ecg_file_path = 'Z:/tale2/Shared/Mohammod/mimic-iv-ecg-diagnostic-electrocardiogram-matched-subset-1.0/filtered_mimic_ecg_metadata_no_missing_files.csv'

cxr_df = pd.read_csv(cxr_file_path)
ecg_df = pd.read_csv(ecg_file_path)

# Cast the join key to int in both tables so the IDs compare consistently
for frame in (cxr_df, ecg_df):
    frame['subject_id'] = frame['subject_id'].astype(int)

# Subjects that appear in both modalities
common_subject_ids = set(cxr_df['subject_id']) & set(ecg_df['subject_id'])

# Keep only the rows belonging to those shared subjects
filtered_cxr_df = cxr_df.loc[cxr_df['subject_id'].isin(common_subject_ids)]
filtered_ecg_df = ecg_df.loc[ecg_df['subject_id'].isin(common_subject_ids)]

# Persist both subsets to the working directory
filtered_cxr_df.to_csv('filtered_mimic_cxr_metadata.csv', index=False)
filtered_ecg_df.to_csv('filtered_mimic_ecg_metadata.csv', index=False)

# Verification: read the files back from disk and compare their subject_id sets
reloaded_cxr_df = pd.read_csv('filtered_mimic_cxr_metadata.csv')
reloaded_ecg_df = pd.read_csv('filtered_mimic_ecg_metadata.csv')

ids_match = set(reloaded_cxr_df['subject_id']) == set(reloaded_ecg_df['subject_id'])
print(
    "Verification successful: Both CSVs have the same unique subject_ids."
    if ids_match
    else "Verification failed: The CSVs do not have the same unique subject_ids."
)


Verification successful: Both CSVs have the same unique subject_ids.


In [33]:
import pandas as pd

# Summarize the subject-matched (not yet balanced) CXR and ECG metadata tables:
# number of distinct subjects and number of rows in each.
cxr_file_path = 'filtered_mimic_cxr_metadata.csv'
ecg_file_path = 'filtered_mimic_ecg_metadata.csv'

# Read the MIMIC-CXR CSV file and count its distinct subject_ids
cxr_df = pd.read_csv(cxr_file_path)
unique_subjects_cxr = cxr_df['subject_id'].nunique()
print(f"Unique subject_ids in MIMIC-CXR: {unique_subjects_cxr}")

# Read the MIMIC-ECG CSV file and count its distinct subject_ids
ecg_df = pd.read_csv(ecg_file_path)
unique_subjects_ecg = ecg_df['subject_id'].nunique()
print(f"Unique subject_ids in MIMIC-ECG: {unique_subjects_ecg}")

# Row counts per table. These files are the 'filtered_*' tables, which a later
# cell balances, so the labels say "filtered" rather than "balanced" (the
# previous wording was misleading: the two row counts are not expected to match here).
num_rows_cxr = len(cxr_df)
num_rows_ecg = len(ecg_df)

print(f"Number of rows in filtered MIMIC-CXR dataset: {num_rows_cxr}")
print(f"Number of rows in filtered MIMIC-ECG dataset: {num_rows_ecg}")


Unique subject_ids in MIMIC-CXR: 26220
Unique subject_ids in MIMIC-ECG: 26220
Number of rows in balanced MIMIC-CXR dataset: 53724
Number of rows in balanced MIMIC-ECG dataset: 239465


In [22]:
import pandas as pd

# Load the subject-matched CXR metadata (path is relative to the working directory)
csv_file = 'filtered_mimic_cxr_metadata.csv'
df = pd.read_csv(csv_file)

# Build each image's relative path: files/p<subject_id>/s<study_id>/<dicom_id>.jpg
# (assumes the images on disk follow this layout under the dataset root - TODO confirm).
# Column-wise string concatenation is used instead of a row-wise df.apply(axis=1):
# it avoids a Python-level call per row and still yields a (possibly empty) Series
# when the table has no rows, whereas apply returns a DataFrame in that case.
df['file_path'] = (
    'files/p' + df['subject_id'].astype(str)
    + '/s' + df['study_id'].astype(str)
    + '/' + df['dicom_id'].astype(str)
    + '.jpg'
)

# Save the table, now including 'file_path', alongside the dataset
output_file = 'Z:/tale2/Shared/Mohammod/physionet.org/files/mimic-cxr-jpg/2.0.0/filtered_mimic_cxr_metadata_with_path.csv'
df.to_csv(output_file, index=False)

print("CSV file saved with the new 'file_path' column.")


CSV file saved with the new 'file_path' column.


In [24]:
import pandas as pd
import os

# Input table (with the 'file_path' column) and the directory its paths are relative to
csv_file = 'Z:/tale2/Shared/Mohammod/physionet.org/files/mimic-cxr-jpg/2.0.0/filtered_mimic_cxr_metadata_with_path.csv'
root = 'Z:/tale2/Shared/Mohammod/physionet.org/files/mimic-cxr-jpg/2.0.0/'

df = pd.read_csv(csv_file)

# True for every row whose image exists on disk under the root directory
file_exists = df['file_path'].map(lambda rel_path: os.path.exists(os.path.join(root, rel_path)))

# Index labels of the rows whose image could not be found
missing_file_indices = df.index[~file_exists]

# Report how many rows are about to be discarded
print(f"Number of missing files (rows being removed): {len(missing_file_indices)}")

# Discard those rows
df = df.drop(index=missing_file_indices)

# Write the cleaned table to a new CSV file
output_csv_file = 'Z:/tale2/Shared/Mohammod/physionet.org/files/mimic-cxr-jpg/2.0.0/filtered_mimic_cxr_metadata_with_no_missing_files.csv'
df.to_csv(output_csv_file, index=False)

print("CSV file saved with missing files removed.")


Number of missing files (rows being removed): 946
CSV file saved with missing files removed.


In [25]:
import pandas as pd
import os

# Input ECG table (read from the working directory) and the directory its 'path' values are relative to
csv_file = 'filtered_mimic_ecg_metadata.csv'
root = 'Z:/tale2/Shared/Mohammod/mimic-iv-ecg-diagnostic-electrocardiogram-matched-subset-1.0'

df = pd.read_csv(csv_file)

# True for every row whose '<path>.dat' file exists on disk under the root directory
file_exists = df['path'].map(lambda rel_path: os.path.exists(os.path.join(root, rel_path + '.dat')))

# Index labels of the rows whose .dat file could not be found
missing_file_indices = df.index[~file_exists]

# Report how many rows are about to be discarded
print(f"Number of missing files (rows being removed): {len(missing_file_indices)}")

# Discard those rows
df = df.drop(index=missing_file_indices)

# Write the cleaned table to a new CSV file
output_csv_file = 'Z:/tale2/Shared/Mohammod/mimic-iv-ecg-diagnostic-electrocardiogram-matched-subset-1.0/filtered_mimic_ecg_metadata_no_missing_files.csv'
df.to_csv(output_csv_file, index=False)

print("CSV file saved with missing files removed.")


Number of missing files (rows being removed): 74148
CSV file saved with missing files removed.


In [None]:
import pandas as pd
import os

# Sanity check: count rows of the cleaned CXR table whose image is still missing on disk.
csv_file = 'Z:/tale2/Shared/Mohammod/physionet.org/files/mimic-cxr-jpg/2.0.0/filtered_mimic_cxr_metadata_with_no_missing_files.csv'
df = pd.read_csv(csv_file)

# Root directory that the relative image paths are resolved against
root = 'Z:/tale2/Shared/Mohammod/physionet.org/files/mimic-cxr-jpg/2.0.0/'

# This CSV was written by the cell that checks the 'file_path' column, so that is
# the column to read here ('path' is the column name used for the ECG table).
# The stored values are joined to the root as-is; no extension is appended.
# A generator is used so the loop variable does not overwrite any notebook-level name.
missing_files_count = sum(
    not os.path.exists(os.path.join(root, relative_path))
    for relative_path in df['file_path']
)

print(f"Number of missing files: {missing_files_count}")


In [34]:
import pandas as pd

# Balance the two modalities per subject: for every subject_id in the CXR table,
# keep the same number of CXR rows and ECG rows (the smaller of the two counts),
# chosen by seeded random sampling, then save both balanced tables.
cxr_file_path = 'filtered_mimic_cxr_metadata.csv'
ecg_file_path = 'filtered_mimic_ecg_metadata.csv'

cxr_df = pd.read_csv(cxr_file_path)
ecg_df = pd.read_csv(ecg_file_path)

# Ensure 'subject_id' is int in both tables so the keys compare equal
cxr_df['subject_id'] = cxr_df['subject_id'].astype(int)
ecg_df['subject_id'] = ecg_df['subject_id'].astype(int)

# Subjects in order of first appearance in the CXR table
subject_ids = cxr_df['subject_id'].unique()

# Split each table by subject once. Each group keeps the original row order and
# index, so it matches what a boolean filter on subject_id would return, without
# re-scanning both full tables for every subject.
cxr_groups = dict(iter(cxr_df.groupby('subject_id', sort=False)))
ecg_groups = dict(iter(ecg_df.groupby('subject_id', sort=False)))

# Zero-row frame with the ECG columns, used for subjects that have no ECG rows
no_ecg_rows = ecg_df.iloc[0:0]

# Lists to store the per-subject balanced samples
balanced_cxr_data = []
balanced_ecg_data = []

for subject_id in subject_ids:
    cxr_rows = cxr_groups[subject_id]
    ecg_rows = ecg_groups.get(subject_id, no_ecg_rows)

    # Number of rows to keep from each modality for this subject
    min_count = min(len(cxr_rows), len(ecg_rows))

    # Fixed random_state keeps the selection reproducible across runs
    balanced_cxr_data.append(cxr_rows.sample(n=min_count, random_state=1))
    balanced_ecg_data.append(ecg_rows.sample(n=min_count, random_state=1))

# Concatenate the per-subject samples; pd.concat rejects an empty list, so fall
# back to a zero-row frame with the right columns when there are no subjects.
balanced_cxr_df = (
    pd.concat(balanced_cxr_data, ignore_index=True) if balanced_cxr_data else cxr_df.iloc[0:0]
)
balanced_ecg_df = (
    pd.concat(balanced_ecg_data, ignore_index=True) if balanced_ecg_data else ecg_df.iloc[0:0]
)

# Save the balanced DataFrames
balanced_cxr_df.to_csv('Z:/tale2/Shared/Mohammod/physionet.org/files/mimic-cxr-jpg/2.0.0/final_mimic_cxr_metadata.csv', index=False)
balanced_ecg_df.to_csv('Z:/tale2/Shared/Mohammod/mimic-iv-ecg-diagnostic-electrocardiogram-matched-subset-1.0/final_mimic_ecg_metadata.csv', index=False)

# Count the number of rows in each DataFrame
num_rows_cxr = balanced_cxr_df.shape[0]
num_rows_ecg = balanced_ecg_df.shape[0]

print(f"Number of rows in balanced MIMIC-CXR dataset: {num_rows_cxr}")
print(f"Number of rows in balanced MIMIC-ECG dataset: {num_rows_ecg}")

print("Balancing complete. Balanced datasets saved.")


Number of rows in balanced MIMIC-CXR dataset: 50981
Number of rows in balanced MIMIC-ECG dataset: 50981
Balancing complete. Balanced datasets saved.
