# 1. Dataset statistics

In [7]:
import os
import re
from collections import defaultdict
import pandas as pd
from IPython.display import display

# Folder holding the compressed segmentation files to be counted
folder_path = "../Processed/PII_GZ"

# A file is counted only if its name ends in WK<2-3 digits>_seg.nii.gz
week_pattern = re.compile(r'WK(\d{2,3})_seg\.nii\.gz')

# Tally seg.nii.gz files per gestational week (week number -> file count)
week_count = defaultdict(int)
for file_name in os.listdir(folder_path):
    if not file_name.endswith("seg.nii.gz"):
        continue
    match = week_pattern.search(file_name)
    if match is None:
        # Name carries no recognisable WKxx tag: not counted anywhere
        continue
    week_count[int(match.group(1))] += 1

# Every counted file contributes to exactly one week bucket
total_seg_count = sum(week_count.values())

# Pregnancy stages expressed as (non-overlapping) week ranges
pregnancy_stages = {
    "Early Pregnancy (Week: 0-19)": range(0, 20),  # 0-19 weeks
    "Mid Pregnancy (Week: 20-30)": range(20, 31),  # 20-30 weeks
    "Late Pregnancy (Week 31-40)": range(31, 41),  # 31-40 weeks
}

# Per-stage totals; weeks that fall outside every range are not counted in any stage
stage_counts = {
    stage: sum(count for week, count in week_count.items() if week in weeks)
    for stage, weeks in pregnancy_stages.items()
}

# Week-by-week table, ordered by week number, with a trailing "Total" row
week_rows = [(f"WK{week}", count) for week, count in sorted(week_count.items())]
week_rows.append(("Total", total_seg_count))
week_df = pd.DataFrame(week_rows, columns=["Week", "Number of seg.nii.gz Files"])

# Pregnancy-stage table with a trailing "Total" row (sum over the stages only)
stage_rows = list(stage_counts.items())
stage_rows.append(("Total", sum(stage_counts.values())))
stage_df = pd.DataFrame(stage_rows, columns=["Pregnancy Stage", "Number of seg.nii.gz Files"])

# Show the two tables one after the other
print("Week-by-Week Statistics:")
display(week_df)

print("\nPregnancy Stage Statistics:")
display(stage_df)

Week-by-Week Statistics:


Unnamed: 0,Week,Number of seg.nii.gz Files
0,WK12,35
1,WK14,1
2,WK20,38
3,WK21,2
4,WK30,1
5,WK32,33
6,WK33,1
7,WK36,19
8,WK37,2
9,Total,132



Pregnancy Stage Statistics:


Unnamed: 0,Pregnancy Stage,Number of seg.nii.gz Files
0,Early Pregnancy (Week: 0-19),36
1,Mid Pregnancy (Week: 20-30),41
2,Late Pregnancy (Week 31-40),55
3,Total,132


# 2. Automatic file format conversion: nii to nii.gz

In [11]:
import os
import gzip
import shutil
import SimpleITK as sitk

# Source tree (one sub-folder per scan) and the flat destination folder
input_folder = "../Raw/ForSeg"
output_folder = "../Processed/PII_GZ"

os.makedirs(output_folder, exist_ok=True)

# (file name inside each scan folder, suffix appended to the folder name on output)
conversion_targets = (
    ("b0_img.nii", "_b0_img.nii.gz"),
    ("seg.nii", "_seg.nii.gz"),
)

# Visit every entry of ForSeg; anything that is not a directory is ignored
for folder_name in os.listdir(input_folder):
    folder_path = os.path.join(input_folder, folder_name)
    if os.path.isdir(folder_path):
        for source_name, output_suffix in conversion_targets:
            source_file = os.path.join(folder_path, source_name)
            if not os.path.exists(source_file):
                # This scan folder does not contain that file; nothing to convert
                continue
            # Read the image and write it back out under the .nii.gz name
            compressed_file_path = os.path.join(output_folder, folder_name + output_suffix)
            sitk.WriteImage(sitk.ReadImage(source_file), compressed_file_path)

print("Conversion to .nii.gz completed!")

Conversion to .nii.gz completed!


# 3. Copy each scan's original MRI and its segmentation mask into the `images` and `labels` folders, respectively

In [6]:
import os
import shutil
import csv

# Define the file paths
forseg_path = "../Processed/PII_GZ"  # Now use the compressed file path
images_path = os.path.join("../Processed/PII_Full", "images")
labels_path = os.path.join("../Processed/PII_Full", "labels")
csv_file = os.path.join("../Processed/PII_Full", "PII_log.csv")

if not os.path.exists(forseg_path):
    raise FileNotFoundError(f"Path does not exist: {forseg_path}")

# If the images and labels folders do not exist, create them
# (this also creates the PII_Full folder that will hold the CSV log)
os.makedirs(images_path, exist_ok=True)
os.makedirs(labels_path, exist_ok=True)

# Running number used to build the new PII_### file names
global_count = 1

# Record the mapping between the original file pair and the new file name.
# UTF-8 is written explicitly so the log reads back the same way on every platform.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Original MRI File", "Original Seg File", "Dataset", "New Filename"])

    # os.listdir returns names in arbitrary order; sorting keeps the
    # "original file -> PII_###" numbering identical from run to run.
    for filename in sorted(os.listdir(forseg_path)):
        if not filename.endswith("_b0_img.nii.gz"):
            continue

        b0_img_file = os.path.join(forseg_path, filename)
        seg_file = os.path.join(forseg_path, filename.replace("_b0_img.nii.gz", "_seg.nii.gz"))

        if not os.path.exists(seg_file):
            # Only complete image/segmentation pairs are copied; report the
            # skipped image instead of dropping it silently.
            print(f"Skipping {filename}: no matching segmentation file found.")
            continue

        # Image and label share the same new name in their respective folders
        new_file_name = f"PII_{global_count:03d}.nii.gz"

        # Copy b0_img.nii.gz to the images folder
        new_b0_img_path = os.path.join(images_path, new_file_name)
        shutil.copy2(b0_img_file, new_b0_img_path)

        # Copy seg.nii.gz to the labels folder
        new_seg_path = os.path.join(labels_path, new_file_name)
        shutil.copy2(seg_file, new_seg_path)

        # One log row per copied file: one for the image, one for the label
        writer.writerow([b0_img_file, seg_file, "images", new_file_name])
        writer.writerow([b0_img_file, seg_file, "labels", new_file_name])

        global_count += 1

print("Task completed! File copies and CSV log created. \n Extract each folder's original MRI and its segmentation result MRI into images, labels, and folders respectively.")

Task completed! File copies and CSV log created. 
 Extract each folder's original MRI and its segmentation result MRI into images, labels, and folders respectively.


# 4. Dataset division by week range

In [19]:
import pandas as pd
import os
import shutil
import csv

# Step 3: Split data into train/test sets based on week range using PII_log.csv
def _week_from_mri_path(mri_path):
    """Return the gestational week encoded as 'WK<number>' in the file name of mri_path."""
    import re  # local import so this cell does not rely on imports made in other cells

    # Look at the file name only (either path separator), so a directory
    # name that happens to contain "WK" cannot be mistaken for the week tag.
    file_name = re.split(r"[\\/]", str(mri_path))[-1]
    match = re.search(r"WK(\d+)", file_name)
    if match is None:
        raise ValueError(f"Cannot find a 'WK<number>' week tag in file name: {mri_path}")
    return int(match.group(1))


def split_data_by_week_range(log_file, input_folder, output_base_folder, week_range):
    """
    Split the dataset into training and testing sets based on the week range specified in PII_log.csv.

    Cases whose week lies inside week_range (both ends inclusive) are copied to
    imagesTs/labelsTs; all other cases are copied to imagesTr/labelsTr. The
    output goes to <output_base_folder>/PII_<start>-<end>, together with
    train_log.csv and test_log.csv describing every copy.

    Args:
        log_file (str): Path to PII_log.csv. Must contain the columns
            "Original MRI File", "Original Seg File" and "New Filename".
        input_folder (str): Folder containing the "images" and "labels" sub-folders.
        output_base_folder (str): Folder in which the PII_<start>-<end> folder is created.
        week_range (tuple): (first_week, last_week) of the test set, inclusive.

    Raises:
        ValueError: If a row's "Original MRI File" has no 'WK<number>' tag in its file name.
        FileNotFoundError: If a file listed in the log is missing from input_folder.
    """
    # Read the PII_log.csv file
    df = pd.read_csv(log_file, header=0)

    # The log can list the same case more than once (e.g. one "images" row and
    # one "labels" row sharing a "New Filename"). Process each case once so
    # files are not copied twice and the split logs hold one row per case.
    cases = df.drop_duplicates(subset="New Filename")

    # Define paths to the images and labels folders
    images_path = os.path.join(input_folder, "images")
    labels_path = os.path.join(input_folder, "labels")

    # Create a range name for the output folder
    first_week, last_week = week_range[0], week_range[1]
    range_name = f"{first_week}-{last_week}"
    output_folder = os.path.join(output_base_folder, f"PII_{range_name}")

    # Define paths for train/test subfolders
    images_tr_path = os.path.join(output_folder, "imagesTr")
    labels_tr_path = os.path.join(output_folder, "labelsTr")
    images_ts_path = os.path.join(output_folder, "imagesTs")
    labels_ts_path = os.path.join(output_folder, "labelsTs")

    # Create the output directories
    for path in (images_tr_path, labels_tr_path, images_ts_path, labels_ts_path):
        os.makedirs(path, exist_ok=True)

    # Paths for train and test log files
    train_log = os.path.join(output_folder, "train_log.csv")
    test_log = os.path.join(output_folder, "test_log.csv")

    log_header = ["Original MRI Path", "Original Seg Path", "New MRI Path", "New Seg Path"]

    # Open the log files for writing
    with open(train_log, mode='w', newline='') as train_file, \
         open(test_log, mode='w', newline='') as test_file:
        train_writer = csv.writer(train_file)
        test_writer = csv.writer(test_file)

        # Write the headers for both CSV files
        train_writer.writerow(log_header)
        test_writer.writerow(log_header)

        for _, row in cases.iterrows():
            new_name = row["New Filename"]

            # Image and label share the same file name in their own folders
            original_img = os.path.join(images_path, new_name)
            original_seg = os.path.join(labels_path, new_name)

            week_num = _week_from_mri_path(row["Original MRI File"])

            # Weeks inside the range (inclusive) form the test set, the rest the train set
            if first_week <= week_num <= last_week:
                img_dir, seg_dir, writer = images_ts_path, labels_ts_path, test_writer
            else:
                img_dir, seg_dir, writer = images_tr_path, labels_tr_path, train_writer

            new_img = os.path.join(img_dir, new_name)
            new_seg = os.path.join(seg_dir, new_name)

            shutil.copy2(original_img, new_img)
            shutil.copy2(original_seg, new_seg)

            # Log the operation in the matching split log
            writer.writerow([row["Original MRI File"], row["Original Seg File"], new_img, new_seg])

    print(f"Data has been successfully split into training and testing sets based on the week range {week_range}!")

log_file = "../Processed/PII_Full/PII_log.csv"
input_folder = "../Processed/PII_Full"
output_folder = "../Processed"

# Week range (both ends inclusive) that forms the TEST set; every other week
# goes to the training set. It must be defined here so the cell also runs on a
# fresh kernel. Other ranges used with this cell: (0, 19), (20, 30), (0, 30).
week_range = (31, 40)

split_data_by_week_range(log_file, input_folder, output_folder, week_range)

Data has been successfully split into training and testing sets based on the week range (31, 40)!


# 5. Create and modify the relevant json files in the final dataset to form a complete MRI dataset

In [6]:
import json
import os
from datetime import datetime

# Step 4: Update dataset.json
def update_dataset_json(output_folder, dataset_name, description, reference, license_info):
    """
    Writes (or overwrites) the dataset.json file for the dataset.

    The training and test entries are built from the .nii.gz files found in
    <output_folder>/imagesTr and <output_folder>/imagesTs. Each training entry
    pairs an image with the label of the same file name under ./labelsTr; the
    label file's existence is not checked here.

    Args:
        output_folder (str): Path to the folder where the dataset.json will be saved.
            Must already contain the imagesTr and imagesTs sub-folders.
        dataset_name (str): Name of the dataset.
        description (str): Description of the dataset.
        reference (str): Reference or citation for the dataset.
        license_info (str): License information for the dataset (stored under the "licence" key).

    Raises:
        FileNotFoundError: If imagesTr or imagesTs does not exist in output_folder.
    """
    # Define paths for the dataset
    json_file_path = os.path.join(output_folder, "dataset.json")
    images_tr_path = os.path.join(output_folder, "imagesTr")
    images_ts_path = os.path.join(output_folder, "imagesTs")

    # os.listdir returns names in arbitrary order; sort them so the same
    # folder contents always produce the same dataset.json.
    training_files = sorted(f for f in os.listdir(images_tr_path) if f.endswith(".nii.gz"))
    test_files = sorted(f for f in os.listdir(images_ts_path) if f.endswith(".nii.gz"))

    # Structure of the dataset.json
    data = {
        "name": dataset_name,
        "description": description,
        "tensorImageSize": "3D",
        "reference": reference,
        "licence": license_info,
        "release": datetime.now().strftime("%Y-%m-%d"),
        "modality": {
            "0": "MRI"
        },
        "labels": {
            "0": "background",
            "1": "Placenta in Uterus"
        },
        # Image and label share a file name in their respective folders
        "training": [
            {"image": f"./imagesTr/{img_file}", "label": f"./labelsTr/{img_file}"}
            for img_file in training_files
        ],
        "test": [f"./imagesTs/{img_file}" for img_file in test_files],
    }

    # Record the counts alongside the lists
    data["numTraining"] = len(data["training"])
    data["numTest"] = len(data["test"])

    # Write the data to the dataset.json file
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"dataset.json updated successfully at {json_file_path}!")

# Example usage
# Dataset folder whose dataset.json is generated. Other folders this cell has
# been pointed at:
#   ../Processed/PII_00-19, ../Processed/PII_20-30, ../Processed/PII_31-40,
#   ../Processed/PII_SingleSet_final/{00192030, 00193140, 20300019, 20303140, 31400019}
output_folder = "../Processed/PII_SingleSet_final/31402030"

# Descriptive metadata written into dataset.json
dataset_metadata = {
    "dataset_name": "Placenta Segmentation",
    "description": "Segmentation of Placenta MRI Data",
    "reference": "Washington University in St. Louis",
    "license_info": "CC BY-NC 4.0",
}

update_dataset_json(output_folder, **dataset_metadata)

dataset.json updated successfully at ../Processed/PII_SingleSet_final/31402030\dataset.json!
