In [2]:
import os
import cv2
import math
import numpy as np
import pydicom as pdcm
from skimage.draw import polygon
import xml.etree.ElementTree as ET
from matplotlib import pyplot as plt
import matplotlib.patches as patches

Seg&Det

In [1]:
import os
import pandas as pd
import numpy as np
from skimage.draw import polygon
import cv2
import pydicom as pdcm
import math
import xml.etree.ElementTree as ET

class Annotation:
    def __init__(self, xml_path, csv_path, filename, shape):
        self.xml_path = xml_path + filename + '.xml'
        self.csv_path = csv_path
        self.filename = filename
        self.shape = shape
        self.mask_mass = self.create_mask_array(shape)
        self.fill_mask()
        self.bboxes = self.load_bboxes_from_csv()

    def fill_mask(self):
        if os.path.exists(self.xml_path):
            rois, num_rois = self.parse_XML(self.xml_path)

            for roi in rois:
                roi_info = self.get_roi_info(roi)
                r_poly, c_poly = self.create_polygon_lists(self.mask_mass, roi_info["points"])
                rr, cc = polygon(r_poly, c_poly)
                try:
                    if roi_info["roi_type"] == "Mass":
                        self.mask_mass[rr, cc] = 1  # Mass mask
                except IndexError:
                    valid_idx = (rr >= 0) & (rr < self.mask_mass.shape[0]) & (cc >= 0) & (cc < self.mask_mass.shape[1])
                    rr = rr[valid_idx]
                    cc = cc[valid_idx]
                    if roi_info["roi_type"] == "Mass":
                        self.mask_mass[rr, cc] = 1
                    print('out of bound:', self.xml_path)

    def parse_XML(self, xml_path):
        tree = ET.parse(xml_path)
        root = tree.getroot()  # The root of the XML file
        data = root[0][1]  # The essential info
        rois = data[0][5]  # Array containing the ROI objects
        num_of_rois = int(data[0][3].text)  # Number of ROI objects
        return rois, num_of_rois

    def create_mask_array(self, img_shape):
        return np.zeros((img_shape[0], img_shape[1]), dtype=np.uint8)

    def get_roi_info(self, roi):
        roi_info = {
            "points": roi[21],  # Array containing the points of a ROI
            "num_of_points": int(roi[17].text),  # Number of points of the area
            "roi_index": int(roi[7].text),  # Identifier of the ROI
            "roi_type": roi[15].text  # (Mass, Calcification, other)
        }
        return roi_info

    def create_polygon_lists(self, mask, points):
        r_poly = np.array([])
        c_poly = np.array([])

        for point in points:
            temp_tuple = point.text[1:-1].split(",")
            y = int(math.trunc(float(temp_tuple[0])))
            x = int(math.trunc(float(temp_tuple[1])))
            r_poly = np.append(r_poly, x)
            c_poly = np.append(c_poly, y)

        return r_poly, c_poly

    def load_bboxes_from_csv(self):
        bboxes = []
        if os.path.exists(self.csv_path):
            df = pd.read_csv(self.csv_path)
            file_bboxes = df[df['File Name'] == np.int64(self.filename)]
            for _, row in file_bboxes.iterrows():
                x1 = row['X']
                y1 = row['Y']
                x2 = x1 + row['W']
                y2 = y1 + row['H']
                bboxes.append([x1, y1, x2, y2])
        return bboxes

    def np_CountUpContinuingOnes(self,b_arr):
        # indice continuing zeros from left side.
        # ex: [0,1,1,0,1,0,0,1,1,1,0] -> [0,0,0,3,3,5,6,6,6,6,10]
        left = np.arange(len(b_arr))
        left[b_arr > 0] = 0
        left = np.maximum.accumulate(left)

        # from right side.
        # ex: [0,1,1,0,1,0,0,1,1,1,0] -> [0,3,3,3,5,5,6,10,10,10,10]
        rev_arr = b_arr[::-1]
        right = np.arange(len(rev_arr))
        right[rev_arr > 0] = 0
        right = np.maximum.accumulate(right)
        right = len(rev_arr) - 1 - right[::-1]

        return right - left - 1


    def adjust_bounding_box(self,original_coords, left_crop, top_crop):
        x1, y1, x2, y2 = original_coords

        x1_new = x1 - left_crop
        y1_new = y1 - top_crop
        x2_new = x2 - left_crop
        y2_new = y2 - top_crop

        return x1_new, y1_new, x2_new, y2_new


    def ExtractBreast(self, img, mask, true_bounding_boxes):
        img_copy = img.copy()
        mask_copy = mask.copy()

        # 将像素值小于等于20的部分设为0，以便更容易检测背景
        img = np.where(img <= 20, 0, img)
        height, _ = img.shape

        # 处理列方向（垂直方向）
        y_a = height // 2 + int(height * 0.4)
        y_b = height // 2 - int(height * 0.4)
        b_arr = img[y_b:y_a].std(axis=0) != 0
        continuing_ones = self.np_CountUpContinuingOnes(b_arr)
        # 最长的非零区域应为乳房部分
        col_ind = np.where(continuing_ones == continuing_ones.max())[0]
        
        # 对图像和掩码进行列裁剪
        img = img[:, col_ind]
        mask = mask[:, col_ind]

        # 处理行方向（水平方向）
        _, width = img.shape
        x_a = width // 2 + int(width * 0.4)
        x_b = width // 2 - int(width * 0.4)
        b_arr = img[:, x_b:x_a].std(axis=1) != 0
        continuing_ones = self.np_CountUpContinuingOnes(b_arr)
        # 最长的非零区域应为乳房部分
        row_ind = np.where(continuing_ones == continuing_ones.max())[0]
        
        # 对图像和掩码进行行裁剪
        img = img[row_ind, :]
        mask = mask[row_ind, :]

        # 调整边界框坐标
        adjusted_bboxes = []
        for bbox in true_bounding_boxes:
            adjusted_coords = self.adjust_bounding_box(bbox, col_ind[0], row_ind[0])
            adjusted_bboxes.append(adjusted_coords)

        return img_copy[row_ind][:, col_ind], mask_copy[row_ind][:, col_ind], adjusted_bboxes


    # def find_cropped_image_position(self, full_image, cropped_image): 


    #     if cropped_image.shape[0] > full_image.shape[0] or cropped_image.shape[1] > full_image.shape[1]:
    #         raise ValueError("Cropped image is larger than the full image in one or both dimensions.")
    #     res = cv2.matchTemplate(full_image.astype(np.uint8), cropped_image.astype(np.uint8), cv2.TM_CCOEFF_NORMED)
    #     min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res)
    #     top_left = max_loc
    #     bottom_right = (top_left[0] + cropped_image.shape[1], top_left[1] + cropped_image.shape[0])
    #     return (*top_left, *bottom_right)

    def save_data(self, img, mask, bboxes, output_dir, img_name):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # 保存原图为 JPG 格式
        jpg_path = os.path.join(output_dir, f'img.jpg')
        cv2.imwrite(jpg_path, img)

        # 保存mask为 PNG 格式
        png_path = os.path.join(output_dir, f'mask.png')
        cv2.imwrite(png_path, mask * 255)

        # 保存边界框数据为 NumPy 文件
        npy_path = os.path.join(output_dir, f'bboxes.npy')
        np.save(npy_path, np.array(bboxes))

        print(f'Mask, image, and bounding boxes saved for {img_name} in {output_dir}')

    def save_mask_and_image(self, dcm_path, output_base_path, img_name):
        dcm = pdcm.dcmread(dcm_path)
        img = dcm.pixel_array
        # 预处理图像和掩码
        processed_img, self.mask_mass, processed_bboxes = self.ExtractBreast(img, self.mask_mass, self.bboxes)
        # 裁剪出原始边界框对应的部分
        # preprocessed_bboxes = []
        # for bbox in self.bboxes:
        #     x_min, y_min, x_max, y_max = bbox
        #     cropped_img = img[int(y_min):int(y_max), int(x_min):int(x_max)]
        #     # 在预处理后的图像中匹配裁剪图像部分
        #     new_bbox = self.find_cropped_image_position(processed_img, cropped_img)
        #     preprocessed_bboxes.append(new_bbox)

        img = cv2.normalize(processed_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
        if np.any(self.mask_mass == 1):
            output_dir = os.path.join(output_base_path, img_name)
            self.save_data(img, self.mask_mass, processed_bboxes, output_dir, img_name)

# 遍历 DCM_PATH 中的所有 .dcm 文件
XML_PATH = "/Volumes/图图/INbreast/INbreast/AllXML/"
CSV_PATH = "/Volumes/图图/INBreast/INbreast/BoundingBoxes_Mass_Classes_2.csv"
DCM_PATH = "/Volumes/图图/INbreast/INbreast/AllDICOMs/"
OUTPUT_BASE_PATH = "/Volumes/图图/INbreast/INbreast/seg&det/"

for filename in os.listdir(DCM_PATH):
    if filename.endswith('.dcm') and not filename.startswith('._'):
        img_name = filename.split('.')[0]
        dcm_path = os.path.join(DCM_PATH, filename)
        dcm = pdcm.dcmread(dcm_path)
        img = dcm.pixel_array
        annotation = Annotation(XML_PATH, CSV_PATH, img_name, img.shape)

        # 保存 mask 和原图
        annotation.save_mask_and_image(dcm_path, OUTPUT_BASE_PATH, img_name)
    else:
        print(f"Skipping non-DICOM file: {filename}")


Skipping non-DICOM file: ._20586960.dcm
Mask, image, and bounding boxes saved for 20586960 in /Volumes/图图/INbreast/INbreast/seg&det/20586960
Mask, image, and bounding boxes saved for 20586986 in /Volumes/图图/INbreast/INbreast/seg&det/20586986
Skipping non-DICOM file: ._20586986.dcm
Skipping non-DICOM file: ._20587054.dcm
Skipping non-DICOM file: ._20587080.dcm
Skipping non-DICOM file: ._20587148.dcm
Skipping non-DICOM file: ._20587174.dcm
Skipping non-DICOM file: ._20587200.dcm
Skipping non-DICOM file: ._20587226.dcm
Skipping non-DICOM file: ._20587294.dcm
Skipping non-DICOM file: ._20587320.dcm
Skipping non-DICOM file: ._20587346.dcm
Skipping non-DICOM file: ._20587372.dcm
Skipping non-DICOM file: ._20587466.dcm
Skipping non-DICOM file: ._20587492.dcm
Skipping non-DICOM file: ._20587518.dcm
Skipping non-DICOM file: ._20587544.dcm
Mask, image, and bounding boxes saved for 20587612 in /Volumes/图图/INbreast/INbreast/seg&det/20587612
Skipping non-DICOM file: ._20587612.dcm
Skipping non-DICO

seg&det_split

In [1]:
import os
import pandas as pd
import numpy as np
from skimage.draw import polygon
import cv2
import pydicom as pdcm
import math
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split

class Annotation:
    def __init__(self, xml_path, csv_path, filename, shape):
        self.xml_path = xml_path + filename + '.xml'
        self.csv_path = csv_path
        self.filename = filename
        self.shape = shape
        self.mask_mass = self.create_mask_array(shape)
        self.fill_mask()
        self.bboxes = self.load_bboxes_from_csv()

    def fill_mask(self):
        if os.path.exists(self.xml_path):
            rois, num_rois = self.parse_XML(self.xml_path)

            for roi in rois:
                roi_info = self.get_roi_info(roi)
                r_poly, c_poly = self.create_polygon_lists(self.mask_mass, roi_info["points"])
                rr, cc = polygon(r_poly, c_poly)
                try:
                    if roi_info["roi_type"] == "Mass":
                        self.mask_mass[rr, cc] = 1  # Mass mask
                except IndexError:
                    valid_idx = (rr >= 0) & (rr < self.mask_mass.shape[0]) & (cc >= 0) & (cc < self.mask_mass.shape[1])
                    rr = rr[valid_idx]
                    cc = cc[valid_idx]
                    if roi_info["roi_type"] == "Mass":
                        self.mask_mass[rr, cc] = 1
                    print('out of bound:', self.xml_path)

    def parse_XML(self, xml_path):
        tree = ET.parse(xml_path)
        root = tree.getroot()  # The root of the XML file
        data = root[0][1]  # The essential info
        rois = data[0][5]  # Array containing the ROI objects
        num_of_rois = int(data[0][3].text)  # Number of ROI objects
        return rois, num_of_rois

    def create_mask_array(self, img_shape):
        return np.zeros((img_shape[0], img_shape[1]), dtype=np.uint8)

    def get_roi_info(self, roi):
        roi_info = {
            "points": roi[21],  # Array containing the points of a ROI
            "num_of_points": int(roi[17].text),  # Number of points of the area
            "roi_index": int(roi[7].text),  # Identifier of the ROI
            "roi_type": roi[15].text  # (Mass, Calcification, other)
        }
        return roi_info

    def create_polygon_lists(self, mask, points):
        r_poly = np.array([])
        c_poly = np.array([])

        for point in points:
            temp_tuple = point.text[1:-1].split(",")
            y = int(math.trunc(float(temp_tuple[0])))
            x = int(math.trunc(float(temp_tuple[1])))
            r_poly = np.append(r_poly, x)
            c_poly = np.append(c_poly, y)

        return r_poly, c_poly

    def load_bboxes_from_csv(self):
        bboxes = []
        if os.path.exists(self.csv_path):
            df = pd.read_csv(self.csv_path)
            file_bboxes = df[df['File Name'] == np.int64(self.filename)]
            for _, row in file_bboxes.iterrows():
                x1 = row['X']
                y1 = row['Y']
                x2 = x1 + row['W']
                y2 = y1 + row['H']
                bboxes.append([x1, y1, x2, y2])
        return bboxes

    def preprocess(self, img, mask_mass):
        rows_to_keep = np.any(img != 0, axis=1)
        cols_to_keep = np.any(img != 0, axis=0)

        img = img[rows_to_keep][:, cols_to_keep]
        mask_mass = mask_mass[rows_to_keep][:, cols_to_keep]

        return img, mask_mass

    def find_cropped_image_position(self, full_image, cropped_image): 
        if cropped_image.shape[0] > full_image.shape[0] or cropped_image.shape[1] > full_image.shape[1]:
            raise ValueError("Cropped image is larger than the full image in one or both dimensions.")
        res = cv2.matchTemplate(full_image.astype(np.uint8), cropped_image.astype(np.uint8), cv2.TM_CCOEFF_NORMED)
        min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res)
        top_left = max_loc
        bottom_right = (top_left[0] + cropped_image.shape[1], top_left[1] + cropped_image.shape[0])
        return (*top_left, *bottom_right)

    def save_data(self, img, mask, bboxes, output_dir, img_name):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # 保存原图为 JPG 格式
        jpg_path = os.path.join(output_dir, f'img.jpg')
        cv2.imwrite(jpg_path, img)

        # 保存mask为 PNG 格式
        png_path = os.path.join(output_dir, f'mask.png')
        cv2.imwrite(png_path, mask * 255)

        # 保存边界框数据为 NumPy 文件
        npy_path = os.path.join(output_dir, f'bboxes.npy')
        np.save(npy_path, np.array(bboxes))

        print(f'Mask, image, and bounding boxes saved for {img_name} in {output_dir}')

    def save_mask_and_image(self, dcm_path, output_base_path, img_name):
        dcm = pdcm.dcmread(dcm_path)
        img = dcm.pixel_array
        # 预处理图像和掩码
        processed_img, self.mask_mass = self.preprocess(img, self.mask_mass)
        # 裁剪出原始边界框对应的部分
        preprocessed_bboxes = []
        for bbox in self.bboxes:
            x_min, y_min, x_max, y_max = bbox
            cropped_img = img[int(y_min):int(y_max), int(x_min):int(x_max)]
            # 在预处理后的图像中匹配裁剪图像部分
            new_bbox = self.find_cropped_image_position(processed_img, cropped_img)
            preprocessed_bboxes.append(new_bbox)

        img = cv2.normalize(processed_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
        if np.any(self.mask_mass == 1):
            output_dir = os.path.join(output_base_path, img_name)
            self.save_data(img, self.mask_mass, preprocessed_bboxes, output_dir, img_name)

# 路径设置
XML_PATH = "/Volumes/图图/INbreast/INbreast/AllXML/"
CSV_PATH = "/Volumes/图图/INBreast/INbreast/BoundingBoxes_Mass_Classes_2.csv"
DCM_PATH = "/Volumes/图图/INbreast/INbreast/AllDICOMs/"
OUTPUT_BASE_PATH = "/Volumes/图图/INbreast/INbreast/INBreast/"

# 获取所有 DICOM 文件列表
dicom_files = [f for f in os.listdir(DCM_PATH) if f.endswith('.dcm') and not f.startswith('._')]

# 按8:2分割训练集和测试集
train_files, test_files = train_test_split(dicom_files, test_size=0.2, random_state=42)

# 处理并保存训练集
for filename in train_files:
    img_name = filename.split('.')[0]
    dcm_path = os.path.join(DCM_PATH, filename)
    dcm = pdcm.dcmread(dcm_path)
    img = dcm.pixel_array
    annotation = Annotation(XML_PATH, CSV_PATH, img_name, img.shape)

    # 保存 mask 和原图
    annotation.save_mask_and_image(dcm_path, os.path.join(OUTPUT_BASE_PATH, 'train'), img_name)

# 处理并保存测试集
for filename in test_files:
    img_name = filename.split('.')[0]
    dcm_path = os.path.join(DCM_PATH, filename)
    dcm = pdcm.dcmread(dcm_path)
    img = dcm.pixel_array
    annotation = Annotation(XML_PATH, CSV_PATH, img_name, img.shape)

    # 保存 mask 和原图
    annotation.save_mask_and_image(dcm_path, os.path.join(OUTPUT_BASE_PATH, 'test'), img_name)

print("Dataset split and saved successfully.")


Mask, image, and bounding boxes saved for 24055483 in /Volumes/图图/INbreast/INbreast/INBreast/train/24055483
Mask, image, and bounding boxes saved for 24065289 in /Volumes/图图/INbreast/INbreast/INBreast/train/24065289
Mask, image, and bounding boxes saved for 51049107 in /Volumes/图图/INbreast/INbreast/INBreast/train/51049107
out of bound: /Volumes/图图/INbreast/INbreast/AllXML/22670620.xml
Mask, image, and bounding boxes saved for 22670620 in /Volumes/图图/INbreast/INbreast/INBreast/train/22670620
Mask, image, and bounding boxes saved for 24065251 in /Volumes/图图/INbreast/INbreast/INBreast/train/24065251
Mask, image, and bounding boxes saved for 20587902 in /Volumes/图图/INbreast/INbreast/INBreast/train/20587902
Mask, image, and bounding boxes saved for 22614568 in /Volumes/图图/INbreast/INbreast/INBreast/train/22614568
Mask, image, and bounding boxes saved for 20588046 in /Volumes/图图/INbreast/INbreast/INBreast/train/20588046
Mask, image, and bounding boxes saved for 24065530 in /Volumes/图图/INbrea

cropped-seg

In [3]:
import os
import numpy as np
from PIL import Image
import shutil

def crop_and_save(image, mask, bbox, output_dir):
    x1, y1, x2, y2 = bbox
    # Crop the image and mask
    cropped_image = image.crop((x1, y1, x2, y2))
    cropped_mask = mask.crop((x1, y1, x2, y2))

    # Save the cropped image and mask
    cropped_image.save(os.path.join(output_dir, 'img.jpg'))
    cropped_mask.save(os.path.join(output_dir, 'mask.png'))

def process_directory(input_dir, output_base_dir):
    # Walk through each folder in the directory
    for root, dirs, files in os.walk(input_dir):
        if 'bboxes.npy' in files:
            print('here')
            # Load bbox coordinates
            
            bbox_file = os.path.join(root, 'bboxes.npy')
            # print(bbox_file)
            bboxes = np.load(bbox_file)
            # print(bboxes)
            # Load the image and mask
            image_file = os.path.join(root, 'img.jpg')
            mask_file = os.path.join(root, 'mask.png')
            image = Image.open(image_file)
            mask = Image.open(mask_file)
            
            # Create corresponding output directory
            relative_path = os.path.relpath(root, input_dir)
            output_dir = os.path.join(output_base_dir, relative_path)
            # os.makedirs(output_dir, exist_ok=True)
            
            # Process each bbox
            for i, bbox in enumerate(bboxes):
                # Create subdirectory for each cropped image
                cropped_output_dir = output_dir+f'_{i}'
                os.makedirs(cropped_output_dir, exist_ok=True)
                
                # Crop and save
                crop_and_save(image, mask, bbox, cropped_output_dir)
                

def main():
    base_dir = '/Volumes/图图/INBreast/INbreast/seg&det'  # Modify with actual base directory
    output_dir = '/Volumes/图图/INBreast/INbreast/cropped_seg'  # Modify with desired output directory

    # Ensure the output directory is clean
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    
    # Process Train and Test directories
    # data_dir = os.path.join(base_dir, 'Train')
    # test_dir = os.path.join(base_dir, 'Test')
    
    process_directory(base_dir, output_dir)
    # process_directory(test_dir, output_dir+'/Test')

if __name__ == '__main__':
    main()


here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


Classification

In [3]:
import os
import pandas as pd
import numpy as np
import pydicom as pdcm
import cv2
import json
from sklearn.model_selection import train_test_split

def np_CountUpContinuingOnes(b_arr):
    # indice continuing zeros from left side.
    # ex: [0,1,1,0,1,0,0,1,1,1,0] -> [0,0,0,3,3,5,6,6,6,6,10]
    left = np.arange(len(b_arr))
    left[b_arr > 0] = 0
    left = np.maximum.accumulate(left)

    # from right side.
    # ex: [0,1,1,0,1,0,0,1,1,1,0] -> [0,3,3,3,5,5,6,10,10,10,10]
    rev_arr = b_arr[::-1]
    right = np.arange(len(rev_arr))
    right[rev_arr > 0] = 0
    right = np.maximum.accumulate(right)
    right = len(rev_arr) - 1 - right[::-1]

    return right - left - 1


def ExtractBreast(img):
    img_copy = img.copy()
    img = np.where(img <= 20, 0, img)  # To detect backgrounds easily
    height, _ = img.shape

    # whether each col is non-constant or not
    y_a = height // 2 + int(height * 0.4)
    y_b = height // 2 - int(height * 0.4)
    b_arr = img[y_b:y_a].std(axis=0) != 0
    continuing_ones = np_CountUpContinuingOnes(b_arr)
    # longest should be the breast
    col_ind = np.where(continuing_ones == continuing_ones.max())[0]
    img = img[:, col_ind]

    # whether each row is non-constant or not
    _, width = img.shape
    x_a = width // 2 + int(width * 0.4)
    x_b = width // 2 - int(width * 0.4)
    b_arr = img[:, x_b:x_a].std(axis=1) != 0
    continuing_ones = np_CountUpContinuingOnes(b_arr)
    # longest should be the breast
    row_ind = np.where(continuing_ones == continuing_ones.max())[0]

    return img_copy[row_ind][:, col_ind]

# 读取XLS文件
xls_path = '/Volumes/图图/INBreast/INbreast/INbreast.xls'
df = pd.read_excel(xls_path)

# 排序并按8:2比例分成训练集和测试集
df_sorted = df.sort_values(by='File Name')
train_df, test_df = train_test_split(df_sorted, test_size=0.2, random_state=42)

# 定义输入和输出路径
DCM_PATH = "/Volumes/图图/INBreast/INbreast/AllDICOMs"
OUTPUT_BASE_PATH = "/Volumes/图图/INBreast/INbreast/classification"

def process_and_save(df, set_type):
    for index, row in df.iterrows():
        file_name = str(row['File Name']).split('.')[0]
        meta_data = {
            'Laterality': str(row['Laterality']).replace(' ',''),
            'View': str(row['View']).replace(' ',''),
            'ACR': str(row['ACR']).replace(' ',''),
            'Bi-Rads': str(row['Bi-Rads']).replace(' ','')
        }

        # 读取DICOM文件
        dcm_path = os.path.join(DCM_PATH, file_name + '.dcm')
        if os.path.exists(dcm_path):
            dcm = pdcm.dcmread(dcm_path)
            img = dcm.pixel_array
            img = ExtractBreast(img)
            img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
            # 保存图像为JPG
            img_output_path = os.path.join(OUTPUT_BASE_PATH, set_type, file_name)
            if not os.path.exists(img_output_path):
                os.makedirs(img_output_path)
            jpg_path = os.path.join(img_output_path, 'img.jpg')
            cv2.imwrite(jpg_path, img)

            # 保存元数据为JSON
            npy_path = os.path.join(img_output_path, 'info_dict.npy')
            np.save(npy_path, meta_data)

            print(f"Processed {file_name} for {set_type} set")
        else:
            print(f"DICOM file for {file_name} not found.")

# 处理并保存训练集和测试集
process_and_save(train_df, 'Train')
process_and_save(test_df, 'Test')


Processed 20587080 for Train set
Processed 22427705 for Train set
Processed 53587454 for Train set
Processed 53586869 for Train set
Processed 22670809 for Train set
Processed 53582737 for Train set
Processed 50994706 for Train set
Processed 50998059 for Train set
Processed 20586986 for Train set
Processed 20587612 for Train set
Processed 22670673 for Train set
Processed 22580192 for Train set
Processed 51048738 for Train set
Processed 53587104 for Train set
Processed 22580270 for Train set
Processed 53582764 for Train set
Processed 26933830 for Train set
Processed 22670442 for Train set
Processed 50998440 for Train set
Processed 50993841 for Train set
Processed 22670147 for Train set
Processed 50997651 for Train set
Processed 53587508 for Train set
Processed 50994589 for Train set
Processed 50994327 for Train set
Processed 22614568 for Train set
Processed 50994868 for Train set
Processed 20588020 for Train set
Processed 22678694 for Train set
Processed 53586361 for Train set
Processed 