In [2]:
import os
import cv2
import math
import numpy as np
import pydicom as pdcm
from skimage.draw import polygon
import xml.etree.ElementTree as ET
from matplotlib import pyplot as plt
import matplotlib.patches as patches

Segmentation & Detection (Seg&Det)

In [34]:
import os
import pandas as pd
import numpy as np
from skimage.draw import polygon
import cv2
import pydicom as pdcm
import math
import xml.etree.ElementTree as ET

class Annotation:
    """Mass mask and bounding boxes for a single image.

    On construction the instance:

    * allocates an all-zero ``uint8`` mask with the first two dimensions of
      ``shape``;
    * if ``<xml_path>/<filename>.xml`` exists, rasterises every ROI whose
      type is ``"Mass"`` into that mask (value 1);
    * if ``csv_path`` exists, loads the bounding boxes whose ``File Name``
      column matches ``filename`` as ``[x1, y1, x2, y2]`` lists.

    Args:
        xml_path: Directory that holds the XML annotation files.
        csv_path: Path to the CSV file with bounding boxes
            (columns ``File Name``, ``X``, ``Y``, ``W``, ``H``).
        filename: Image identifier without extension.
        shape: Shape of the image; only ``shape[0]`` and ``shape[1]`` are used.
    """

    def __init__(self, xml_path, csv_path, filename, shape):
        # os.path.join works whether or not xml_path ends with a separator;
        # plain concatenation silently produced a wrong path without one.
        self.xml_path = os.path.join(xml_path, filename + '.xml')
        self.csv_path = csv_path
        self.filename = filename
        self.shape = shape
        self.mask_mass = self.create_mask_array(shape)
        self.fill_mask()
        self.bboxes = self.load_bboxes_from_csv()

    def fill_mask(self):
        """Paint every "Mass" ROI of the XML file into ``self.mask_mass``.

        Does nothing when the XML file does not exist. Polygon pixels that
        fall outside the mask are dropped and reported with an
        ``out of bound`` message.
        """
        if not os.path.exists(self.xml_path):
            return

        rois, _ = self.parse_XML(self.xml_path)
        height, width = self.mask_mass.shape

        for roi in rois:
            roi_info = self.get_roi_info(roi)
            if roi_info["roi_type"] != "Mass":
                continue

            r_poly, c_poly = self.create_polygon_lists(self.mask_mass, roi_info["points"])
            rr, cc = polygon(r_poly, c_poly)

            # Filter explicitly instead of waiting for an IndexError:
            # negative indices do not raise in NumPy, they wrap around and
            # would paint pixels on the opposite edge of the mask.
            inside = (rr >= 0) & (rr < height) & (cc >= 0) & (cc < width)
            if not inside.all():
                rr, cc = rr[inside], cc[inside]
                print('out of bound:', self.xml_path)

            self.mask_mass[rr, cc] = 1  # Mass mask

    def parse_XML(self, xml_path):
        """Return ``(rois, num_of_rois)`` read from the XML file.

        The elements are addressed purely by position in the tree, so the
        file must follow the exact layout this code was written for.
        """
        tree = ET.parse(xml_path)
        root = tree.getroot()  # The root of the XML file
        data = root[0][1]  # The essential info
        rois = data[0][5]  # Array containing the ROI objects
        num_of_rois = int(data[0][3].text)  # Number of ROI objects
        return rois, num_of_rois

    def create_mask_array(self, img_shape):
        """Return an all-zero ``uint8`` array of shape ``img_shape[:2]``."""
        return np.zeros((img_shape[0], img_shape[1]), dtype=np.uint8)

    def get_roi_info(self, roi):
        """Extract the fields of one ROI element (addressed by position)."""
        roi_info = {
            "points": roi[21],  # Array containing the points of a ROI
            "num_of_points": int(roi[17].text),  # Number of points of the area
            "roi_index": int(roi[7].text),  # Identifier of the ROI
            "roi_type": roi[15].text  # (Mass, Calcification, other)
        }
        return roi_info

    def create_polygon_lists(self, mask, points):
        """Convert ROI points into row/column vertex arrays.

        Each point's ``text`` looks like ``"(a, b)"``. The surrounding
        characters are stripped, both numbers are truncated toward zero, and
        ``b`` is used as the row coordinate while ``a`` is used as the
        column coordinate.

        Args:
            mask: Unused; kept for interface compatibility.
            points: Iterable of objects with a ``text`` attribute.

        Returns:
            ``(r_poly, c_poly)`` as float64 arrays (empty when there are no
            points).
        """
        rows = []
        cols = []
        # Collect in lists and convert once; np.append in a loop copies the
        # whole array on every iteration.
        for point in points:
            coords = point.text[1:-1].split(",")
            cols.append(math.trunc(float(coords[0])))
            rows.append(math.trunc(float(coords[1])))

        return np.array(rows, dtype=float), np.array(cols, dtype=float)

    def load_bboxes_from_csv(self):
        """Return the bounding boxes of this image as ``[x1, y1, x2, y2]`` lists.

        Returns an empty list when the CSV file does not exist or contains
        no row for ``self.filename``.
        """
        bboxes = []
        if not os.path.exists(self.csv_path):
            return bboxes

        df = pd.read_csv(self.csv_path)
        try:
            matches = df['File Name'] == np.int64(self.filename)
        except ValueError:
            # The file name is not a plain integer: compare as text instead
            # of crashing.
            matches = df['File Name'].astype(str) == str(self.filename)

        for _, row in df[matches].iterrows():
            x1 = row['X']
            y1 = row['Y']
            x2 = x1 + row['W']
            y2 = y1 + row['H']
            bboxes.append([x1, y1, x2, y2])
        return bboxes

    def preprocess(self, img, mask_mass):
        """Drop the rows and columns of ``img`` that are entirely zero.

        The same rows and columns are removed from ``mask_mass`` so image
        and mask stay aligned.

        Returns:
            ``(img, mask_mass)`` after cropping.
        """
        rows_to_keep = np.any(img != 0, axis=1)
        cols_to_keep = np.any(img != 0, axis=0)

        img = img[rows_to_keep][:, cols_to_keep]
        mask_mass = mask_mass[rows_to_keep][:, cols_to_keep]

        return img, mask_mass

    def find_cropped_image_position(self, full_image, cropped_image):
        """Locate ``cropped_image`` inside ``full_image`` by template matching.

        Returns:
            ``(x1, y1, x2, y2)`` of the best match (top-left and
            bottom-right corners).

        Raises:
            ValueError: If the crop is larger than the full image in either
                dimension.
        """
        if cropped_image.shape[0] > full_image.shape[0] or cropped_image.shape[1] > full_image.shape[1]:
            raise ValueError("Cropped image is larger than the full image in one or both dimensions.")
        # NOTE(review): both inputs are cast to uint8 without rescaling, so
        # integer values above 255 wrap around before matching - confirm
        # this is acceptable for the pixel depth in use.
        res = cv2.matchTemplate(full_image.astype(np.uint8), cropped_image.astype(np.uint8), cv2.TM_CCOEFF_NORMED)
        _, _, _, max_loc = cv2.minMaxLoc(res)
        top_left = max_loc
        bottom_right = (top_left[0] + cropped_image.shape[1], top_left[1] + cropped_image.shape[0])
        return (*top_left, *bottom_right)

    def save_data(self, img, mask, bboxes, output_dir, img_name):
        """Write ``img.jpg``, ``mask.png`` and ``bboxes.npy`` into ``output_dir``.

        The directory is created when missing. The mask is multiplied by
        255 before being written.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Save the image in JPG format.
        jpg_path = os.path.join(output_dir, 'img.jpg')
        cv2.imwrite(jpg_path, img)

        # Save the mask in PNG format.
        png_path = os.path.join(output_dir, 'mask.png')
        cv2.imwrite(png_path, mask * 255)

        # Save the bounding boxes as a NumPy file.
        npy_path = os.path.join(output_dir, 'bboxes.npy')
        np.save(npy_path, np.array(bboxes))

        print(f'Mask, image, and bounding boxes saved for {img_name} in {output_dir}')

    def save_mask_and_image(self, dcm_path, output_base_path, img_name):
        """Crop, normalise and save the image, its mask and its bounding boxes.

        The DICOM pixel data is cropped with ``preprocess``; each bounding
        box is re-located in the cropped image by template matching; the
        image is min-max normalised to 0..255 ``uint8``. Data is written to
        ``<output_base_path>/<img_name>`` only when the mask contains at
        least one mass pixel.

        ``self.mask_mass`` is replaced by its cropped version, so a second
        call on the same instance may fail because the mask no longer
        matches the image shape.
        """
        dcm = pdcm.dcmread(dcm_path)
        img = dcm.pixel_array
        # Crop the image and the mask together.
        processed_img, self.mask_mass = self.preprocess(img, self.mask_mass)
        # Cut each original bounding box out of the uncropped image.
        preprocessed_bboxes = []
        for bbox in self.bboxes:
            x_min, y_min, x_max, y_max = bbox
            cropped_img = img[int(y_min):int(y_max), int(x_min):int(x_max)]
            # Find where that patch sits in the preprocessed image.
            new_bbox = self.find_cropped_image_position(processed_img, cropped_img)
            preprocessed_bboxes.append(new_bbox)

        img = cv2.normalize(processed_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
        if np.any(self.mask_mass == 1):
            output_dir = os.path.join(output_base_path, img_name)
            self.save_data(img, self.mask_mass, preprocessed_bboxes, output_dir, img_name)

# Walk every .dcm file in DCM_PATH and export its image, mass mask and
# bounding boxes.
XML_PATH = "/Volumes/图图/INbreast/INbreast/AllXML/"
CSV_PATH = "/Volumes/图图/INBreast/INbreast/BoundingBoxes_Mass_Classes_2.csv"
DCM_PATH = "/Volumes/图图/INbreast/INbreast/AllDICOMs/"
OUTPUT_BASE_PATH = "/Volumes/图图/INbreast/INbreast/INBreast/"

for filename in os.listdir(DCM_PATH):
    # Names starting with "._" are skipped along with anything that does
    # not end in ".dcm".
    if filename.startswith('._') or not filename.endswith('.dcm'):
        print(f"Skipping non-DICOM file: {filename}")
        continue

    img_name = filename.split('.')[0]
    dcm_path = os.path.join(DCM_PATH, filename)

    # The pixel array is read here only to obtain the image shape for the mask.
    dcm = pdcm.dcmread(dcm_path)
    img = dcm.pixel_array
    annotation = Annotation(XML_PATH, CSV_PATH, img_name, img.shape)

    # Save the mask and the image.
    annotation.save_mask_and_image(dcm_path, OUTPUT_BASE_PATH, img_name)


Skipping non-DICOM file: ._20586960.dcm
Mask, image, and bounding boxes saved for 20586960 in /Volumes/图图/INbreast/INbreast/INBreast/20586960
Mask, image, and bounding boxes saved for 20586986 in /Volumes/图图/INbreast/INbreast/INBreast/20586986
Skipping non-DICOM file: ._20586986.dcm
Skipping non-DICOM file: ._20587054.dcm
Skipping non-DICOM file: ._20587080.dcm
Skipping non-DICOM file: ._20587148.dcm
Skipping non-DICOM file: ._20587174.dcm
Skipping non-DICOM file: ._20587200.dcm
Skipping non-DICOM file: ._20587226.dcm
Skipping non-DICOM file: ._20587294.dcm
Skipping non-DICOM file: ._20587320.dcm
Skipping non-DICOM file: ._20587346.dcm
Skipping non-DICOM file: ._20587372.dcm
Skipping non-DICOM file: ._20587466.dcm
Skipping non-DICOM file: ._20587492.dcm
Skipping non-DICOM file: ._20587518.dcm
Skipping non-DICOM file: ._20587544.dcm
Mask, image, and bounding boxes saved for 20587612 in /Volumes/图图/INbreast/INbreast/INBreast/20587612
Skipping non-DICOM file: ._20587612.dcm
Skipping non-D

Classification

In [1]:
import os
import pandas as pd
import numpy as np
import pydicom as pdcm
import cv2
import json
from sklearn.model_selection import train_test_split

def preprocess(img):
    """Return ``img`` without the rows and columns that are entirely zero."""
    nonzero = img != 0

    # Both masks are computed on the original image before any cropping.
    has_content_row = nonzero.any(axis=1)
    has_content_col = nonzero.any(axis=0)

    without_empty_rows = img[has_content_row]
    return without_empty_rows[:, has_content_col]

# Load the XLS metadata sheet.
xls_path = '/Volumes/图图/INBreast/INbreast/INbreast.xls'
df = pd.read_excel(xls_path)

# Order the rows by file name, then split them 80/20 into train and test
# sets with a fixed seed.
df_sorted = df.sort_values('File Name')
train_df, test_df = train_test_split(
    df_sorted,
    random_state=42,
    test_size=0.2,
)

# Input and output locations.
DCM_PATH = "/Volumes/图图/INBreast/INbreast/AllDICOMs"
OUTPUT_BASE_PATH = "/Volumes/图图/INBreast/INbreast/classification"

def process_and_save(df, set_type):
    """Export every image listed in ``df`` together with its metadata.

    For each row the DICOM file ``<DCM_PATH>/<File Name>.dcm`` is read,
    cropped with ``preprocess``, min-max normalised to 0..255 ``uint8`` and
    written to ``<OUTPUT_BASE_PATH>/<set_type>/<File Name>/img.jpg``. The
    row's ``Laterality``, ``View``, ``ACR`` and ``Bi-Rads`` values (as
    strings with spaces removed) are saved next to it as ``info_dict.npy``.
    Rows whose DICOM file is missing are reported and skipped.

    Args:
        df: DataFrame with the columns ``File Name``, ``Laterality``,
            ``View``, ``ACR`` and ``Bi-Rads``.
        set_type: Name of the output sub-directory (e.g. ``'Train'``).
    """
    meta_columns = ('Laterality', 'View', 'ACR', 'Bi-Rads')

    for _, row in df.iterrows():
        file_name = str(row['File Name']).split('.')[0]
        # Fixed: the method is str.replace; the misspelled ``replcace``
        # raised AttributeError on the first row.
        meta_data = {
            column: str(row[column]).replace(' ', '')
            for column in meta_columns
        }

        # Read the DICOM file.
        dcm_path = os.path.join(DCM_PATH, file_name + '.dcm')
        if not os.path.exists(dcm_path):
            print(f"DICOM file for {file_name} not found.")
            continue

        dcm = pdcm.dcmread(dcm_path)
        img = dcm.pixel_array
        img = preprocess(img)
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        # Save the image as JPG.
        img_output_path = os.path.join(OUTPUT_BASE_PATH, set_type, file_name)
        os.makedirs(img_output_path, exist_ok=True)
        jpg_path = os.path.join(img_output_path, 'img.jpg')
        cv2.imwrite(jpg_path, img)

        # Save the metadata dict as a NumPy file. np.save pickles the dict,
        # so it has to be read back with np.load(..., allow_pickle=True).
        npy_path = os.path.join(img_output_path, 'info_dict.npy')
        np.save(npy_path, meta_data)

        print(f"Processed {file_name} for {set_type} set")

# Process and save the training set first, then the test set.
for subset, set_name in ((train_df, 'Train'), (test_df, 'Test')):
    process_and_save(subset, set_name)


Processed 20587080 for Train set
Processed 22427705 for Train set
Processed 53587454 for Train set
Processed 53586869 for Train set
Processed 22670809 for Train set
Processed 53582737 for Train set
Processed 50994706 for Train set
Processed 50998059 for Train set
Processed 20586986 for Train set
Processed 20587612 for Train set
Processed 22670673 for Train set
Processed 22580192 for Train set
Processed 51048738 for Train set
Processed 53587104 for Train set
Processed 22580270 for Train set
Processed 53582764 for Train set
Processed 26933830 for Train set
Processed 22670442 for Train set
Processed 50998440 for Train set
Processed 50993841 for Train set
Processed 22670147 for Train set
Processed 50997651 for Train set
Processed 53587508 for Train set
Processed 50994589 for Train set
Processed 50994327 for Train set
Processed 22614568 for Train set
Processed 50994868 for Train set
Processed 20588020 for Train set
Processed 22678694 for Train set
Processed 53586361 for Train set
Processed 