DMID-breast

In [None]:
import os
import numpy as np
import pandas as pd
import pydicom as pdcm
import cv2

def np_CountUpContinuingOnes(b_arr):
    """Return, for every position, the length of the run of ones it sits in.

    Args:
        b_arr: 1-D boolean (or 0/1) sequence. Entries greater than zero are
            treated as "ones", everything else as "zeros".

    Returns:
        Integer ndarray with the same length as ``b_arr``. A position inside
        a run of ones holds the full length of that run; a position holding
        a zero is -1.
    """
    mask = np.asarray(b_arr) > 0
    n = len(mask)
    idx = np.arange(n)
    # Index of the closest zero at or before each position. Using -1 as a
    # virtual zero in front of the array makes a run that starts at index 0
    # count in full instead of being one short.
    prev_zero = np.maximum.accumulate(np.where(mask, -1, idx))
    # Index of the closest zero at or after each position, with n acting as
    # the virtual zero behind the array for runs that reach the last index.
    next_zero = np.minimum.accumulate(np.where(mask, n, idx)[::-1])[::-1]
    # Number of elements strictly between the two surrounding zeros.
    return next_zero - prev_zero - 1

def ExtractBreast(img):
    """Crop a 2-D image to the rows and columns that carry signal.

    Pixels with value <= 40 are zeroed first. Columns are then selected by
    looking at the central band of rows (40% of the height on either side of
    the middle): a column counts as occupied when its standard deviation over
    that band is non-zero, and the columns for which
    ``np_CountUpContinuingOnes`` reports its maximum value are kept. The same
    procedure is repeated for the rows, using the central band of the columns
    that were kept.

    Args:
        img: 2-D ndarray.

    Returns:
        The selected rows and columns taken from the untouched input pixels.
    """
    original = img.copy()
    thresholded = np.where(img <= 40, 0, img)

    # Column selection, judged on the central band of rows.
    n_rows, _ = thresholded.shape
    row_mid = n_rows // 2
    row_half = int(n_rows * 0.4)
    row_band = thresholded[row_mid - row_half:row_mid + row_half]
    col_runs = np_CountUpContinuingOnes(row_band.std(axis=0) != 0)
    col_ind = np.where(col_runs == col_runs.max())[0]

    # Row selection, judged on the central band of the kept columns.
    narrowed = thresholded[:, col_ind]
    _, n_cols = narrowed.shape
    col_mid = n_cols // 2
    col_half = int(n_cols * 0.4)
    col_band = narrowed[:, col_mid - col_half:col_mid + col_half]
    row_runs = np_CountUpContinuingOnes(col_band.std(axis=1) != 0)
    row_ind = np.where(row_runs == row_runs.max())[0]

    return original[row_ind][:, col_ind]

# Location of the DMID metadata spreadsheet (machine-specific absolute path).
xlsx_path = '/Volumes/图图/DMID-kaggle/archive/Metadata.xlsx'
# merge_duplicates() reads the columns 'ID', 'abnormality', 'pathology',
# 'view' and 'background tissue' from this frame.
df = pd.read_excel(xlsx_path)

def merge_duplicates(df):
    """Collapse metadata rows that share an image ID into one record per ID.

    Spaces are removed from every field before use. A row whose abnormality
    is 'NORM' gets pathology 'N'; any other row whose pathology is 'N' is
    relabelled 'B'. Abnormalities joined with '+' are split into a list.

    When an ID occurs more than once:
      * the abnormality lists are concatenated and de-duplicated, keeping
        the order of first appearance (so the result is reproducible);
      * pathology becomes 'M' as soon as any row is 'M';
      * view and side are taken from the latest row;
      * background tissue is taken from the latest row that is not '-'.

    Args:
        df: DataFrame with the columns 'ID', 'abnormality', 'pathology',
            'view' and 'background tissue'. 'ID' must hold strings.

    Returns:
        dict mapping each stripped ID to a dict with the keys 'view',
        'left_or_right', 'background_tissue', 'abnormality' (list of str)
        and 'pathology'.
    """
    merged_data = {}
    for _, row in df.iterrows():
        file_id = row['ID'].strip()
        abnormality = str(row['abnormality']).replace(' ', '')
        pathology = str(row['pathology']).replace(' ', '')
        view_dir = str(row['view']).replace(' ', '')
        left_or_right = 'Left' if view_dir.endswith('LT') else 'Right'
        background_tissue = str(row['background tissue']).replace(' ', '')

        if abnormality == 'NORM':
            pathology = 'N'
        elif pathology == 'N':
            pathology = 'B'
        # str.split returns a one-element list when there is no '+'.
        abnormalities = abnormality.split('+')

        record = merged_data.get(file_id)
        if record is None:
            merged_data[file_id] = {
                'view': view_dir,
                'left_or_right': left_or_right,
                'background_tissue': background_tissue,
                'abnormality': abnormalities,
                'pathology': pathology,
            }
            continue

        record['abnormality'].extend(abnormalities)
        if pathology == 'M':
            record['pathology'] = 'M'
        record['view'] = view_dir
        record['left_or_right'] = left_or_right
        if background_tissue != '-':
            record['background_tissue'] = background_tissue

    for data in merged_data.values():
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the order depend on string hashing and vary per run.
        data['abnormality'] = list(dict.fromkeys(data['abnormality']))

    return merged_data


# Collapse the per-finding metadata rows into one record per image ID.
merged_data = merge_duplicates(df)
# Two-column frame: 'ID' holds the image ID, 'data' the merged record dict.
# NOTE(review): despite the name, no sorting is applied here.
df_sorted = pd.DataFrame(list(merged_data.items()), columns=['ID', 'data'])


# Directory that process_and_save() searches for "<ID>.dcm" files.
DCM_PATH = "/Volumes/图图/DMID-kaggle/archive/DICOM Images/DICOM Images"
# Root directory under which one sub-folder per image ID is created.
OUTPUT_BASE_PATH = "Benchmark/DMID-breast"

def process_and_save(df):
    """Export one cropped 8-bit JPEG per row of ``df``.

    For every row the DICOM file ``<DCM_PATH>/<ID>.dcm`` is read, converted
    to a single channel when it has three, cropped with ``ExtractBreast``,
    min-max scaled to 0..255 and written to
    ``<OUTPUT_BASE_PATH>/<ID>/img.jpg``. Rows without a DICOM file are
    reported and skipped.

    Args:
        df: DataFrame with a string column 'ID'.
    """
    for _, record in df.iterrows():
        file_id = record['ID'].strip()
        dcm_file = os.path.join(DCM_PATH, f"{file_id}.dcm")

        if not os.path.exists(dcm_file):
            print(f"DICOM file for {file_id} not found.")
            continue

        pixels = pdcm.dcmread(dcm_file).pixel_array
        has_three_channels = pixels.ndim == 3 and pixels.shape[2] == 3
        if has_three_channels:
            # NOTE(review): treats the channels as BGR - confirm against the
            # photometric interpretation of the source files.
            pixels = cv2.cvtColor(pixels, cv2.COLOR_BGR2GRAY)

        breast = ExtractBreast(pixels)
        breast_8bit = cv2.normalize(breast, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        output_dir = os.path.join(OUTPUT_BASE_PATH, file_id)
        os.makedirs(output_dir, exist_ok=True)
        cv2.imwrite(os.path.join(output_dir, 'img.jpg'), breast_8bit)
        print(f"Processed {file_id}")


# Run the export for every merged image record.
process_and_save(df_sorted)

cropped classification

In [None]:
import os
import numpy as np
import pandas as pd
import pydicom as pdcm
import cv2

def crop_and_save(img, x, y, radius, output_path):
    """Cut a square patch around (x, y) and save it as an 8-bit image.

    The patch spans ``radius`` pixels on each side of the centre and is
    clipped to the image borders, min-max scaled to 0..255 and written to
    ``output_path`` (parent directories are created as needed).

    Args:
        img: 2-D (or H x W x C) ndarray to crop from.
        x, y: Centre of the patch in pixel coordinates; anything ``int()``
            accepts.
        radius: Half side length of the patch in pixels.
        output_path: Destination file path.

    Returns:
        True when the file was written, False when the inputs are not
        usable (e.g. NaN coordinates), the patch is empty, or writing
        failed.
    """
    try:
        x = int(x)
        y = int(y)
        radius = int(radius)
        x1 = max(x - radius, 0)
        y1 = max(y - radius, 0)
        x2 = min(x + radius, img.shape[1])
        y2 = min(y + radius, img.shape[0])

        # A centre outside the image can produce x2 < 0 or y2 < 0, which a
        # slice would interpret as "count from the end"; reject it instead.
        if x2 <= x1 or y2 <= y1:
            return False

        cropped_img = img[y1:y2, x1:x2]
        cropped_img = cv2.normalize(cropped_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        parent_dir = os.path.dirname(output_path)
        if parent_dir:  # os.makedirs('') raises for bare file names
            os.makedirs(parent_dir, exist_ok=True)
        # imwrite signals some failures through its return value only.
        return bool(cv2.imwrite(output_path, cropped_img))
    except Exception as exc:
        # Best effort by design: report the problem and let the caller go on.
        print(f"Failed to save crop {output_path}: {exc}")
        return False

# Location of the DMID metadata spreadsheet (machine-specific absolute path).
xlsx_path = '/Volumes/图图/DMID-kaggle/archive/Metadata.xlsx'
df = pd.read_excel(xlsx_path)
# Drop the rows that describe normal images. Series.replace(' ', '') only
# swaps cells that are exactly ' ', so the .str accessor is needed to remove
# embedded spaces (e.g. 'NORM ') before comparing.
df = df[df['abnormality'].astype(str).str.replace(' ', '', regex=False) != 'NORM']


# Directory that process_and_save() searches for "<ID>.dcm" files.
DCM_PATH = "/Volumes/图图/DMID-kaggle/archive/DICOM Images/DICOM Images"
# Root directory under which one "<ID>_<n>" sub-folder per finding is created.
OUTPUT_BASE_PATH = "Benchmark/DMID-finding"

def process_and_save(df, global_counter):
    """Crop every finding listed in ``df`` out of its DICOM image.

    Each row yields ``<OUTPUT_BASE_PATH>/<ID>_<n>/img.jpg``, where ``n`` is
    the running number of rows seen for that ID. Rows whose DICOM file is
    missing, or whose crop could not be written, are reported and skipped.

    Args:
        df: DataFrame with the columns 'ID' (str), 'x', 'y' and 'radius'.
        global_counter: dict mapping ID -> number of rows seen so far.
            It is updated in place, so numbering continues across calls.
    """
    for _, row in df.iterrows():
        file_id = row['ID'].strip()
        x, y, radius = row['x'], row['y'], row['radius']

        # The counter advances for every row, even when nothing is written,
        # so a suffix always reflects the row's position among its ID's rows.
        global_counter[file_id] = global_counter.get(file_id, 0) + 1
        file_id_with_suffix = f"{file_id}_{global_counter[file_id]}"

        dcm_file = os.path.join(DCM_PATH, f"{file_id}.dcm")
        if not os.path.exists(dcm_file):
            print(f"DICOM file for {file_id} not found.")
            continue

        img = pdcm.dcmread(dcm_file).pixel_array
        if img.ndim == 3 and img.shape[2] == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        jpg_path = os.path.join(OUTPUT_BASE_PATH, file_id_with_suffix, 'img.jpg')
        # crop_and_save reports failure through its return value; surface it
        # here so skipped findings do not go unnoticed.
        if not crop_and_save(img, x, y, radius, jpg_path):
            print(f"Could not save crop {file_id_with_suffix} (x={x}, y={y}, radius={radius}).")


# Per-ID running count of metadata rows; process_and_save() updates it in
# place and uses it to number the crops of each image (<ID>_1, <ID>_2, ...).
global_counter = {}


# Crop every listed finding out of its DICOM image.
process_and_save(df, global_counter)

In [None]:
import os
import cv2
import numpy as np
import random

def find_cropped_image_position(full_image, cropped_image):
    """Locate ``cropped_image`` inside ``full_image`` by template matching.

    Uses ``cv2.matchTemplate`` with ``cv2.TM_CCOEFF_NORMED`` and takes the
    position with the highest score.

    Returns:
        ``(x1, y1, x2, y2)``: the best-matching top-left corner and that
        corner offset by the width and height of ``cropped_image``.
    """
    scores = cv2.matchTemplate(full_image, cropped_image, cv2.TM_CCOEFF_NORMED)
    # minMaxLoc returns (min_val, max_val, min_loc, max_loc); only the
    # location of the maximum is needed.
    best_x, best_y = cv2.minMaxLoc(scores)[3]
    patch_height = cropped_image.shape[0]
    patch_width = cropped_image.shape[1]
    return (best_x, best_y, best_x + patch_width, best_y + patch_height)

def random_crop_outside_bboxes(full_image, bboxes, crop_size, num_crops=1, max_attempts=1000):
    """Sample random crops of ``full_image`` that do not touch any bbox.

    Args:
        full_image: ndarray whose first two axes are (height, width).
        bboxes: iterable of ``(x1, y1, x2, y2)`` boxes to avoid; ``x2`` and
            ``y2`` are exclusive.
        crop_size: ``(crop_height, crop_width)`` of each crop.
        num_crops: number of crops requested.
        max_attempts: random positions tried per crop before giving up.

    Returns:
        List of crops (views into ``full_image``). It holds fewer than
        ``num_crops`` entries when no free position was found within
        ``max_attempts`` tries.

    Raises:
        ValueError: if the crop is larger than the image.
    """
    height, width = full_image.shape[:2]
    crop_h, crop_w = crop_size
    if crop_h > height or crop_w > width:
        raise ValueError(
            f"crop size {(crop_h, crop_w)} exceeds image size {(height, width)}"
        )

    def overlaps_any_bbox(x, y):
        # Compare the whole crop rectangle with each box. Checking only the
        # top-left corner would accept crops that start beside a box and
        # still extend into it.
        return any(
            x < x2 and x + crop_w > x1 and y < y2 and y + crop_h > y1
            for (x1, y1, x2, y2) in bboxes
        )

    cropped_images = []
    for _ in range(num_crops):
        for _attempt in range(max_attempts):
            x = random.randint(0, width - crop_w)
            y = random.randint(0, height - crop_h)
            if not overlaps_any_bbox(x, y):
                cropped_images.append(full_image[y:y + crop_h, x:x + crop_w])
                break
        else:
            # No free position found; the boxes may leave no room at all,
            # so stop rather than spin forever.
            break
    return cropped_images

def process_all_crops_for_folder(full_image_path, cropped_folders, output_dir):
    """Create one "normal" crop for every lesion crop of a full image.

    Every ``<folder>/img.jpg`` in ``cropped_folders`` is located inside
    ``<full_image_path>/img.jpg`` by template matching. For each of them a
    random crop of the same size is then drawn away from the located boxes
    and written to ``<output_dir>/<folder name>_normal/img.jpg``.

    Any error (unreadable image, failed match, oversized crop, ...) is
    printed and ends the processing of this full image.

    Args:
        full_image_path: directory containing the full image ``img.jpg``.
        cropped_folders: directories that each contain one crop ``img.jpg``.
        output_dir: directory receiving the ``*_normal`` sub-folders.
    """
    try:
        full_image_file = os.path.join(full_image_path, 'img.jpg')
        full_image = cv2.imread(full_image_file)
        if full_image is None:  # imread returns None instead of raising
            raise FileNotFoundError(f"cannot read {full_image_file}")

        # Locate every lesion crop first so that the random crops drawn
        # below can stay clear of all of them. Keeping each box next to its
        # folder avoids pairing a folder with another folder's box.
        located = []
        for folder in cropped_folders:
            cropped_image_path = os.path.join(folder, 'img.jpg')
            cropped_image = cv2.imread(cropped_image_path)
            if cropped_image is None:
                raise FileNotFoundError(f"cannot read {cropped_image_path}")
            bbox = find_cropped_image_position(full_image, cropped_image)
            located.append((folder, bbox))

        bboxes = [bbox for _, bbox in located]

        # Draw one same-sized random crop per lesion crop.
        for folder, (x1, y1, x2, y2) in located:
            crop_size = (y2 - y1, x2 - x1)
            normal_folder = os.path.join(output_dir, f"{os.path.basename(folder)}_normal")
            for crop in random_crop_outside_bboxes(full_image, bboxes, crop_size):
                os.makedirs(normal_folder, exist_ok=True)
                cv2.imwrite(os.path.join(normal_folder, 'img.jpg'), crop)
    except Exception as e:
        print(f"An error occurred: {e}")
        print(full_image_path+' OOps!')
        

# Folder whose sub-folders are the lesion crops; the text before the last
# '_' in each sub-folder name is used as the key of its full image.
root_dir = 'Benchmark/DMID-finding'
# Folder holding the full images, one sub-folder per key.
base_image_dir = 'Benchmark/DMID-breast'
# Destination of the generated "<crop folder>_normal" sub-folders. This is
# the same location as root_dir, so they end up next to the lesion crops.
output_dir = 'Benchmark/DMID-finding'

def find_full_image_path(base_dir, full_image_key):
    """Return ``base_dir/full_image_key`` if that path exists, else None."""
    candidate = os.path.join(base_dir, full_image_key)
    return candidate if os.path.exists(candidate) else None

# Map cropped images to their respective full images
full_image_to_crops_map = {}
for subdir in os.listdir(root_dir):
    subdir_path = os.path.join(root_dir, subdir)
    # Skip stray files (e.g. .DS_Store) and the "*_normal" folders that this
    # step itself writes into the same directory, so a re-run does not treat
    # them as lesion crops.
    if not os.path.isdir(subdir_path) or subdir.endswith('_normal'):
        continue
    # "<ID>_<n>" -> "<ID>": everything before the last underscore.
    full_image_key = subdir.rpartition('_')[0]
    full_image_to_crops_map.setdefault(full_image_key, []).append(subdir_path)

# Process all mapped cropped folders for each full image
for full_image_key, folders in full_image_to_crops_map.items():
    full_image_path = find_full_image_path(base_image_dir, full_image_key)
    if full_image_path:
        process_all_crops_for_folder(full_image_path, folders, output_dir)
    else:
        print(f"Full image not found for {full_image_key}")
