VinDr-Mammo-breast

In [None]:
import pandas as pd
import numpy as np
import os
import pydicom
import cv2
from tqdm import tqdm
import ast
from pydicom.pixel_data_handlers.util import apply_voi_lut

def fit_image(fname):
    """Load a DICOM mammogram and crop it to its largest bright region.

    The pixel data is passed through ``apply_voi_lut``, min-max scaled to
    the range 0..255 (inverted first for ``MONOCHROME1`` images), trimmed by
    10 pixels on every side, thresholded at 20, and cropped to the bounding
    box of the largest 8-connected foreground component.

    Args:
        fname: Path of the DICOM file to read.

    Returns:
        A 2-D float array with values in 0..255. If thresholding finds no
        foreground component, the frame-trimmed image is returned uncropped.
    """
    dicom = pydicom.dcmread(fname)
    X = apply_voi_lut(dicom.pixel_array, dicom, prefer_lut=False)
    X = X.astype(np.float64)

    # Min-max normalise; a constant image has zero range, so map it to 0
    # rather than dividing by zero and filling the array with NaN.
    lowest, highest = X.min(), X.max()
    if highest > lowest:
        X = (X - lowest) / (highest - lowest)
    else:
        X = np.zeros_like(X)

    # MONOCHROME1 stores bright tissue as low values; flip to the usual sense.
    if dicom.PhotometricInterpretation == "MONOCHROME1":
        X = 1 - X

    X = X * 255

    # Remove narrow exterior frames
    X = X[10:-10, 10:-10]

    # Find breast region using connected components
    mask = (X > 20).astype(np.uint8)
    stats = cv2.connectedComponentsWithStats(mask, 8, cv2.CV_32S)[2]

    # Row 0 describes the background; with no other row there is nothing to
    # crop to, so hand back the trimmed image instead of raising on argmax.
    if stats.shape[0] < 2:
        return X

    # Find largest region (breast)
    idx = stats[1:, cv2.CC_STAT_AREA].argmax() + 1
    x1, y1, w, h = stats[idx][:4]
    x2 = x1 + w
    y2 = y1 + h

    return X[y1:y2, x1:x2]

# Read the CSV file
csv_path = '/Volumes/图图/VinDr/VinDr-Mammo/finding_annotations.csv'
df = pd.read_csv(csv_path)

# Convert 'finding_categories' from string to list
df['finding_categories'] = df['finding_categories'].apply(ast.literal_eval)

# Group by 'image_id' and merge 'finding_categories' lists
df_grouped = df.groupby('image_id').agg({
    'study_id': 'first',
    'laterality': 'first',
    'view_position': 'first',
    'breast_birads': 'first',
    'breast_density': 'first',
    'finding_categories': lambda x: list({item for sublist in x for item in sublist})
}).reset_index()

# Folder paths
base_path = '/Volumes/图图/VinDr/VinDr-Mammo/images'
target_path = 'Benchmark/VinDr-Mammo-breast'

# Create target directory if it doesn't exist
os.makedirs(target_path, exist_ok=True)

# Process each image
for _, row in tqdm(df_grouped.iterrows(), total=len(df_grouped)):
    study_id = row['study_id']
    image_id = row['image_id']

    # Source images are laid out as <base_path>/<study_id>/<image_id>.dicom
    img_path = os.path.join(base_path, study_id, image_id + '.dicom')

    if not os.path.exists(img_path):
        print(f"Image not found: {img_path}")
        continue

    # Process image; one unreadable file must not abort the whole batch,
    # so report it and move on to the next image.
    try:
        img = fit_image(img_path)
    except Exception as exc:
        print(f"Failed to process {img_path}: {exc}")
        continue

    # An empty array cannot be encoded as a JPEG
    if img.size == 0:
        print(f"Empty image after cropping: {img_path}")
        continue

    # Create output folder for this image
    img_folder = os.path.join(target_path, image_id)
    os.makedirs(img_folder, exist_ok=True)

    # Save processed image; cv2.imwrite reports failure through its return
    # value rather than by raising.
    jpg_path = os.path.join(img_folder, 'img.jpg')
    if not cv2.imwrite(jpg_path, img):
        print(f"Failed to write {jpg_path}")
        continue

    print(f'Processed {image_id} to {img_folder}')

print("Processing complete.")

VinDr-Mammo-finding

In [None]:
import pandas as pd
import numpy as np
import os
import pydicom
import cv2
from tqdm import tqdm
import ast
from pydicom.pixel_data_handlers.util import apply_voi_lut

def fit_image(fname):
    """Load a DICOM image and return its pixel data scaled to 0..255.

    The pixel data is passed through ``apply_voi_lut`` and min-max scaled;
    ``MONOCHROME1`` images are inverted before the final scaling.

    Args:
        fname: Path of the DICOM file to read.

    Returns:
        A float array with values in 0..255, uncropped.
    """
    dicom = pydicom.dcmread(fname)
    X = apply_voi_lut(dicom.pixel_array, dicom, prefer_lut=False)
    X = X.astype(np.float64)

    # A constant image has zero range; map it to 0 instead of dividing by
    # zero and filling the array with NaN.
    lowest, highest = X.min(), X.max()
    if highest > lowest:
        X = (X - lowest) / (highest - lowest)
    else:
        X = np.zeros_like(X)

    # MONOCHROME1 stores bright tissue as low values; flip to the usual sense.
    if dicom.PhotometricInterpretation == "MONOCHROME1":
        X = 1 - X
    X = X * 255
    return X

def crop_image(X, xmin, ymin, xmax, ymax):
    """Crop image to specified bounding box coordinates.

    All four coordinates are clamped to the image extent, so a box that
    sticks out of the image is cut to the visible part and a box that lies
    entirely outside yields an empty array.

    Args:
        X: Image array indexed as ``X[row, column]``.
        xmin: Left column of the box (inclusive).
        ymin: Top row of the box (inclusive).
        xmax: Right column of the box (exclusive).
        ymax: Bottom row of the box (exclusive).

    Returns:
        The slice ``X[ymin:ymax, xmin:xmax]`` after clamping.
    """
    height, width = X.shape[:2]
    # Clamp every bound at both ends: a negative upper bound would otherwise
    # be read by slicing as an offset from the end of the axis.
    xmin = min(max(0, xmin), width)
    ymin = min(max(0, ymin), height)
    xmax = min(max(0, xmax), width)
    ymax = min(max(0, ymax), height)
    return X[ymin:ymax, xmin:xmax]

# Read CSV file
csv_path = '/Volumes/图图/VinDr/VinDr-Mammo/finding_annotations.csv'
df = pd.read_csv(csv_path)

# Convert finding_categories from string to list and filter out 'No Finding' cases
df['finding_categories'] = df['finding_categories'].apply(ast.literal_eval)
df = df[df['finding_categories'].apply(lambda x: x != ['No Finding'])]

# Group by image_id
grouped_df = df.groupby('image_id')

# Define paths
base_path = '/Volumes/图图/VinDr/VinDr-Mammo/images'
target_path = 'Benchmark/VinDr-Mammo-finding'

# Create target folder
os.makedirs(target_path, exist_ok=True)

# Process each image and its bounding boxes
for image_id, group in tqdm(grouped_df):
    study_id = group['study_id'].iloc[0]
    # Source images are laid out as <base_path>/<study_id>/<image_id>.dicom
    img_path = os.path.join(base_path, study_id, image_id + '.dicom')

    if not os.path.exists(img_path):
        print(f"Image not found: {img_path}")
        continue

    # Process the full DICOM image once per image; one unreadable file must
    # not abort the whole batch, so report it and move on.
    try:
        img = fit_image(img_path)
    except Exception as exc:
        print(f"Failed to process {img_path}: {exc}")
        continue

    # Process each bounding box in this image
    for i, row in enumerate(group.itertuples()):
        # Crop to the bounding box
        xmin, ymin, xmax, ymax = int(row.xmin), int(row.ymin), int(row.xmax), int(row.ymax)
        cropped_img = crop_image(img, xmin, ymin, xmax, ymax)

        # A box outside the image gives an empty crop, which cannot be
        # encoded as a JPEG. The index i is kept so later names do not shift.
        if cropped_img.size == 0:
            print(f'Skipping {image_id}_{i}: bounding box is empty or outside the image')
            continue

        # Create output folder for this finding
        img_folder = os.path.join(target_path, f"{image_id}_{i}")
        os.makedirs(img_folder, exist_ok=True)

        # Save cropped image; cv2.imwrite reports failure through its return
        # value rather than by raising.
        jpg_path = os.path.join(img_folder, 'img.jpg')
        if not cv2.imwrite(jpg_path, cropped_img):
            print(f'Failed to write {jpg_path}')
            continue

        print(f'Processed {image_id}_{i} to {img_folder}')

print("Processing complete.")

Generate normal cases

In [None]:
import os
import cv2
import numpy as np
import random

def find_cropped_image_position(full_image, cropped_image):
    """Locate ``cropped_image`` inside ``full_image`` by template matching.

    Uses normalised cross-correlation (``cv2.TM_CCOEFF_NORMED``) and takes
    the location of the highest score as the top-left corner of the match.

    Returns:
        A tuple ``(x1, y1, x2, y2)``: the best-match top-left corner and that
        corner offset by the width and height of ``cropped_image``.
    """
    scores = cv2.matchTemplate(full_image, cropped_image, cv2.TM_CCOEFF_NORMED)
    # minMaxLoc returns (min_val, max_val, min_loc, max_loc); only the
    # position of the maximum is needed.
    best_x, best_y = cv2.minMaxLoc(scores)[3]
    patch_h, patch_w = cropped_image.shape[:2]
    return (best_x, best_y, best_x + patch_w, best_y + patch_h)

def random_crop_outside_bboxes(full_image, bboxes, crop_size, num_crops=1, max_attempts=1000):
    """Sample random crops of ``full_image`` that do not overlap any bbox.

    Args:
        full_image: Image array indexed as ``full_image[row, column]``.
        bboxes: Iterable of ``(x1, y1, x2, y2)`` boxes to avoid; ``x2`` and
            ``y2`` are exclusive.
        crop_size: ``(height, width)`` of each crop.
        num_crops: Number of crops requested.
        max_attempts: Maximum random positions tried per crop before giving
            up on that crop.

    Returns:
        A list of slices of ``full_image``, each of shape ``crop_size``. The
        list is shorter than ``num_crops`` (possibly empty) when the crop
        does not fit in the image or no non-overlapping position was found
        within ``max_attempts`` tries.
    """
    height, width = full_image.shape[:2]
    crop_h, crop_w = crop_size
    cropped_images = []

    # A degenerate or oversized crop has no valid position at all.
    if crop_h <= 0 or crop_w <= 0 or crop_h > height or crop_w > width:
        return cropped_images

    def overlaps_any_bbox(x, y):
        # Test the whole crop rectangle, not just its top-left corner, so a
        # crop that starts outside a box cannot still extend into it.
        return any(
            x < x2 and x + crop_w > x1 and y < y2 and y + crop_h > y1
            for (x1, y1, x2, y2) in bboxes
        )

    for _ in range(num_crops):
        # Bounded rejection sampling: stop instead of spinning forever when
        # the boxes leave no free position.
        for _attempt in range(max_attempts):
            x = random.randint(0, width - crop_w)
            y = random.randint(0, height - crop_h)
            if not overlaps_any_bbox(x, y):
                cropped_images.append(full_image[y:y + crop_h, x:x + crop_w])
                break
    return cropped_images

def process_all_crops_for_folder(full_image_path, cropped_folders, output_dir):
    """Write one random "normal" crop for every finding crop of an image.

    Each finding crop is located in the full image by template matching.
    For every crop that was located, a random crop of the same size is then
    sampled with ``random_crop_outside_bboxes`` (given all located boxes)
    and saved as ``<output_dir>/<finding folder name>_normal/img.jpg``.

    Args:
        full_image_path: Folder containing the full image as ``img.jpg``.
        cropped_folders: Folders that each contain one finding crop as
            ``img.jpg``.
        output_dir: Folder in which the ``*_normal`` folders are created.

    Returns:
        None. Failures are printed and the affected folder is skipped.
    """
    print(full_image_path)
    full_image = cv2.imread(os.path.join(full_image_path, 'img.jpg'))
    if full_image is None:
        print(f"Failed to load full image from {full_image_path}")
        return

    # Keep each bbox paired with its folder so a crop that fails to load
    # cannot shift the bboxes out of step with the folders.
    located = []
    for folder in cropped_folders:
        # Join the folder exactly once; joining it twice produces
        # "<folder>/<folder>/img.jpg" whenever the folder path is relative.
        cropped_image_path = os.path.join(folder, 'img.jpg')
        cropped_image = cv2.imread(cropped_image_path)
        if cropped_image is None:
            print(f"Failed to load cropped image from {cropped_image_path}")
            continue
        try:
            bbox = find_cropped_image_position(full_image, cropped_image)
        except cv2.error as e:
            # Raised e.g. when the crop is larger than the full image.
            print(f"An error occurred processing {folder}: {e}")
            continue
        located.append((folder, bbox))

    bboxes = [bbox for _, bbox in located]

    for folder, (x1, y1, x2, y2) in located:
        crop_size = (y2 - y1, x2 - x1)  # (height, width)
        try:
            random_crops = random_crop_outside_bboxes(full_image, bboxes, crop_size)
        except ValueError as e:
            print(f"An error occurred processing {folder}: {e}")
            continue

        normal_folder = os.path.join(output_dir, f"{os.path.basename(folder)}_normal")
        normal_image_path = os.path.join(normal_folder, 'img.jpg')
        for crop in random_crops:
            os.makedirs(normal_folder, exist_ok=True)
            # cv2.imwrite reports failure through its return value.
            if not cv2.imwrite(normal_image_path, crop):
                print(f"Failed to write {normal_image_path}")
# Folder holding the per-finding crop folders; scanned with os.listdir to
# discover which crops exist.
root_dir = 'Benchmark/VinDr-Mammo-finding'
# Folder holding one subfolder per full image, looked up by image key.
base_image_dir = 'Benchmark/VinDr-Mammo-breast'
# Destination for the generated "<finding folder>_normal" folders.
# NOTE: this is the same directory as root_dir, so the generated folders end
# up next to the finding crops they were derived from.
output_dir = 'Benchmark/VinDr-Mammo-finding'

def find_full_image_path(base_dir, full_image_key):
    """Return ``base_dir/full_image_key`` if that path exists, else ``None``."""
    candidate = os.path.join(base_dir, full_image_key)
    return candidate if os.path.exists(candidate) else None

# Map cropped images to their respective full images.
# Finding folders are named "<full image key>_<index>".
full_image_to_crops_map = {}
for subdir in os.listdir(root_dir):
    subdir_path = os.path.join(root_dir, subdir)
    # Skip stray files (e.g. .DS_Store) and the "*_normal" folders written by
    # a previous run: output_dir is the same directory as root_dir, so they
    # would otherwise be mistaken for finding crops.
    if not os.path.isdir(subdir_path) or subdir.endswith('_normal'):
        continue
    full_image_key, separator, _ = subdir.rpartition('_')
    # A name without "_<index>" cannot be traced back to a full image.
    if not separator or not full_image_key:
        continue
    full_image_to_crops_map.setdefault(full_image_key, []).append(subdir_path)

# Process all mapped cropped folders for each full image
for full_image_key, folders in full_image_to_crops_map.items():
    print(f"Processing: {full_image_key}")
    full_image_path = find_full_image_path(base_image_dir, full_image_key)
    if full_image_path:
        process_all_crops_for_folder(full_image_path, folders, output_dir)
    else:
        print(f"Full image not found for {full_image_key}")
