In [None]:
import os
import pandas as pd
import pydicom
from PIL import Image
import numpy as np
import cv2

# Input tables (metadata and reduced clinical data) and the root folder that
# receives the converted images.
metadata_csv = '/Volumes/Newsmy/EMBED/tables/EMBED_OpenData_metadata.csv'
clinical_csv = '/Volumes/Newsmy/EMBED/tables/EMBED_OpenData_clinical_reduced.csv'
target_folder = 'Benchmark/EMBED'

# Load both tables in the same order as before: metadata first, then clinical.
metadata_df, clinical_df = map(pd.read_csv, (metadata_csv, clinical_csv))
print('read done')

from pydicom.pixel_data_handlers.util import apply_voi_lut

def fit_image(fname):
    """Read a DICOM file and return its image cropped to the largest bright region.

    The pixel data is passed through ``apply_voi_lut``, rescaled to the
    0-255 range, inverted when the photometric interpretation is
    ``MONOCHROME1``, trimmed by 10 pixels on every side, and finally cropped
    to the bounding box of the largest connected component of pixels whose
    value exceeds 20.

    Parameters
    ----------
    fname : path of the DICOM file to read.

    Returns
    -------
    A 2-D float array with values in [0, 255], or ``None`` when the file has
    no ``PixelData``, the image is constant, the image is too small to
    survive the border trim, or no pixel exceeds the foreground threshold.
    """
    dicom = pydicom.dcmread(fname)
    if 'PixelData' not in dicom:
        print(f"No PixelData found in DICOM file: {fname}")
        return None

    X = apply_voi_lut(dicom.pixel_array, dicom, prefer_lut=False)

    # Work in float64 so the subtraction below cannot wrap around for
    # signed or narrow integer pixel types.
    X = np.asarray(X, dtype=np.float64)

    lowest, highest = X.min(), X.max()
    if highest == lowest:
        # A constant image would divide by zero and yield an all-NaN array.
        print(f"Constant-intensity image in DICOM file: {fname}")
        return None
    X = (X - lowest) / (highest - lowest)

    # MONOCHROME1 stores the minimum value as white, so flip it.
    # A missing attribute is treated as "no inversion needed".
    if getattr(dicom, 'PhotometricInterpretation', None) == "MONOCHROME1":
        X = 1 - X

    X = X * 255
    # Drop a 10-pixel frame on every side before looking for the foreground.
    X = X[10:-10, 10:-10]
    if X.size == 0:
        print(f"Image too small to crop in DICOM file: {fname}")
        return None

    output = cv2.connectedComponentsWithStats((X > 20).astype(np.uint8), 8, cv2.CV_32S)
    stats = output[2]
    # Row 0 describes the background; without any further row there is no
    # foreground component to crop to.
    if stats.shape[0] < 2:
        print(f"No foreground region found in DICOM file: {fname}")
        return None

    # Pick the foreground component with the largest area (skip background row 0).
    idx = stats[1:, cv2.CC_STAT_AREA].argmax() + 1
    x1, y1, w, h = stats[idx][:4]
    x2 = x1 + w
    y2 = y1 + h
    X_fit = X[y1: y2, x1: x2]
    return X_fit

def convert_dcm_to_jpg(dicom_path, output_path, log_file):
    """Convert one DICOM file to an image file and record it in a log.

    Parameters
    ----------
    dicom_path : path of the DICOM file to convert.
    output_path : destination path handed to ``cv2.imwrite``.
    log_file : text file to which the DICOM base name is appended after a
        successful write.

    Nothing is written or logged when ``fit_image`` returns ``None`` or when
    ``cv2.imwrite`` reports that the write failed.
    """
    image = fit_image(dicom_path)
    if image is None:
        return

    # cv2.imwrite signals failure through its boolean return value; only
    # report and log the file when the write actually succeeded.
    if not cv2.imwrite(output_path, image):
        print('failed to save ' + output_path)
        return

    print('saved ' + output_path)
    with open(log_file, 'a') as f:
        f.write(os.path.basename(dicom_path) + '\n')

# Text file that receives the base name of every DICOM converted successfully.
log_file = os.path.join(target_folder, 'saved_images.txt')

# Only three metadata columns are needed per image, so walk them in lockstep.
needed_columns = zip(
    metadata_df['acc_anon'],
    metadata_df['png_filename'],
    metadata_df['anon_dicom_path'],
)

for raw_acc, raw_png, raw_dicom in needed_columns:
    # Going through float first turns values such as 123.0 into '123'.
    acc_anon = str(int(float(raw_acc))).strip()
    png_filename = raw_png.strip()
    # Swap the path prefix stored in the table for the local image location.
    dicom_path = raw_dicom.replace('/mnt/NAS2/mammo/anon_dicom', '/Volumes/Newsmy/EMBED/images').strip()

    # Same base name as the PNG listed in the table, but with a .jpg extension.
    stem, _ext = os.path.splitext(png_filename)
    jpg_filename = stem + '.jpg'

    # One output sub-folder per accession number.
    acc_folder = os.path.join(target_folder, acc_anon)
    os.makedirs(acc_folder, exist_ok=True)

    jpg_output_path = os.path.join(acc_folder, jpg_filename)
    convert_dcm_to_jpg(dicom_path, jpg_output_path, log_file)


print("finish")