CMMD1

In [None]:
import os
import numpy as np
import pandas as pd
import pydicom as pdcm
import cv2
from sklearn.model_selection import train_test_split

def preprocess(img):
    # 找到原图中所有为0的行和列
    rows_to_keep = np.any(img != 0, axis=1)
    cols_to_keep = np.any(img != 0, axis=0)

    # 删除原图中全部为0的行和列
    img = img[rows_to_keep][:, cols_to_keep]

    return img

# 读取XLSX文件
xlsx_path = '/Volumes/图图/CMMD/CMMD_clinicaldata_revision.xlsx'
df = pd.read_excel(xlsx_path)
# df['subtype'] = df['subtype'].fillna('-')

# 排序并按8:2比例分成训练集和测试集
df_sorted = df.sort_values(by='ID1')
train_df, test_df = train_test_split(df_sorted, test_size=0.2, random_state=42)

# 定义输入和输出路径
DATA_PATH = "/Volumes/图图/CMMD/manifest-1616439774456/CMMD"
OUTPUT_BASE_PATH = "/Volumes/图图/CMMD/manifest-1616439774456/classification1"

def process_and_save(df, set_type):
    for index, row in df.iterrows():
        folder_name = row['ID1']
        left_right = row['LeftRight']
        abnormality = str(row['abnormality']).replace(' ','')
        classification = str(row['classification']).replace(' ','')
        meta_data = {
            'abnormality': abnormality,
            'classification': classification
        }

        folder_path = os.path.join(DATA_PATH, folder_name)
        if os.path.exists(folder_path):
            # 读取文件夹中的DICOM文件
            dcm_files = []
            for root, dirs, files in os.walk(folder_path):
                for file in files:
                    if file.endswith('.dcm') and not file.startswith('._'):
                        dcm_files.append(os.path.join(root, file))
            
            if len(dcm_files) not in [2, 4]:
                print(f"Expected 2 or 4 DICOM files in {folder_path}, but found {len(dcm_files)}")
                continue

            if len(dcm_files) == 2:
                output_subdir = f"{folder_name}-{left_right}"
                output_dir = os.path.join(OUTPUT_BASE_PATH, set_type, output_subdir)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)

                for i, dcm_file in enumerate(dcm_files):
                    dcm = pdcm.dcmread(dcm_file)
                    img = dcm.pixel_array
                    img = preprocess(img)
                    img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

                    jpg_path = os.path.join(output_dir, f'img{i+1}.jpg')
                    cv2.imwrite(jpg_path, img)

                npy_path = os.path.join(output_dir, 'info_dict.npy')
                np.save(npy_path, meta_data)
                print(f"Processed {folder_name} for {set_type} set")

            elif len(dcm_files) == 4:
                if left_right == 'L':
                    relevant_files = ['1-1.dcm', '1-2.dcm']
                elif left_right == 'R':
                    relevant_files = ['1-3.dcm', '1-4.dcm']
                else:
                    print(f"Unexpected value in LeftRight column: {left_right}")
                    continue

                output_subdir = f"{folder_name}-{left_right}"
                output_dir = os.path.join(OUTPUT_BASE_PATH, set_type, output_subdir)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)

                for dcm_file in dcm_files:
                    if os.path.basename(dcm_file) in relevant_files:
                        dcm = pdcm.dcmread(dcm_file)
                        img = dcm.pixel_array
                        img = preprocess(img)
                        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

                        file_index = relevant_files.index(os.path.basename(dcm_file)) + 1
                        jpg_path = os.path.join(output_dir, f'img{file_index}.jpg')
                        cv2.imwrite(jpg_path, img)

                npy_path = os.path.join(output_dir, 'info_dict.npy')
                np.save(npy_path, meta_data)
                print(f"Processed {folder_name} for {set_type} set")

        else:
            print(f"Folder {folder_name} not found.")

# 处理并保存训练集和测试集
process_and_save(train_df, 'Train')
process_and_save(test_df, 'Test')


CMMD2

In [2]:
import os
import numpy as np
import pandas as pd
import pydicom as pdcm
import cv2
from sklearn.model_selection import train_test_split

def preprocess(img):
    # 找到原图中所有为0的行和列
    rows_to_keep = np.any(img != 0, axis=1)
    cols_to_keep = np.any(img != 0, axis=0)

    # 删除原图中全部为0的行和列
    img = img[rows_to_keep][:, cols_to_keep]

    return img

# 读取XLSX文件
xlsx_path = '/Volumes/图图/CMMD/CMMD_clinicaldata_revision.xlsx'
df = pd.read_excel(xlsx_path)
df['subtype'] = df['subtype'].fillna('-')

# 排序并按8:2比例分成训练集和测试集
df_sorted = df.sort_values(by='ID1')
train_df, test_df = train_test_split(df_sorted, test_size=0.2, random_state=42)

# 定义输入和输出路径
DATA_PATH = "/Volumes/图图/CMMD/manifest-1616439774456/CMMD"
OUTPUT_BASE_PATH = "/Volumes/图图/CMMD/manifest-1616439774456/classification2"

def process_and_save(df, set_type):
    for index, row in df.iterrows():
        folder_name = row['ID1']
        left_right = row['LeftRight']
        # abnormality = str(row['abnormality']).replace(' ','')
        # classification = str(row['classification']).replace(' ','')
        subtype = str(row['subtype']).replace(' ','')
        if subtype!='-':
            meta_data = {
                # 'abnormality': abnormality,
                # 'classification': classification
                'subtype': subtype
            }

            folder_path = os.path.join(DATA_PATH, folder_name)
            if os.path.exists(folder_path):
                # 读取文件夹中的DICOM文件
                dcm_files = []
                for root, dirs, files in os.walk(folder_path):
                    for file in files:
                        if file.endswith('.dcm') and not file.startswith('._'):
                            dcm_files.append(os.path.join(root, file))
                
                if len(dcm_files) not in [2, 4]:
                    print(f"Expected 2 or 4 DICOM files in {folder_path}, but found {len(dcm_files)}")
                    continue

                if len(dcm_files) == 2:
                    output_subdir = f"{folder_name}-{left_right}"
                    output_dir = os.path.join(OUTPUT_BASE_PATH, set_type, output_subdir)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)

                    for i, dcm_file in enumerate(dcm_files):
                        dcm = pdcm.dcmread(dcm_file)
                        img = dcm.pixel_array
                        img = preprocess(img)
                        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

                        jpg_path = os.path.join(output_dir, f'img{i+1}.jpg')
                        cv2.imwrite(jpg_path, img)

                    npy_path = os.path.join(output_dir, 'info_dict.npy')
                    np.save(npy_path, meta_data)
                    print(f"Processed {folder_name} for {set_type} set")

                elif len(dcm_files) == 4:
                    if left_right == 'L':
                        relevant_files = ['1-1.dcm', '1-2.dcm']
                    elif left_right == 'R':
                        relevant_files = ['1-3.dcm', '1-4.dcm']
                    else:
                        print(f"Unexpected value in LeftRight column: {left_right}")
                        continue

                    output_subdir = f"{folder_name}-{left_right}"
                    output_dir = os.path.join(OUTPUT_BASE_PATH, set_type, output_subdir)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)

                    for dcm_file in dcm_files:
                        if os.path.basename(dcm_file) in relevant_files:
                            dcm = pdcm.dcmread(dcm_file)
                            img = dcm.pixel_array
                            img = preprocess(img)
                            img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

                            file_index = relevant_files.index(os.path.basename(dcm_file)) + 1
                            jpg_path = os.path.join(output_dir, f'img{file_index}.jpg')
                            cv2.imwrite(jpg_path, img)

                    npy_path = os.path.join(output_dir, 'info_dict.npy')
                    np.save(npy_path, meta_data)
                    print(f"Processed {folder_name} for {set_type} set")

            else:
                print(f"Folder {folder_name} not found.")

# 处理并保存训练集和测试集
process_and_save(train_df, 'Train')
process_and_save(test_df, 'Test')


Processed D2-0305 for Train set
Processed D2-0471 for Train set
Processed D2-0229 for Train set
Processed D2-0062 for Train set
Processed D2-0276 for Train set
Processed D2-0248 for Train set
Processed D2-0729 for Train set
Processed D2-0111 for Train set
Processed D2-0476 for Train set
Processed D2-0165 for Train set
Processed D2-0156 for Train set
Processed D2-0227 for Train set
Processed D2-0636 for Train set
Processed D2-0434 for Train set
Processed D2-0685 for Train set
Processed D2-0236 for Train set
Processed D2-0556 for Train set
Processed D2-0114 for Train set
Processed D2-0701 for Train set
Processed D2-0597 for Train set
Processed D2-0423 for Train set
Processed D2-0676 for Train set
Processed D2-0359 for Train set
Processed D2-0172 for Train set
Processed D2-0302 for Train set
Processed D2-0681 for Train set
Processed D2-0026 for Train set
Processed D2-0104 for Train set
Processed D2-0070 for Train set
Processed D2-0594 for Train set
Processed D2-0400 for Train set
Processe