image classification

In [2]:
import os
import numpy as np
import pandas as pd
import pydicom as pdcm
import cv2
from sklearn.model_selection import train_test_split

def preprocess(img):
    # 找到原图中所有为0的行和列
    rows_to_keep = np.any(img != 0, axis=1)
    cols_to_keep = np.any(img != 0, axis=0)
    # print(cols_to_keep)

    # 删除原图中全部为0的行和列
    img = img[rows_to_keep][:, cols_to_keep]
    return img

# 读取XLSX文件
xlsx_path = '/Volumes/图图/DMID-kaggle/archive/Metadata.xlsx'
df = pd.read_excel(xlsx_path)
# df['pathology'] = df['pathology'].fillna('N')

# 排序并按8:2比例分成训练集和测试集
unique_ids = df['ID'].drop_duplicates()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)

# 根据去重后的ID划分数据集
train_df = df[df['ID'].isin(train_ids)]
test_df = df[df['ID'].isin(test_ids)]

# 定义输入和输出路径
DCM_PATH = "/Volumes/图图/DMID-kaggle/archive/DICOM Images/DICOM Images"
OUTPUT_BASE_PATH = "/Volumes/图图/DMID-kaggle/archive/image-classification"

def process_and_save(df, set_type):
    for index, row in df.iterrows():
        file_id = row['ID'].strip()
        background_tissue = str(row['background tissue']).replace(' ','')
        # abnormality = row['abnormality']
        # pathology = row['pathology']
        meta_data = {
            'background_tissue': background_tissue
            # 'abnormality': abnormality,
            # 'pathology': pathology
        }
        
        dcm_file = os.path.join(DCM_PATH, f"{file_id}.dcm")
        if os.path.exists(dcm_file):
            dcm = pdcm.dcmread(dcm_file)
            img = dcm.pixel_array
            if len(img.shape) == 3 and img.shape[2] == 3:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img = preprocess(img)
            img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
            # 创建输出目录
            output_dir = os.path.join(OUTPUT_BASE_PATH, set_type, file_id)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            
            # 保存预处理后的图像为JPG文件
            jpg_path = os.path.join(output_dir, 'img.jpg')
            cv2.imwrite(jpg_path, img)
            
            # 保存元数据为NPY文件
            npy_path = os.path.join(output_dir, 'info_dict.npy')
            np.save(npy_path, meta_data)

            print(f"Processed {file_id} for {set_type} set")
        else:
            print(f"DICOM file for {file_id} not found.")

# 处理并保存训练集和测试集
process_and_save(train_df, 'Train')
process_and_save(test_df, 'Test')


Processed IMG002 for Train set
Processed IMG004 for Train set
Processed IMG005 for Train set
Processed IMG006 for Train set
Processed IMG007 for Train set
Processed IMG008 for Train set
Processed IMG009 for Train set
Processed IMG011 for Train set
Processed IMG011 for Train set
Processed IMG013 for Train set
Processed IMG014 for Train set
Processed IMG015 for Train set
Processed IMG016 for Train set
Processed IMG017 for Train set
Processed IMG018 for Train set
Processed IMG020 for Train set
Processed IMG021 for Train set
Processed IMG022 for Train set
Processed IMG024 for Train set
Processed IMG024 for Train set
Processed IMG025 for Train set
Processed IMG025 for Train set
Processed IMG026 for Train set
Processed IMG027 for Train set
Processed IMG028 for Train set
Processed IMG028 for Train set
Processed IMG029 for Train set
Processed IMG029 for Train set
Processed IMG029 for Train set
Processed IMG030 for Train set
Processed IMG030 for Train set
Processed IMG032 for Train set
Processe

cropped classification

In [5]:
import os
import numpy as np
import pandas as pd
import pydicom as pdcm
import cv2
from sklearn.model_selection import train_test_split

# def preprocess(img):
#     # 找到原图中所有为0的行和列
#     rows_to_keep = np.any(img != 0, axis=1)
#     cols_to_keep = np.any(img != 0, axis=0)
    
#     # 删除原图中全部为0的行和列
#     img = img[rows_to_keep][:, cols_to_keep]
#     return img

def crop_and_save(img, x, y, radius, output_path,file_id):
    flag=True
    # 确保裁剪区域不超出图像边界
    try:
        x = int(x)
        y = int(y)
        radius = int(radius)
        x1 = max(x - radius, 0)
        y1 = max(y - radius, 0)
        x2 = min(x + radius, img.shape[1])
        y2 = min(y + radius, img.shape[0])

        # 裁剪图像
        cropped_img = img[y1:y2, x1:x2]
        cropped_img = cv2.normalize(cropped_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
        cv2.imwrite(output_path, cropped_img)
    except:
        flag=False
    return flag

# 读取XLSX文件
xlsx_path = '/Volumes/图图/DMID-kaggle/archive/Metadata.xlsx'
df = pd.read_excel(xlsx_path)
df = df[df['abnormality'] != 'NORM']

# 填充空的 pathology 列为 'N'
# df['pathology'] = df['pathology'].fillna('N')

# 排序并按8:2比例分成训练集和测试集
df_sorted = df.sort_values(by='ID')
train_df, test_df = train_test_split(df_sorted, test_size=0.2, random_state=42)

# 定义输入和输出路径
DCM_PATH = "/Volumes/图图/DMID-kaggle/archive/DICOM Images/DICOM Images"
OUTPUT_BASE_PATH = "/Volumes/图图/DMID-kaggle/archive/cropped-classification"

def process_and_save(df, set_type):
    id_counter = {}
    for index, row in df.iterrows():
        file_id = row['ID'].strip()
        abnormality = str(row['abnormality']).replace(' ','')
        pathology = str(row['pathology']).replace(' ','')
        x = row['x']
        y = row['y']
        radius = row['radius']
        meta_data = {
            'abnormality': abnormality,
            'pathology': pathology
        }
        
        dcm_file = os.path.join(DCM_PATH, f"{file_id}.dcm")
        if os.path.exists(dcm_file):
            dcm = pdcm.dcmread(dcm_file)
            img = dcm.pixel_array
            if len(img.shape) == 3 and img.shape[2] == 3:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # img = preprocess(img)
            # 更新ID计数器
            if file_id in id_counter:
                id_counter[file_id] += 1
            else:
                id_counter[file_id] = 1
            file_id_with_suffix = f"{file_id}_{id_counter[file_id]}"

            if abnormality != 'NORM':
                # 创建输出目录
                output_dir = os.path.join(OUTPUT_BASE_PATH, set_type, file_id_with_suffix)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)

                # 裁剪并保存预处理后的图像为JPG文件
                jpg_path = os.path.join(output_dir, 'img.jpg')
                
                flag=crop_and_save(img, x, y, radius, jpg_path,file_id)
                

                # 保存元数据为NPY文件
                if flag==True:
                    npy_path = os.path.join(output_dir, 'info_dict.npy')
                    np.save(npy_path, meta_data)

                    print(f"Processed {file_id} for {set_type} set")
            else:
                print(f"Skipped {file_id} as it is 'NORM'")
        else:
            print(f"DICOM file for {file_id} not found.")

# 处理并保存训练集和测试集
process_and_save(train_df, 'Train')
process_and_save(test_df, 'Test')


Processed IMG024 for Train set
Processed IMG067 for Train set
Processed IMG206 for Train set
Processed IMG305 for Train set
Processed IMG033 for Train set
Processed IMG085 for Train set
Processed IMG369 for Train set
Processed IMG421 for Train set
Processed IMG232 for Train set
Processed IMG355 for Train set
Processed IMG101 for Train set
Processed IMG297 for Train set
Processed IMG279 for Train set
Processed IMG451 for Train set
Processed IMG407 for Train set
Processed IMG205 for Train set
Processed IMG114 for Train set
Processed IMG500 for Train set
Processed IMG011 for Train set
Processed IMG053 for Train set
Processed IMG172 for Train set
Processed IMG465 for Train set
Processed IMG446 for Train set
Processed IMG342 for Train set
Processed IMG024 for Train set
Processed IMG482 for Train set
Processed IMG136 for Train set
Processed IMG407 for Train set
Processed IMG010 for Train set
Processed IMG025 for Train set
Processed IMG304 for Train set
Processed IMG063 for Train set
Processe