In [7]:
import os
import shutil
import numpy as np
import random
import cv2

def preprocess(img):
    """Crop away every row and column of ``img`` that contains only zeros.

    Args:
        img: 2-D array (the caller passes a grayscale image).

    Returns:
        A new array holding only the rows and columns of ``img`` that have
        at least one non-zero entry; the dtype is unchanged.
    """
    # One boolean mask of the non-zero pixels serves both axis reductions.
    nonzero_mask = img != 0
    keep_rows = np.any(nonzero_mask, axis=1)
    keep_cols = np.any(nonzero_mask, axis=0)

    # Filter rows first, then columns of the row-filtered result.
    row_cropped = img[keep_rows]
    return row_cropped[:, keep_cols]

# Samples left out of the split; compared against the space-stripped file stem.
_EXCLUDED_SAMPLES = frozenset({
    '2018_BC005421_CC_R',
    '2018_BC005421_MLO_R',
    '2018_BC0022482_CC_R',
    '2018_BC0022482_MLO_R',
})


def _export_sample(source_folder, file_path, split_folder, category):
    """Crop one image and write it, together with its label, into ``split_folder``.

    The sample gets its own sub-folder named after the space-stripped file
    stem, containing ``img.jpg`` and ``info_dict.npy``.

    Args:
        source_folder: Root folder that ``file_path`` is relative to.
        file_path: Image path relative to ``source_folder``.
        split_folder: Destination folder of the split (train or test).
        category: Category folder name; its last character is stored as label.

    Returns:
        True when the sample was written, False when it was skipped.
    """
    file_name = os.path.splitext(os.path.basename(file_path))[0].replace(' ', '')
    if file_name in _EXCLUDED_SAMPLES:
        return False

    source_path = os.path.join(source_folder, file_path)
    image = cv2.imread(source_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports an unreadable file by returning None, not raising.
        print(source_path + ' could not be read, skipped')
        return False

    processed_image = preprocess(image)
    if processed_image.size == 0:
        # A fully black image has nothing left after cropping and cannot be
        # encoded by cv2.imwrite.
        print(source_path + ' is empty after cropping, skipped')
        return False

    # Create the sample folder only once there is something to store in it.
    target_folder_path = os.path.join(split_folder, file_name)
    os.makedirs(target_folder_path, exist_ok=True)

    target_img_path = os.path.join(target_folder_path, 'img.jpg')
    if not cv2.imwrite(target_img_path, processed_image):
        print(target_img_path + ' could not be written, skipped')
        return False

    # Store the label. np.save pickles the dict, so readers need
    # np.load(..., allow_pickle=True).item() to get it back.
    info_dict = {'Birad': str(category[-1]).replace(' ', '')}
    np.save(os.path.join(target_folder_path, 'info_dict.npy'), info_dict)
    print(target_folder_path + ' has been saved')
    return True


def split_data(source_folder, target_folder, train_ratio=0.8, seed=None):
    """Split the images of each category folder into train and test sets.

    For every category, all ``.jpg`` files below
    ``source_folder/<category>`` are collected (names starting with ``._``
    are ignored), shuffled, and divided so that the first
    ``int(count * train_ratio)`` files go to ``target_folder/Train`` and the
    rest to ``target_folder/Test``. Each image is cropped with
    ``preprocess`` and written to its own sub-folder as ``img.jpg`` next to
    an ``info_dict.npy`` holding the label.

    Args:
        source_folder: Root folder containing the category folders.
        target_folder: Folder in which ``Train`` and ``Test`` are created.
        train_ratio: Fraction of each category assigned to the training set.
        seed: Optional seed for a reproducible split. When None, the global
            ``random`` module state is used, as before.

    Returns:
        None.
    """
    # Category folders to process.
    categories = ['Birad1', 'Birad3', 'Birad4', 'Birad5']

    # Create the destination folders.
    train_folder = os.path.join(target_folder, 'Train')
    test_folder = os.path.join(target_folder, 'Test')
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    # A dedicated generator keeps a seeded split independent of global state.
    rng = random if seed is None else random.Random(seed)

    for category in categories:
        category_folder = os.path.join(source_folder, category)
        image_files = []

        # Collect every jpg file below the category folder.
        for root, _dirs, files in os.walk(category_folder):
            for file in files:
                if file.endswith('.jpg') and not file.startswith('._'):
                    relative_path = os.path.relpath(os.path.join(root, file), source_folder)
                    image_files.append(relative_path)

        # os.walk order depends on the filesystem; sort first so the shuffle
        # result is determined by the random state alone.
        image_files.sort()

        # Shuffle and divide into train and test portions.
        print(f'{category} contains {len(image_files)} images.')
        rng.shuffle(image_files)
        train_count = int(len(image_files) * train_ratio)

        for file_path in image_files[:train_count]:
            _export_sample(source_folder, file_path, train_folder, category)

        for file_path in image_files[train_count:]:
            _export_sample(source_folder, file_path, test_folder, category)

    print("Data splitting complete.")

if __name__ == "__main__":
    # Script entry point: split the archive at the hard-coded location and
    # write the result into its "classification" sub-folder.
    split_data(
        source_folder="/Volumes/图图/KAU-BCMD/archive",
        target_folder="/Volumes/图图/KAU-BCMD/archive/classification",
    )


Birad1 contains 1865 images.
/Volumes/图图/KAU-BCMD/archive/classification/Train/2018_BC0021785_CC_L has been saved
/Volumes/图图/KAU-BCMD/archive/classification/Train/2017_BC019701_MLO_L has been saved
/Volumes/图图/KAU-BCMD/archive/classification/Train/2013_BC002061_MLO_L has been saved
/Volumes/图图/KAU-BCMD/archive/classification/Train/2013_BC010241_CC_L has been saved
/Volumes/图图/KAU-BCMD/archive/classification/Train/2014_BC011962_CC_L has been saved
/Volumes/图图/KAU-BCMD/archive/classification/Train/2017_BC0020721_CC_R has been saved
/Volumes/图图/KAU-BCMD/archive/classification/Train/2018_BC0021421_CC_L has been saved
/Volumes/图图/KAU-BCMD/archive/classification/Train/2017_BC003842_MLO_R has been saved
/Volumes/图图/KAU-BCMD/archive/classification/Train/2013_BC010781_CC_L has been saved
/Volumes/图图/KAU-BCMD/archive/classification/Train/2019_BC015743_CC_R has been saved
/Volumes/图图/KAU-BCMD/archive/classification/Train/2017_BC019863_MLO_L has been saved
/Volumes/图图/KAU-BCMD/archive/classificat