image-classification

In [2]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split

def preprocess(img):
    # 找到原图中所有为0的行和列
    rows_to_keep = np.any(img != 0, axis=1)
    cols_to_keep = np.any(img != 0, axis=0)

    # 删除原图中全部为0的行和列
    img = img[rows_to_keep][:, cols_to_keep]

    return img

# 定义路径
TXT_PATH = '/Volumes/图图/MIAS/archive/Info.txt'
PGM_PATH = '/Volumes/图图/MIAS/archive/all-mias'
OUTPUT_BASE_PATH = '/Volumes/图图/MIAS/archive/image-classification'

# 读取TXT文件
with open(TXT_PATH, 'r') as file:
    lines = file.readlines()

# 跳过标题行
lines = lines[1:]

refnums = [line.split()[0] for line in lines]
unique_refnums = list(set(refnums))

# 将去重后的 refnum 按8:2分为训练集和测试集
train_refnums, test_refnums = train_test_split(unique_refnums, test_size=0.2, random_state=42)

# 根据去重后的 refnum 划分原始数据集
train_lines = [line for line in lines if line.split()[0] in train_refnums]
test_lines = [line for line in lines if line.split()[0] in test_refnums]

def process_and_save(lines, set_type):
    for line in lines:
        parts = line.split()
        refnum = parts[0]
        bg = str(parts[1]).replace(' ','')
        # cls = parts[2]
        # severity = parts[3] if len(parts) > 3 else 'N'

        # 读取PGM文件
        pgm_file = os.path.join(PGM_PATH, refnum + '.pgm')
        if os.path.exists(pgm_file):
            img = cv2.imread(pgm_file, cv2.IMREAD_GRAYSCALE)
            img = preprocess(img)
            img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

            # 创建输出目录
            output_dir = os.path.join(OUTPUT_BASE_PATH, set_type, refnum)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            # 保存预处理后的图像为JPEG格式
            jpg_output_path = os.path.join(output_dir, 'img.jpg')
            cv2.imwrite(jpg_output_path, img)

            # 保存标签为NumPy文件字典
            info = {
                'background': bg
                # 'class': cls,
                # 'severity': severity
            }
            npy_path = os.path.join(output_dir, 'info_dict.npy')
            np.save(npy_path, info)

            print(f"Processed {refnum} for {set_type} set")
        else:
            print(f"PGM file for {refnum} not found.")

# 处理并保存训练集和测试集
process_and_save(train_lines, 'Train')
process_and_save(test_lines, 'Test')


Processed mdb001 for Train set
Processed mdb002 for Train set
Processed mdb003 for Train set
Processed mdb004 for Train set
Processed mdb005 for Train set
Processed mdb005 for Train set
Processed mdb006 for Train set
Processed mdb007 for Train set
Processed mdb008 for Train set
Processed mdb009 for Train set
Processed mdb010 for Train set
Processed mdb011 for Train set
Processed mdb012 for Train set
Processed mdb013 for Train set
Processed mdb014 for Train set
Processed mdb016 for Train set
Processed mdb018 for Train set
Processed mdb019 for Train set
Processed mdb020 for Train set
Processed mdb022 for Train set
Processed mdb023 for Train set
Processed mdb024 for Train set
Processed mdb025 for Train set
Processed mdb026 for Train set
Processed mdb027 for Train set
Processed mdb028 for Train set
Processed mdb029 for Train set
Processed mdb031 for Train set
Processed mdb033 for Train set
Processed mdb035 for Train set
Processed mdb037 for Train set
Processed mdb039 for Train set
Processe

cropped-classification

In [5]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split

def crop_and_save(img, x, y, radius, output_path):
    # 确保裁剪区域不超出图像边界
    x = int(x)
    y = int(y)
    radius = int(radius)
    x1 = max(x - radius, 0)
    y1 = max(y - radius, 0)
    x2 = min(x + radius, img.shape[1])
    y2 = min(y + radius, img.shape[0])

    # 裁剪图像
    cropped_img = img[y1:y2, x1:x2]
    cropped_img = cv2.normalize(cropped_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    
    # 保存裁剪后的图像
    cv2.imwrite(output_path, cropped_img)


# 定义路径
TXT_PATH = '/Volumes/图图/MIAS/archive/Info.txt'
PGM_PATH = '/Volumes/图图/MIAS/archive/all-mias'
OUTPUT_BASE_PATH = '/Volumes/图图/MIAS/archive/cropped-classification'

# 读取TXT文件
with open(TXT_PATH, 'r') as file:
    lines = file.readlines()

# 跳过标题行
lines = lines[1:]
lines = [line for line in lines if len(line.split()) > 4]

train_lines, test_lines = train_test_split(lines, test_size=0.2, random_state=42)

def process_and_save(lines, set_type):
    refnum_counter = {}
    for line in lines:
        # print(line)
        parts = line.split()
        refnum = parts[0]
        # bg = parts[1]
        cls = str(parts[2]).replace(' ','')
        severity = str(parts[3]).replace(' ','')
        x = parts[4]
        y = parts[5]
        radius = parts[6]
        
        if refnum in refnum_counter:
            refnum_counter[refnum] += 1
        else:
            refnum_counter[refnum] = 1
        refnum_with_suffix = f"{refnum}_{refnum_counter[refnum]}"

        # 读取PGM文件
        pgm_file = os.path.join(PGM_PATH, refnum + '.pgm')
        if os.path.exists(pgm_file):
            img = cv2.imread(pgm_file, cv2.IMREAD_GRAYSCALE)
            # img = preprocess(img)
            # img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

            # 创建输出目录
            output_dir = os.path.join(OUTPUT_BASE_PATH, set_type, refnum_with_suffix)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            jpg_output_path = os.path.join(output_dir, 'img.jpg')
            crop_and_save(img, x, y, radius, jpg_output_path)

            # 保存标签为NumPy文件字典
            info = {
                # 'background': bg
                'class': cls,
                'severity': severity
            }
            npy_path = os.path.join(output_dir, 'info_dict.npy')
            np.save(npy_path, info)

            print(f"Processed {refnum} for {set_type} set")
        else:
            print(f"PGM file for {refnum} not found.")

# 处理并保存训练集和测试集
process_and_save(train_lines, 'Train')
process_and_save(test_lines, 'Test')


Processed mdb130 for Train set
Processed mdb025 for Train set
Processed mdb032 for Train set
Processed mdb188 for Train set
Processed mdb256 for Train set
Processed mdb090 for Train set
Processed mdb179 for Train set
Processed mdb081 for Train set
Processed mdb115 for Train set
Processed mdb152 for Train set
Processed mdb209 for Train set
Processed mdb102 for Train set
Processed mdb170 for Train set
Processed mdb191 for Train set
Processed mdb019 for Train set
Processed mdb107 for Train set
Processed mdb274 for Train set
Processed mdb184 for Train set
Processed mdb097 for Train set
Processed mdb134 for Train set
Processed mdb213 for Train set
Processed mdb012 for Train set
Processed mdb202 for Train set
Processed mdb181 for Train set
Processed mdb121 for Train set
Processed mdb142 for Train set
Processed mdb111 for Train set
Processed mdb058 for Train set
Processed mdb178 for Train set
Processed mdb110 for Train set
Processed mdb214 for Train set
Processed mdb015 for Train set
Processe

In [7]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split

def crop_and_save(img, x, y, radius, output_path):
    h,w=img.shape
    # 确保裁剪区域不超出图像边界
    x = int(x)
    y = w-int(y)
    radius = int(radius)
    x1 = max(x - radius, 0)
    y1 = max(y - radius, 0)
    x2 = min(x + radius, img.shape[1])
    y2 = min(y + radius, img.shape[0])

    # 裁剪图像
    cropped_img = img[y1:y2, x1:x2]
    cropped_img = cv2.normalize(cropped_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    
    # 保存裁剪后的图像
    cv2.imwrite(output_path, cropped_img)


# 定义路径
TXT_PATH = '/Volumes/图图/MIAS/archive/Info.txt'
PGM_PATH = '/Volumes/图图/MIAS/archive/all-mias'
OUTPUT_BASE_PATH = '/Volumes/图图/MIAS/archive/cropped-classification'

# 读取TXT文件
with open(TXT_PATH, 'r') as file:
    lines = file.readlines()

# 跳过标题行
lines = lines[1:]
lines = [line for line in lines if len(line.split()) > 4]

train_lines, test_lines = train_test_split(lines, test_size=0.2, random_state=42)

def process_and_save(lines, set_type):
    refnum_counter = {}
    for line in lines:
        # print(line)
        parts = line.split()
        refnum = parts[0]
        # bg = parts[1]
        cls = str(parts[2]).replace(' ','')
        severity = str(parts[3]).replace(' ','')
        x = parts[4]
        y = parts[5]
        radius = parts[6]
        
        if refnum in refnum_counter:
            refnum_counter[refnum] += 1
        else:
            refnum_counter[refnum] = 1
        refnum_with_suffix = f"{refnum}_{refnum_counter[refnum]}"

        # 读取PGM文件
        pgm_file = os.path.join(PGM_PATH, refnum + '.pgm')
        if os.path.exists(pgm_file):
            img = cv2.imread(pgm_file, cv2.IMREAD_GRAYSCALE)
            # img = preprocess(img)
            # img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

            # 创建输出目录
            output_dir = os.path.join(OUTPUT_BASE_PATH, set_type, refnum_with_suffix)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            jpg_output_path = os.path.join(output_dir, 'img.jpg')
            crop_and_save(img, x, y, radius, jpg_output_path)

            # 保存标签为NumPy文件字典
            info = {
                # 'background': bg
                'class': cls,
                'severity': severity
            }
            npy_path = os.path.join(output_dir, 'info_dict.npy')
            np.save(npy_path, info)

            print(f"Processed {refnum} for {set_type} set")
        else:
            print(f"PGM file for {refnum} not found.")

# 处理并保存训练集和测试集
process_and_save(train_lines, 'Train')
process_and_save(test_lines, 'Test')


Processed mdb130 for Train set
Processed mdb025 for Train set
Processed mdb032 for Train set
Processed mdb188 for Train set
Processed mdb256 for Train set
Processed mdb090 for Train set
Processed mdb179 for Train set
Processed mdb081 for Train set
Processed mdb115 for Train set
Processed mdb152 for Train set
Processed mdb209 for Train set
Processed mdb102 for Train set
Processed mdb170 for Train set
Processed mdb191 for Train set
Processed mdb019 for Train set
Processed mdb107 for Train set
Processed mdb274 for Train set
Processed mdb184 for Train set
Processed mdb097 for Train set
Processed mdb134 for Train set
Processed mdb213 for Train set
Processed mdb012 for Train set
Processed mdb202 for Train set
Processed mdb181 for Train set
Processed mdb121 for Train set
Processed mdb142 for Train set
Processed mdb111 for Train set
Processed mdb058 for Train set
Processed mdb178 for Train set
Processed mdb110 for Train set
Processed mdb214 for Train set
Processed mdb015 for Train set
Processe