In [2]:
import pandas as pd
import pydicom
import os

# Load the CSV file and keep only its first 230000 rows.
file_path = '../datafile.csv'  # replace with the path to your own CSV file
df = pd.read_csv(file_path).head(230000)

# Helper that extracts the SOPInstanceUID from a DICOM file
def get_sop_instance_uid(dcm_path):
    """Return the SOPInstanceUID stored in the DICOM file at ``dcm_path``.

    Only the part of the dataset that precedes the pixel data is parsed
    (``stop_before_pixels=True``); the UID is a header element, so the
    image payload never has to be loaded.

    Args:
        dcm_path: Path of the DICOM file, passed straight to
            ``pydicom.dcmread``.

    Returns:
        The ``SOPInstanceUID`` attribute of the dataset, or ``None`` when the
        file cannot be read or the attribute is missing (the error is printed).
    """
    try:
        ds = pydicom.dcmread(dcm_path, stop_before_pixels=True)
        return ds.SOPInstanceUID
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print(f"Error reading {dcm_path}: {e}")
        return None

# Look up the SOPInstanceUID of every DICOM file and derive the PNG path
df['SOP'] = df['dicom'].apply(get_sop_instance_uid)
# Swap only a trailing ".dcm" extension: a plain str.replace would also rewrite
# a ".dcm" found earlier in the path. The vectorised .str accessor additionally
# passes missing values through instead of raising on them.
df['Png'] = df['dicom'].str.replace(r'\.dcm$', '.png', regex=True)

# Save the updated DataFrame to a new CSV file
new_file_path = '../updated_file.csv'  # path of the new CSV file
df.to_csv(new_file_path, index=False)


In [1]:
import os
from tqdm import tqdm

# Location of the original dataset and of its image folder
data_dir = "../yolo"
images_dir = os.path.join(data_dir, "images")

# Names of every directory entry that ends in ".png"
image_files = list(filter(lambda name: name.endswith('.png'), os.listdir(images_dir)))

# Delete the .png files one by one behind a progress bar; a failed deletion is
# reported and the loop carries on with the next file.
progress = tqdm(image_files, desc="Deleting .png files")
for img_file in progress:
    img_path = os.path.join(images_dir, img_file)
    try:
        os.unlink(img_path)  # os.unlink is the same operation as os.remove
    except Exception as e:
        print(f"Error deleting {img_path}: {e}")


Deleting .png files: 100%|██████████| 110322/110322 [00:37<00:00, 2942.90it/s]


In [2]:
import os
from tqdm import tqdm

# Location of the original dataset and of its image / label folders
data_dir = "../yolo"
images_dir = os.path.join(data_dir, "images")
labels_dir = os.path.join(data_dir, "labels")

# File-name suffixes (before the extension) that mark a file for deletion
suffixes = [f"_aug_{i}" for i in range(6)]


def _delete_by_ending(folder, endings, desc):
    """Delete the entries of ``folder`` whose name ends with one of ``endings``.

    The folder is listed first, then each matching entry is removed behind a
    progress bar labelled ``desc``. A failed removal is printed and skipped.
    Returns the list of matched names.
    """
    matched = [name for name in os.listdir(folder) if name.endswith(endings)]
    for name in tqdm(matched, desc=desc):
        target = os.path.join(folder, name)
        try:
            os.remove(target)
        except Exception as e:
            print(f"Error deleting {target}: {e}")
    return matched


# Remove the matching files from the images folder
image_files = _delete_by_ending(
    images_dir, tuple(f"{suffix}.jpg" for suffix in suffixes), "Deleting images"
)

# Remove the matching files from the labels folder
label_files = _delete_by_ending(
    labels_dir, tuple(f"{suffix}.txt" for suffix in suffixes), "Deleting labels"
)


Deleting images: 100%|██████████| 110322/110322 [00:33<00:00, 3304.56it/s]
Deleting labels: 100%|██████████| 110322/110322 [00:32<00:00, 3403.42it/s]


In [2]:
import random
import os
from tqdm import tqdm
import shutil
data_dir = "../yolo"
# Folders holding the temporary augmented dataset
aug_images_dir = os.path.join(data_dir, "augmented_images")
aug_labels_dir = os.path.join(data_dir, "augmented_labels")

# Destination folders of the split dataset
train_dir = "../augmented_yolo/train"
val_dir = "../augmented_yolo/validation"
test_dir = "../augmented_yolo/test"

# shutil.copy does not create missing parent folders, so make sure every
# destination exists before any file is copied.
for split_dir in (train_dir, val_dir, test_dir):
    for sub_dir in ("images", "labels"):
        os.makedirs(os.path.join(split_dir, sub_dir), exist_ok=True)

# augmented_images holds the file names (not full paths) of the .jpg entries in
# aug_images_dir. os.listdir returns them in arbitrary order, so they are
# sorted to give the seeded shuffle below a stable starting point.
augmented_images = sorted(f for f in os.listdir(aug_images_dir) if f.endswith('.jpg'))

# Shuffle with a dedicated, seeded generator: re-running the cell reproduces
# the same split instead of copying a file into a different split on top of
# the previous run, and the global random state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(augmented_images)
train_split = int(0.7 * len(augmented_images))  # first 70% -> train
val_split = int(0.85 * len(augmented_images))   # next 15% -> validation, rest -> test
# NOTE(review): the split is made per file. If several files are augmentations
# of the same source image they can land in different splits; confirm whether
# a group-wise split is needed.

for i, img_file in enumerate(tqdm(augmented_images, desc="Splitting dataset")):
    # Swap only the extension; str.replace would also rewrite a ".jpg" that
    # happens to occur earlier in the file name.
    label_file = os.path.splitext(img_file)[0] + ".txt"
    src_img_path = os.path.join(aug_images_dir, img_file)
    src_txt_path = os.path.join(aug_labels_dir, label_file)

    if i < train_split:
        dst_dir = train_dir
    elif i < val_split:
        dst_dir = val_dir
    else:
        dst_dir = test_dir

    try:
        shutil.copy(src_img_path, os.path.join(dst_dir, "images", img_file))
        shutil.copy(src_txt_path, os.path.join(dst_dir, "labels", label_file))
    except Exception as e:
        # Best effort: report the failing pair and continue with the next one.
        print(f"Error copying {img_file}: {e}")

Splitting dataset: 100%|██████████| 110322/110322 [07:25<00:00, 247.49it/s]


In [1]:
import os 

# Count the entries in the augmented-labels folder and print the total.
label_count = len(os.listdir("../yolo/augmented_labels"))
print(label_count)

110322


In [2]:
import os
import random
import shutil
from tqdm import tqdm

# Location of the original dataset
data_dir = "../yolo"

# Folders holding the temporary augmented dataset
aug_images_dir = os.path.join(data_dir, "augmented_images")
aug_labels_dir = os.path.join(data_dir, "augmented_labels")

# Destination folders of the split dataset
train_dir = "../augmented_yolo_1/train"
val_dir = "../augmented_yolo_1/validation"
test_dir = "../augmented_yolo_1/test"

# Create the source folders and every destination folder if they are missing
os.makedirs(aug_images_dir, exist_ok=True)
os.makedirs(aug_labels_dir, exist_ok=True)
for split_dir in (train_dir, val_dir, test_dir):
    for sub_dir in ("images", "labels"):
        os.makedirs(os.path.join(split_dir, sub_dir), exist_ok=True)

# augmented_images holds the file names (not full paths) of the .jpg entries in
# aug_images_dir. os.listdir returns them in arbitrary order, so they are
# sorted to give the seeded shuffle below a stable starting point.
augmented_images = sorted(f for f in os.listdir(aug_images_dir) if f.endswith('.jpg'))

# Shuffle with a dedicated, seeded generator: re-running the cell reproduces
# the same split instead of copying a file into a different split on top of
# the previous run, and the global random state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(augmented_images)
train_split = int(0.7 * len(augmented_images))  # first 70% -> train
val_split = int(0.85 * len(augmented_images))   # next 15% -> validation, rest -> test
# NOTE(review): the split is made per file. If several files are augmentations
# of the same source image they can land in different splits; confirm whether
# a group-wise split is needed.

for i, img_file in enumerate(tqdm(augmented_images, desc="Splitting dataset")):
    # Swap only the extension; str.replace would also rewrite a ".jpg" that
    # happens to occur earlier in the file name.
    label_file = os.path.splitext(img_file)[0] + ".txt"
    src_img_path = os.path.join(aug_images_dir, img_file)
    src_txt_path = os.path.join(aug_labels_dir, label_file)

    if i < train_split:
        dst_dir = train_dir
    elif i < val_split:
        dst_dir = val_dir
    else:
        dst_dir = test_dir

    try:
        shutil.copy(src_img_path, os.path.join(dst_dir, "images", img_file))
        shutil.copy(src_txt_path, os.path.join(dst_dir, "labels", label_file))
    except Exception as e:
        # Best effort: report the failing pair and continue with the next one.
        print(f"Error copying {img_file}: {e}")

Splitting dataset: 100%|██████████| 110322/110322 [07:46<00:00, 236.61it/s]
