In [9]:
import os
import shutil
import urllib.parse
import urllib.request

import numpy as np
from PIL import Image
from tqdm import tqdm

# QuickDraw class names, listed in label order: the position of a name in this
# tuple is the integer label assigned to that class.
_class_names = (
    "lollipop",         # 0
    "binoculars",       # 1
    "mouse",            # 2
    "basket",           # 3
    "penguin",          # 4
    "washing machine",  # 5
    "canoe",            # 6
    "eyeglasses",       # 7
    "beach",            # 8
    "screwdriver",      # 9
)

# Class name -> integer label.
label_dict = {name: index for index, name in enumerate(_class_names)}

# Upper bound on the number of images exported per class (tunable).
max_images_per_class = 7000

# Base URL the per-class ``<class name>.npy`` bitmap files are fetched from.
npy_base_url = "https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/"

# Local working directories: downloaded .npy files and exported PNG images.
npy_dir = "./npy_files/"
output_dir = "./quickdraw-png_set1/"

# Make sure both directories exist before anything is written into them.
for _directory in (npy_dir, output_dir):
    os.makedirs(_directory, exist_ok=True)

def _download_atomically(url, dest_path):
    """Download ``url`` to ``dest_path`` without ever leaving a partial file there.

    The payload is written to ``dest_path + ".part"`` and renamed into place
    only after the transfer has finished. If the transfer fails, the partial
    file is removed and the exception propagates to the caller.
    """
    tmp_path = dest_path + ".part"
    try:
        urllib.request.urlretrieve(url, tmp_path)
        os.replace(tmp_path, dest_path)
    finally:
        # After a successful rename the temp file no longer exists, so this
        # only fires for an interrupted / failed transfer.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# Classes whose download failed; reported once at the end so the failure is
# not lost between progress bars.
failed_classes = []

# Process every class: fetch its .npy bitmap file, then export PNG images.
for label_name, label_index in label_dict.items():
    npy_filename = f"{label_name}.npy"
    npy_path = os.path.join(npy_dir, npy_filename)

    # Download the .npy file unless a complete copy is already on disk.
    if os.path.exists(npy_path):
        print(f"{label_name} already downloaded ✅")
    else:
        # Percent-encode the file name: class names may contain spaces.
        url = npy_base_url + urllib.parse.quote(npy_filename)
        print(f"Downloading {label_name} from {url} ...")
        try:
            _download_atomically(url, npy_path)
        except Exception as e:
            # Best effort: report the failure and carry on with the next class.
            print(f"Failed to download {label_name}: {e}")
            failed_classes.append(label_name)
            continue
        print(f"Downloaded {label_name} ✅")

    # Load the bitmaps and write the first `total` of them as PNG files.
    print(f"Processing {label_name} ...")
    data = np.load(npy_path, allow_pickle=False)  # one flattened 28x28 image per row
    total = min(len(data), max_images_per_class)

    class_dir = os.path.join(output_dir, f"{label_index}_{label_name}")
    os.makedirs(class_dir, exist_ok=True)

    for i in tqdm(range(total), desc=f"Saving {label_name}"):
        img_array = data[i].reshape(28, 28).astype(np.uint8)
        # A 2-D uint8 array is interpreted as 8-bit grayscale, so no explicit
        # mode argument is needed.
        img = Image.fromarray(img_array)
        img.save(os.path.join(class_dir, f"{label_name}_{i:05d}.png"))

if failed_classes:
    print(f"Classes skipped because their download failed: {', '.join(failed_classes)}")

# Remove the whole download directory, including every file inside it.
shutil.rmtree(npy_dir)

Downloading lollipop from https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/lollipop.npy ...
Downloaded lollipop ✅
Processing lollipop ...


Saving lollipop: 100%|█████████████████████████████████████████████████████████████| 7000/7000 [00:21<00:00, 328.44it/s]


Downloading binoculars from https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/binoculars.npy ...
Downloaded binoculars ✅
Processing binoculars ...


Saving binoculars: 100%|███████████████████████████████████████████████████████████| 7000/7000 [00:15<00:00, 439.06it/s]


Downloading mouse from https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/mouse.npy ...
Downloaded mouse ✅
Processing mouse ...


Saving mouse: 100%|████████████████████████████████████████████████████████████████| 7000/7000 [00:15<00:00, 448.03it/s]


Downloading basket from https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/basket.npy ...
Downloaded basket ✅
Processing basket ...


Saving basket: 100%|███████████████████████████████████████████████████████████████| 7000/7000 [00:15<00:00, 442.51it/s]


Downloading penguin from https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/penguin.npy ...
Downloaded penguin ✅
Processing penguin ...


Saving penguin: 100%|██████████████████████████████████████████████████████████████| 7000/7000 [00:16<00:00, 432.14it/s]


Downloading washing machine from https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/washing%20machine.npy ...
Downloaded washing machine ✅
Processing washing machine ...


Saving washing machine: 100%|██████████████████████████████████████████████████████| 7000/7000 [00:16<00:00, 429.23it/s]


Downloading canoe from https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/canoe.npy ...
Downloaded canoe ✅
Processing canoe ...


Saving canoe: 100%|████████████████████████████████████████████████████████████████| 7000/7000 [00:16<00:00, 437.47it/s]


Downloading eyeglasses from https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/eyeglasses.npy ...
Downloaded eyeglasses ✅
Processing eyeglasses ...


Saving eyeglasses: 100%|███████████████████████████████████████████████████████████| 7000/7000 [00:15<00:00, 438.98it/s]


Downloading beach from https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/beach.npy ...
Downloaded beach ✅
Processing beach ...


Saving beach: 100%|████████████████████████████████████████████████████████████████| 7000/7000 [00:15<00:00, 441.69it/s]


Downloading screwdriver from https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/screwdriver.npy ...
Downloaded screwdriver ✅
Processing screwdriver ...


Saving screwdriver: 100%|██████████████████████████████████████████████████████████| 7000/7000 [00:16<00:00, 435.36it/s]


In [18]:
import os
import pandas as pd
from tqdm import tqdm

# Root directory holding one "<label>_<class name>" sub-folder per class.
base_dir = "./quickdraw-png_set1"


def _label_of(folder_name):
    """Return the integer label encoded as the "<label>_" prefix of a folder name.

    Returns None when the part before the first underscore is not a decimal
    number, so unrelated folders can be filtered out.
    """
    prefix = folder_name.split("_")[0]
    return int(prefix) if prefix.isdecimal() else None


# Class folders ordered by numeric label. Directories that do not follow the
# "<label>_<name>" convention (e.g. ".ipynb_checkpoints") are ignored rather
# than crashing the integer conversion.
all_classes = sorted(
    (
        d for d in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, d)) and _label_of(d) is not None
    ),
    key=_label_of,
)

# (relative path, label) records, one per image.
all_data = []

for class_folder in tqdm(all_classes, desc="读取类别文件夹"):
    label = _label_of(class_folder)
    class_path = os.path.join(base_dir, class_folder)
    # Keep PNG images only, so stray files (thumbnails, OS metadata) do not end
    # up as samples. Sorting is lexicographic on the file name.
    filenames = sorted(
        fname for fname in os.listdir(class_path)
        if fname.lower().endswith(".png")
    )
    # Paths are stored relative to base_dir.
    all_data.extend((os.path.join(class_folder, fname), label) for fname in filenames)

# Collect everything into a DataFrame.
df = pd.DataFrame(all_data, columns=["Path", "Label"])

# Per-class split sizes: the first n_test_per_class rows of each class go to
# the test set, the next n_val_per_class rows to the validation set, and all
# remaining rows to the training set.
n_test_per_class = 1000
n_val_per_class = 1000
# Seed used to shuffle the training set reproducibly.
split_seed = 42

if df.empty:
    raise ValueError("No images were indexed; cannot build train/val/test splits.")

train_list, val_list, test_list = [], [], []
n_held_out = n_test_per_class + n_val_per_class

for label in sorted(df["Label"].unique()):
    cls_df = df[df["Label"] == label].reset_index(drop=True)
    # Fail fast instead of silently writing an empty training (or validation)
    # split for a class that has too few images.
    if len(cls_df) <= n_held_out:
        raise ValueError(
            f"Class {label} has only {len(cls_df)} images; more than "
            f"{n_held_out} are needed for the test/validation/train split."
        )
    test_list.append(cls_df.iloc[:n_test_per_class])             # test rows
    val_list.append(cls_df.iloc[n_test_per_class:n_held_out])    # validation rows
    train_list.append(cls_df.iloc[n_held_out:])                  # training rows

# Concatenate the per-class pieces. Only the training set is shuffled;
# validation and test keep their class-by-class order.
df_train = pd.concat(train_list).sample(frac=1, random_state=split_seed).reset_index(drop=True)
df_val = pd.concat(val_list).reset_index(drop=True)
df_test = pd.concat(test_list).reset_index(drop=True)

# Output CSV file name -> split written to it, in train / valid / test order.
csv_outputs = {
    "quickdraw_png_set1_train.csv": df_train,
    "quickdraw_png_set1_valid.csv": df_val,
    "quickdraw_png_set1_test.csv": df_test,
}

# Write every split without the DataFrame index column.
for csv_name, split_df in csv_outputs.items():
    split_df.to_csv(csv_name, index=False)

# Confirm completion and list the files that were written.
print("✅ 数据集划分完成并保存为 CSV：")
for csv_name in csv_outputs:
    print(f"- {csv_name}")


读取类别文件夹: 100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 53.84it/s]


✅ 数据集划分完成并保存为 CSV：
- quickdraw_png_set1_train.csv
- quickdraw_png_set1_valid.csv
- quickdraw_png_set1_test.csv
