# CoCo2017 Data Download

In [1]:
import os  
import requests  
import zipfile  
from tqdm.notebook import tqdm  
import time  
import shutil  

In [2]:
# 创建存储目录  
base_dir = "./coco2017"  # 修改为您希望保存数据集的位置  
os.makedirs(base_dir, exist_ok=True)  
os.makedirs(os.path.join(base_dir, "images"), exist_ok=True)  

# 定义下载函数  
def download_file(url, save_path):  
    if os.path.exists(save_path):  
        print(f"文件 {save_path} 已存在，跳过下载")  
        return  
    
    print(f"正在下载: {url}")  
    response = requests.get(url, stream=True)  
    total_size = int(response.headers.get('content-length', 0))  
    block_size = 1024 * 1024  # 1MB  
    
    with open(save_path, 'wb') as f:  
        progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)  
        for data in response.iter_content(block_size):  
            progress_bar.update(len(data))  
            f.write(data)  
        progress_bar.close()  
    
    print(f"下载完成: {save_path}")  

# 定义解压函数  
def extract_zip(zip_path, extract_to):  
    print(f"解压文件: {zip_path}")  
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:  
        for member in tqdm(zip_ref.infolist(), desc='解压中'):  
            try:  
                zip_ref.extract(member, extract_to)  
            except zipfile.error as e:  
                print(f"解压错误: {e}")  
    print(f"解压完成")  

# 下载和解压文件  
files_to_download = [  
    {  
        "name": "训练集图像",  
        "url": "http://images.cocodataset.org/zips/train2017.zip",  
        "save_path": os.path.join(base_dir, "train2017.zip"),  
        "extract_to": os.path.join(base_dir, "images")  
    },  
    {  
        "name": "验证集图像",  
        "url": "http://images.cocodataset.org/zips/val2017.zip",  
        "save_path": os.path.join(base_dir, "val2017.zip"),  
        "extract_to": os.path.join(base_dir, "images")  
    },  
    {  
        "name": "测试集图像",  
        "url": "http://images.cocodataset.org/zips/test2017.zip",  
        "save_path": os.path.join(base_dir, "test2017.zip"),  
        "extract_to": os.path.join(base_dir, "images")  
    },  
    {  
        "name": "标注文件",  
        "url": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip",  
        "save_path": os.path.join(base_dir, "annotations_trainval2017.zip"),  
        "extract_to": base_dir  
    }  
]  

In [3]:
# 执行下载和解压  
for file_info in files_to_download[-2:-1]:  
    print(f"\n开始处理: {file_info['name']}")  
    download_file(file_info["url"], file_info["save_path"])  
    extract_zip(file_info["save_path"], file_info["extract_to"])  
    
    # 可选：删除zip文件以节省空间  
    # os.remove(file_info["save_path"])  
    # print(f"已删除zip文件: {file_info['save_path']}")  

print("\nCOCO 2017数据集下载和解压完成！")  


开始处理: 测试集图像
文件 ./coco2017\test2017.zip 已存在，跳过下载
解压文件: ./coco2017\test2017.zip


解压中:   0%|          | 0/40671 [00:00<?, ?it/s]

解压完成

COCO 2017数据集下载和解压完成！


In [4]:
# 验证文件数量  
train_count = len(os.listdir(os.path.join(base_dir, "images", "train2017")))  
val_count = len(os.listdir(os.path.join(base_dir, "images", "val2017")))  
print(f"训练集图像数量: {train_count} (应为 118,287)")  
print(f"验证集图像数量: {val_count} (应为 5,000)")

训练集图像数量: 118287 (应为 118,287)
验证集图像数量: 5000 (应为 5,000)
