In [1]:
import os
import requests
from bs4 import BeautifulSoup
from PIL import Image
from zipfile import ZipFile
import random
import io

In [2]:
def download_images(url, min_size=(100, 100), max_size=(3000, 3000)):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    img_urls = set()
    for img in soup.find_all('img'):
        img_url = img.get('src')
        if img_url and (img_url.endswith('.jpg') or img_url.endswith('.jpeg') or img_url.endswith('.png') or img_url.endswith('.webp')):
            try:
                img_res = Image.open(io.BytesIO(requests.get(img_url).content))
                if (min_size[0] <= img_res.size[0] <= max_size[0]) and (min_size[1] <= img_res.size[1] <= max_size[1]):
                    img_urls.add(img_url)
            except Exception as e:
                print(f'Ошибка при открытии изображения {img_url}: {e}')
    zip_filename = 'images.zip'
    added_images = set()
    with ZipFile(zip_filename, 'w') as zipf:
        for img_url in img_urls:
            img_data = requests.get(img_url).content
            img_name = os.path.basename(img_url)
            if img_name not in added_images:
                zipf.writestr(img_name, img_data)
                added_images.add(img_name)
    print(f"Картинки сохранены в архив '{zip_filename}'")
    return zip_filename

In [3]:
def create_yolo_structure(zip_file_path, output_dir, proportions=(0.8, 0.2)):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    train_dir = os.path.join(output_dir, 'train')
    val_dir = os.path.join(output_dir, 'val')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)
    with ZipFile(zip_file_path, 'r') as zipf:
        zipf.extractall(output_dir)
    images = [f for f in os.listdir(output_dir) if f.endswith(('.jpg', '.jpeg', '.png', '.webp'))]
    random.shuffle(images)
    train_count = int(len(images) * proportions[0])
    train_images = images[:train_count]
    val_images = images[train_count:]
    def safe_rename(src, dest):
        base, extension = os.path.splitext(dest)
        counter = 1
        while os.path.exists(dest):
            dest = f"{base}_{counter}{extension}"
            counter += 1
        os.rename(src, dest)
    for img in train_images:
        safe_rename(os.path.join(output_dir, img), os.path.join(train_dir, img))
    for img in val_images:
        safe_rename(os.path.join(output_dir, img), os.path.join(val_dir, img))
    print(f"Наборы данных сохранены в папку '{output_dir}'")

In [4]:
url = 'https://www.drom.ru/'
zip_file = download_images(url)
create_yolo_structure(zip_file, output_dir='yolo_dataset', proportions=(0.8, 0.2))

Ошибка при открытии изображения //c.rdrom.ru/skin/blogs/sq-sm/60-x-60-song-plus.jpg: Invalid URL '//c.rdrom.ru/skin/blogs/sq-sm/60-x-60-song-plus.jpg': No scheme supplied. Perhaps you meant https:////c.rdrom.ru/skin/blogs/sq-sm/60-x-60-song-plus.jpg?
Ошибка при открытии изображения //c.rdrom.ru/skin/blogs/sq-sm/60-x-60-moskvich.png: Invalid URL '//c.rdrom.ru/skin/blogs/sq-sm/60-x-60-moskvich.png': No scheme supplied. Perhaps you meant https:////c.rdrom.ru/skin/blogs/sq-sm/60-x-60-moskvich.png?
Ошибка при открытии изображения //c.rdrom.ru/skin/blogs/sq-sm/60-x-60-niva.jpg: Invalid URL '//c.rdrom.ru/skin/blogs/sq-sm/60-x-60-niva.jpg': No scheme supplied. Perhaps you meant https:////c.rdrom.ru/skin/blogs/sq-sm/60-x-60-niva.jpg?
Ошибка при открытии изображения //c.rdrom.ru/skin/blogs/sq-sm/60-x-60-arkana.jpg: Invalid URL '//c.rdrom.ru/skin/blogs/sq-sm/60-x-60-arkana.jpg': No scheme supplied. Perhaps you meant https:////c.rdrom.ru/skin/blogs/sq-sm/60-x-60-arkana.jpg?
Картинки сохранены в ар