In [6]:
import os
import zipfile
import shutil
import random
import pandas as pd
from pathlib import Path

# Configuration
# Folder holding the source datasets (ZIP archives).
DATASETS_DIR = "folder_with_zip_datasets_text"
# Folder that receives the merged dataset.
OUTPUT_DIR = "DATASASET_TRAINING_TEXT"

# Upper bound on the number of images sampled from each dataset, per split.
TRAIN_SAMPLES_PER_DATASET = 5000
VALID_SAMPLES_PER_DATASET = 500
TEST_SAMPLES_PER_DATASET = 500

# Class-remapping table; every row that contains a missing value is discarded.
datasets_description = pd.read_excel('text_datasets_description.xlsx').dropna()

def _build_class_mapping(description, dataset_name):
    """Return ``{initial class id (int): target class id (str)}`` for one dataset.

    Only rows whose ``Dataset`` column equals ``dataset_name`` are used. When the
    same initial id appears more than once, the first row wins. Rows whose ids
    are not integral numbers are skipped.
    """
    rows = description[description['Dataset'] == dataset_name]
    mapping = {}
    for init_value, target_value in zip(rows['Class_Number_Init'], rows['Class_Number_Target']):
        try:
            init_id = int(init_value)
            if init_id != init_value:
                # Non-integral (e.g. 1.5) or non-numeric (e.g. '3') initial id:
                # it can never equal the integer parsed from a label line.
                continue
            mapping.setdefault(init_id, str(int(target_value)))
        except (TypeError, ValueError):
            continue
    return mapping


def _remap_label_line(line, class_mapping):
    """Replace the leading class id of one label line; return it unchanged if unmapped."""
    class_token, separator, remainder = line.partition(' ')
    try:
        class_id = int(class_token)
    except ValueError:
        # Blank line or a first token that is not an integer: leave untouched.
        return line
    new_token = class_mapping.get(class_id)
    if new_token is None:
        return line
    return new_token + separator + remainder


def process_labels(labels_dir, dataset_path, dataset_name):
    """Rewrite the class ids of every ``*.txt`` label file in ``labels_dir`` in place.

    The first space-separated token of each line is treated as the class id and
    is replaced using the ``Class_Number_Init`` -> ``Class_Number_Target`` rows of
    the module-level ``datasets_description`` table (loaded from
    ``text_datasets_description.xlsx``) whose ``Dataset`` equals ``dataset_name``.
    Lines whose class id is missing from the table, or is not an integer, are
    kept as they are.

    Args:
        labels_dir: Directory containing the label files.
        dataset_path: Unused; kept so existing callers keep working.
        dataset_name: Value looked up in the ``Dataset`` column.

    Note:
        The rewrite is not idempotent: calling this twice on the same files
        applies the mapping twice.
    """
    print(dataset_name)
    # Build the lookup once instead of filtering the DataFrame for every box.
    class_mapping = _build_class_mapping(datasets_description, dataset_name)
    if not class_mapping:
        print(f"Warning: no class mapping found for dataset {dataset_name!r}; labels left unchanged")
        return

    for label_file in Path(labels_dir).glob("*.txt"):
        with open(label_file, 'r') as file:
            # splitlines() does not yield a trailing empty item, so the file
            # does not gain an extra blank line when it is written back.
            lines = file.read().splitlines()

        remapped = [_remap_label_line(line, class_mapping) for line in lines]

        with open(label_file, 'w') as file:
            file.writelines(row + '\n' for row in remapped)
                
def copy_random_samples(source_dir, dest_dir, num_samples):
    """Copy a random subset of images, plus their label files, into ``dest_dir``.

    Images are taken from ``source_dir/images``; for each selected image the
    file ``source_dir/labels/<image stem>.txt`` is copied as well when it
    exists. Files land in ``dest_dir/images`` and ``dest_dir/labels``, which are
    created on demand. Nothing is created when the source has no image files.

    Args:
        source_dir: Split directory containing ``images`` (and optionally ``labels``).
        dest_dir: Destination split directory.
        num_samples: Maximum number of images to copy; fewer are copied when
            the source holds fewer files.
    """
    source_dir = Path(source_dir)
    dest_dir = Path(dest_dir)

    # Keep regular files only (shutil.copy fails on directories) and sort them,
    # because glob order is filesystem dependent and would make the sample
    # irreproducible even with a fixed random seed.
    source_images = sorted(
        path for path in (source_dir / "images").glob("*") if path.is_file()
    )
    if not source_images:
        return

    # Pick random images (never more than are available).
    selected_images = random.sample(source_images, min(num_samples, len(source_images)))

    # Create the destination folders if they are missing.
    dest_images = dest_dir / "images"
    dest_labels = dest_dir / "labels"
    dest_images.mkdir(parents=True, exist_ok=True)
    dest_labels.mkdir(parents=True, exist_ok=True)

    for img_path in selected_images:
        shutil.copy(img_path, dest_images / img_path.name)

        # Copy the matching label file; an image without one is copied alone.
        label_path = source_dir / "labels" / img_path.with_suffix(".txt").name
        if label_path.exists():
            shutil.copy(label_path, dest_labels / label_path.name)
            
def process_dataset(zip_path, output_base, dataset_name):
    """Extract one dataset archive, remap its labels and sample it into the output.

    The archive is unpacked into a scratch directory ``temp_extract`` in the
    current working directory. The first directory named ``train``, ``valid``
    and ``test`` found anywhere inside it is used for the respective split.
    Labels of every found split are remapped with ``process_labels`` and a
    random sample is copied with ``copy_random_samples``.

    Both the ``valid`` and the ``test`` split of the source are copied into
    ``output_base/valid``; only ``train`` goes to ``output_base/train``.

    Args:
        zip_path: Path of the ZIP archive.
        output_base: Root directory of the merged dataset.
        dataset_name: Name used to look up the class mapping.
    """
    output_base = Path(output_base)
    samples_per_split = {
        "train": TRAIN_SAMPLES_PER_DATASET,
        "valid": VALID_SAMPLES_PER_DATASET,
        "test": TEST_SAMPLES_PER_DATASET,
    }

    # Start from an empty scratch directory: leftovers from an interrupted run
    # would otherwise be mixed into this dataset.
    temp_dir = Path("temp_extract")
    if temp_dir.exists():
        shutil.rmtree(temp_dir)
    temp_dir.mkdir()

    try:
        # Unpack the archive.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Locate the train / valid / test folders.
        dataset_folders = {}
        for folder in samples_per_split:
            for possible_path in temp_dir.rglob(folder):
                if possible_path.is_dir():
                    dataset_folders[folder] = possible_path
                    break

        # Remap the labels, then copy a sample of each split.
        for folder_type, source_path in dataset_folders.items():
            labels_dir = source_path / "labels"
            if labels_dir.exists():
                process_labels(labels_dir, source_path, dataset_name)

            # Source 'valid' and 'test' are both merged into the output 'valid' split.
            dest_split = "train" if folder_type == "train" else "valid"
            copy_random_samples(
                source_path, output_base / dest_split, samples_per_split[folder_type]
            )
    finally:
        # Always remove the scratch directory, even when processing failed.
        shutil.rmtree(temp_dir, ignore_errors=True)
    
def main():
    """Build the merged dataset from every ZIP archive in ``DATASETS_DIR``.

    Creates ``OUTPUT_DIR`` with ``train``, ``valid`` and ``test`` sub-folders and
    runs ``process_dataset`` on each archive. The archive file name without its
    ``.zip`` extension is used as the dataset name for the class-mapping lookup.
    """
    # Create the layout of the merged dataset.
    output_base = Path(OUTPUT_DIR)
    for folder in ["train", "valid", "test"]:
        (output_base / folder).mkdir(parents=True, exist_ok=True)

    # Sorted so archives are processed in the same order on every run.
    datasets = sorted(Path(DATASETS_DIR).glob("*.zip"))
    if not datasets:
        print(f"No ZIP archives found in {DATASETS_DIR!r}")
        return

    for dataset in datasets:
        print(f"Processing {dataset.name}...")
        # .stem strips only the final extension, whatever its letter case;
        # str.replace('.zip', '') would also cut '.zip' out of the middle of a
        # name and would leave an upper-case '.ZIP' suffix in place.
        process_dataset(dataset, output_base, dataset.stem)

In [7]:
# Build the merged dataset from the ZIP archives.
# NOTE(review): copy_random_samples draws its sample with random.sample and no
# random.seed call is visible in this notebook, so each run may pick different images.
main()

Processing YOLOML_III.v1-yoloml_iii.yolov11.zip...
YOLOML_III.v1-yoloml_iii.yolov11
YOLOML_III.v1-yoloml_iii.yolov11
YOLOML_III.v1-yoloml_iii.yolov11


# Model Training

## Model for Text Recognition

In [1]:
import os

# Workaround for the "OMP: Error #15" abort raised when more than one copy of
# the OpenMP runtime gets loaded into the process. It has to be set before the
# libraries that load that runtime are imported, so it comes ahead of the
# ultralytics import.
# NOTE(review): this only silences the check; confirm it is still needed in
# this environment.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

from ultralytics import YOLO

# Load a model
model = YOLO("yolo11s.pt")

In [2]:
%%time

# Training settings, collected in one place and unpacked into model.train below.
train_settings = dict(
    data="custom_data.yaml",  # path to dataset YAML
    # epochs=30,  # number of training epochs (disabled: the run is bounded by `time`)
    time=4,  # training budget in hours (log: "Starting training for 4 hours...")
    imgsz=640,  # training image size
    device=0,  # device to run on, i.e. device=0 or device=0,1,2,3 or device=cpu
    project='text_recognition',
    name='training_3',
    exist_ok=True,
    val=True,
    batch=0.7,  # fraction: AutoBatch sizes the batch for 70% CUDA memory utilization
    save_period=50,
    patience=7,  # early stopping after 7 epochs without improvement
    seed=22,
    # cache='disk'
    # pretrained=False
)

# Train the model
train_results = model.train(**train_settings)

Ultralytics 8.3.146  Python-3.9.21 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3060, 12288MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=0.7, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=custom_data.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=100, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11s.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=training_3, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=7, perspective=0.0, plots=True, pose=1

[34m[1mtrain: [0mScanning D:\Studying\2nd Course 2024 and 2025\project_odsv_v2\features_extracting\object_detection\datasets\cust[0m

[34m[1mAutoBatch: [0mComputing optimal batch size for imgsz=640 at 70.0% CUDA memory utilization.
[34m[1mAutoBatch: [0mCUDA:0 (NVIDIA GeForce RTX 3060) 12.00G total, 0.14G reserved, 0.11G allocated, 11.75G free
      Params      GFLOPs  GPU_mem (GB)  forward (ms) backward (ms)                   input                  output





     9428179       21.55         0.849         20.87         227.6        (1, 3, 640, 640)                    list
     9428179        43.1         1.191         28.84         120.9        (2, 3, 640, 640)                    list
     9428179       86.19         1.747         49.68         117.2        (4, 3, 640, 640)                    list
     9428179       172.4         2.840         67.37         107.2        (8, 3, 640, 640)                    list
     9428179       344.8         4.933         95.13         142.8       (16, 3, 640, 640)                    list
[34m[1mAutoBatch: [0mUsing batch-size 27 for CUDA:0 8.18G/12.00G (68%) 
[34m[1mtrain: [0mFast image access  (ping: 0.00.0 ms, read: 101.926.5 MB/s, size: 24.5 KB)


[34m[1mtrain: [0mScanning D:\Studying\2nd Course 2024 and 2025\project_odsv_v2\features_extracting\object_detection\datasets\cust[0m


[34m[1mval: [0mFast image access  (ping: 0.30.1 ms, read: 64.115.8 MB/s, size: 28.2 KB)


[34m[1mval: [0mScanning D:\Studying\2nd Course 2024 and 2025\project_odsv_v2\features_extracting\object_detection\datasets\custom[0m


Plotting labels to text_recognition\training_3\labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.000421875), 87 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1mtext_recognition\training_3[0m
Starting training for 4 hours...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      1/100       6.6G      1.952      2.681      1.552        122        640: 100%|██████████| 52/52 [00:19<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04


                   all        601       1801      0.537      0.777      0.607      0.274

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      2/558      6.61G        1.3     0.9693      1.163        118        640: 100%|██████████| 52/52 [00:18<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:03

                   all        601       1801      0.161      0.816      0.156     0.0732






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      3/593      6.62G      1.314     0.9666       1.18        117        640: 100%|██████████| 52/52 [00:17<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.412      0.391      0.319      0.142






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      4/607      6.63G      1.336     0.9802      1.205        134        640: 100%|██████████| 52/52 [00:18<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.608      0.631      0.676      0.304






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      5/613      6.65G      1.254     0.9065      1.139        136        640: 100%|██████████| 52/52 [00:18<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.494      0.597      0.533      0.245






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      6/615      6.63G      1.193     0.8633      1.118        112        640: 100%|██████████| 52/52 [00:17<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:03

                   all        601       1801      0.665      0.796      0.798      0.468






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      7/620      6.62G      1.188     0.8268      1.096        135        640: 100%|██████████| 52/52 [00:18<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.849      0.904      0.931      0.582






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      8/621      6.65G      1.142     0.8114      1.089         91        640: 100%|██████████| 52/52 [00:17<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:03

                   all        601       1801      0.821      0.843      0.897      0.542






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      9/625      6.62G      1.098     0.7716      1.062        116        640: 100%|██████████| 52/52 [00:17<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:03

                   all        601       1801      0.795      0.848      0.887      0.547






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     10/629      6.62G      1.095     0.7463      1.062        129        640: 100%|██████████| 52/52 [00:17<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:03

                   all        601       1801      0.873      0.901       0.94      0.634






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     11/631      6.62G      1.088     0.7386      1.057        110        640: 100%|██████████| 52/52 [00:18<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:03

                   all        601       1801      0.812      0.881      0.907      0.578






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     12/631      6.62G      1.076     0.7343      1.055        123        640: 100%|██████████| 52/52 [00:17<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:03

                   all        601       1801      0.687      0.803      0.757      0.403






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     13/634      6.65G      1.066     0.7254      1.041        135        640: 100%|██████████| 52/52 [00:17<00:00,  3.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:03

                   all        601       1801      0.838      0.911      0.929      0.599






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     14/636      6.66G      1.028     0.6941      1.037        103        640: 100%|██████████| 52/52 [00:17<00:00,  3.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:03

                   all        601       1801      0.814      0.875      0.904      0.604






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     15/637      6.66G      1.032     0.6999      1.034        105        640: 100%|██████████| 52/52 [00:18<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.904      0.913      0.957      0.662






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     16/637      6.64G      1.019     0.6803      1.022        105        640: 100%|██████████| 52/52 [00:18<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.811      0.922      0.909      0.632






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     17/635      6.66G      0.974     0.6615      1.009        112        640: 100%|██████████| 52/52 [00:19<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.913      0.933      0.965      0.689






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     18/633      6.63G     0.9975      0.657      1.011        118        640: 100%|██████████| 52/52 [00:18<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.951      0.937      0.979      0.721






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     19/632      6.61G     0.9883     0.6525      1.009         81        640: 100%|██████████| 52/52 [00:21<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.835      0.887      0.923      0.625






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     20/626      6.63G      1.008     0.6639       1.02         96        640: 100%|██████████| 52/52 [00:21<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.909      0.922      0.961      0.671






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     21/621      6.62G     0.9882     0.6314      1.019        120        640: 100%|██████████| 52/52 [00:18<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.904      0.922      0.962      0.696






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     22/621      6.63G     0.9792     0.6458      1.008        108        640: 100%|██████████| 52/52 [00:20<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.908      0.933      0.968      0.713






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     23/618      6.66G      0.989     0.6521      1.021        121        640: 100%|██████████| 52/52 [00:19<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.902      0.929      0.963      0.663






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     24/618      6.62G     0.9514     0.6366       1.01        127        640: 100%|██████████| 52/52 [00:19<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04

                   all        601       1801      0.919      0.932      0.967      0.679






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


     25/616      6.62G     0.9558      0.644     0.9982        130        640: 100%|██████████| 52/52 [00:17<00:00,  2.
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:03

                   all        601       1801      0.918      0.925      0.972      0.702
[34m[1mEarlyStopping: [0mTraining stopped early as no improvement observed in last 7 epochs. Best results observed at epoch 18, best model saved as best.pt.
To update EarlyStopping(patience=7) pass a new patience value, i.e. `patience=300` or use `patience=0` to disable EarlyStopping.






25 epochs completed in 0.162 hours.
Optimizer stripped from text_recognition\training_3\weights\last.pt, 19.2MB
Optimizer stripped from text_recognition\training_3\weights\best.pt, 19.2MB

Validating text_recognition\training_3\weights\best.pt...
Ultralytics 8.3.146  Python-3.9.21 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3060, 12288MiB)
YOLO11s summary (fused): 100 layers, 9,413,187 parameters, 0 gradients, 21.3 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:04


                   all        601       1801       0.95      0.937      0.979      0.721
Speed: 0.2ms preprocess, 3.6ms inference, 0.0ms loss, 1.0ms postprocess per image
Results saved to [1mtext_recognition\training_3[0m
CPU times: total: 10min 27s
Wall time: 10min 36s


In [7]:
# Run the model on one video, processing every 10th frame and saving the output.
# NOTE(review): the recorded output warns that results accumulate in RAM unless
# stream=True is passed; `results` is kept as returned here.
video_id = '7507994789435002134'
video_path = '../../parsing/formatted_videos/{}.mp4'.format(video_id)
results = model(
    video_path,
    save=True,
    iou=0.1,
    vid_stride=10,
    show=False,
    conf=0.5,
    verbose=False,
)

inference results will accumulate in RAM unless `stream=True` is passed, causing potential out-of-memory
errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

Results saved to [1mtext_recognition\training_1[0m
