In [None]:
# Install the DejaVu fonts and fontconfig, then force a rebuild of the font cache.
!apt-get update
!apt-get install -y fonts-dejavu
!apt-get install -y fontconfig
!fc-cache -fv

# Mount Google Drive at /content/drive (the input/output archives used below live there).
from google.colab import drive
drive.mount('/content/drive')

import os
import random
import numpy as np
from PIL import Image, ImageDraw, ImageFont, ImageFilter, ImageEnhance
import zipfile
from io import BytesIO
import re

In [None]:
# Prefix placed at the start of every generated image / ground-truth file name.
LANG_PREFIX = "Adyghe_OCRFixed"

# Rendering parameters.
FONT_SIZE = 24           # size passed to ImageFont.truetype
DPI = 150                # DPI value written into the saved PNG files
LINE_SPACING = 1.2       # line height = int(FONT_SIZE * LINE_SPACING)
MARGIN_PX = 20           # margin in pixels added on every side of the text block

# Augmentation switches; every listed value is rendered for each text/font pair.
HAS_SKEW = [0, 1]        # 0 - no, 1 - yes
HAS_THICKNESS = [0, 1]   # 0 - no, 1 - yes

# Directory the input .txt archive is extracted into.
TMP_DIR = "/content/tmp_txt"


def get_cyrillic_fonts():
    """Return the fonts used for rendering as a ``label -> .ttf path`` dict.

    Returns:
        dict: a new dict on every call with the keys ``"Sans"``, ``"Serif"``
        and ``"Mono"`` (in that insertion order), each mapped to the path of
        the corresponding DejaVu TrueType file.
    """
    labelled_paths = [
        ("Sans", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"),
        ("Serif", "/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf"),
        ("Mono", "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf"),
    ]
    return dict(labelled_paths)


def extract_part_counter(filename):
    """Return the first run of decimal digits found in ``filename`` as an int.

    Args:
        filename: name (or path) of a text file; it is scanned left to right.

    Returns:
        int: value of the first digit run, e.g. ``"part_000123.txt"`` -> 123.

    Raises:
        ValueError: if ``filename`` contains no digits at all.
    """
    first_number = re.search(r"\d+", filename)
    if first_number is None:
        raise ValueError(f"нет числа в имени файла: {filename}")
    return int(first_number.group())


def split_text_into_lines_by_width(text, font, max_width):
    """Greedily wrap ``text`` into lines that fit within ``max_width``.

    Words (whitespace-separated) are appended to the current line until the
    right edge reported by ``font.getbbox`` exceeds ``max_width``; the word
    that overflowed then starts the next line.

    Args:
        text: the text to wrap.
        font: object with a ``getbbox(str)`` method whose result has the
            right edge at index 2 (e.g. a PIL ``ImageFont``).
        max_width: maximum allowed right edge of a line.

    Returns:
        list[str]: the wrapped lines, words joined by single spaces. Empty
        for empty / whitespace-only ``text``. A single word wider than
        ``max_width`` is kept on a line of its own (it is never broken up).
    """
    lines = []
    current = []

    for word in text.split():
        current.append(word)
        if font.getbbox(" ".join(current))[2] > max_width:
            current.pop()
            # ``current`` is empty here when ``word`` on its own is already
            # wider than ``max_width``; flushing it would emit a spurious
            # empty line in front of the long word.
            if current:
                lines.append(" ".join(current))
            current = [word]

    if current:
        lines.append(" ".join(current))

    return lines


def create_background(width, height):
    """Create a light-grey RGB canvas with faint Gaussian pixel noise.

    Args:
        width: canvas width in pixels.
        height: canvas height in pixels.

    Returns:
        PIL.Image.Image: image filled with (245, 245, 245) plus per-channel
        noise drawn from a normal distribution (mean 0, sigma 1.0), clipped
        to the 0..255 range.
    """
    canvas = np.array(Image.new("RGB", (width, height), (245, 245, 245)))
    grain = np.random.normal(0, 1.0, (height, width, 3))
    noisy = np.clip(canvas + grain, 0, 255)
    return Image.fromarray(noisy.astype(np.uint8))


def add_scan_distortions(image, intensity=0.25):
    """Randomly apply mild noise and a brightness change to ``image``.

    Args:
        image: input image (anything ``np.array`` can convert to pixels).
        intensity: probability of adding Gaussian noise; the brightness
            change is applied with probability ``0.3 * intensity``.

    Returns:
        PIL.Image.Image: a new image with values clipped to 0..255.
    """
    pixels = np.array(image).astype(np.float32)

    # light noise
    apply_noise = random.random() < intensity
    if apply_noise:
        sigma = random.uniform(0.5, 1.2)
        pixels += np.random.normal(0, sigma, pixels.shape)

    # slight brightness change
    brightness_probability = 0.3 * intensity
    if random.random() < brightness_probability:
        gain = random.uniform(0.97, 1.03)
        pixels = pixels * gain

    clipped = np.clip(pixels, 0, 255)
    return Image.fromarray(clipped.astype(np.uint8))


def _render_text_image(lines, font, img_size, line_height, skew, thick):
    """Draw ``lines`` on a fresh noisy background and apply the optional effects."""
    img = create_background(*img_size)
    draw = ImageDraw.Draw(img)

    y = MARGIN_PX
    for ln in lines:
        if thick:
            # A grey copy shifted by one pixel is drawn under the black text.
            draw.text((MARGIN_PX + 1, y + 1), ln, font=font, fill=(90, 90, 90))
        draw.text((MARGIN_PX, y), ln, font=font, fill=(0, 0, 0))
        y += line_height

    if skew:
        img = img.rotate(
            random.uniform(-1.2, 1.2),
            expand=True,
            fillcolor=(245, 245, 245)
        )

    return add_scan_distortions(img, intensity=0.25)


def generate_from_txt_zip(
    txt_zip_path,
    images_zip_path,
    gt_zip_path,
    file_slice=(0, None),
    max_width=2000
):
    """Render every text file of a zip archive into synthetic line images.

    For each selected ``.txt`` file the whitespace-normalised text is rendered
    once per font from ``get_cyrillic_fonts()`` and per combination of
    ``HAS_SKEW`` x ``HAS_THICKNESS``. Each image is stored as
    ``{LANG_PREFIX}_{counter:06d}_{font}_{skew}_{thick}.png`` in the images
    archive, with the text stored under the same base name plus ``.gt.txt``
    in the ground-truth archive.

    Args:
        txt_zip_path: path of the input zip containing ``.txt`` files.
        images_zip_path: output zip for PNG images (opened in append mode).
        gt_zip_path: output zip for ground-truth texts (opened in append mode).
        file_slice: ``(start, stop)`` slice applied to the sorted list of
            ``.txt`` members; ``stop`` may be ``None``.
        max_width: widest allowed text line, passed to
            ``split_text_into_lines_by_width``.

    Raises:
        ValueError: if a selected file name contains no number
            (from ``extract_part_counter``).
    """
    # Fonts do not depend on the text file, so load each of them only once.
    fonts = {
        label: ImageFont.truetype(path, FONT_SIZE)
        for label, path in get_cyrillic_fonts().items()
    }
    line_height = int(FONT_SIZE * LINE_SPACING)
    os.makedirs(TMP_DIR, exist_ok=True)

    # Read the input zip; only the selected members are extracted.
    with zipfile.ZipFile(txt_zip_path, "r") as zin:
        txt_files = sorted(f for f in zin.namelist() if f.endswith(".txt"))
        txt_files = txt_files[file_slice[0]:file_slice[1]]
        zin.extractall(TMP_DIR, members=txt_files)

    # Output zips are opened for appending. The ``with`` block guarantees they
    # are closed (and their central directory written) even if a file fails.
    with zipfile.ZipFile(images_zip_path, "a", zipfile.ZIP_DEFLATED) as images_zip, \
            zipfile.ZipFile(gt_zip_path, "a", zipfile.ZIP_DEFLATED) as gt_zip:

        for txt_name in txt_files:
            part_counter = extract_part_counter(txt_name)

            with open(os.path.join(TMP_DIR, txt_name), encoding="utf-8") as f:
                text = re.sub(r"\s+", " ", f.read()).strip()

            if not text:
                # Nothing to render: an empty text yields no lines, and the
                # width computation below would fail on an empty sequence.
                print(f"skipping empty text file: {txt_name}")
                continue

            for font_label, font in fonts.items():
                lines = split_text_into_lines_by_width(text, font, max_width)

                text_height = line_height * len(lines)
                text_width = max(font.getbbox(line)[2] for line in lines)
                img_size = (
                    text_width + 2 * MARGIN_PX,
                    text_height + 2 * MARGIN_PX,
                )

                for skew in HAS_SKEW:
                    for thick in HAS_THICKNESS:
                        img = _render_text_image(
                            lines, font, img_size, line_height, skew, thick
                        )

                        base_name = (
                            f"{LANG_PREFIX}_"
                            f"{part_counter:06d}_"
                            f"{font_label}_"
                            f"{skew}_"
                            f"{thick}"
                        )

                        img_buf = BytesIO()
                        img.save(img_buf, format="PNG", dpi=(DPI, DPI))
                        images_zip.writestr(f"{base_name}.png", img_buf.getvalue())

                        gt_zip.writestr(f"{base_name}.gt.txt", text)

In [None]:
# Render the first 1000 .txt files (in sorted name order) of the input archive;
# the images and ground-truth texts are appended to the two output archives on Drive.
generate_from_txt_zip(
    txt_zip_path="/content/drive/MyDrive/input_texts.zip",
    images_zip_path="/content/drive/MyDrive/generated_images.zip",
    gt_zip_path="/content/drive/MyDrive/generated_gt.zip",
    file_slice=(0, 1000)
)