Теперь пробуем собрать файл только из тех текстов, для которых не использовалась технология OCR. Проверку делаем по файлу `meta_06_25.tsv`, в котором хранится метаинформация о файлах.

In [2]:
# Mount Google Drive at /content/drive; the archive, metadata and output paths
# used later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# IPython shell escape: install the rarfile package, which is imported below
# to read the .rar archive.
!pip install rarfile

Collecting rarfile
  Downloading rarfile-4.2-py3-none-any.whl.metadata (4.4 kB)
Downloading rarfile-4.2-py3-none-any.whl (29 kB)
Installing collected packages: rarfile
Successfully installed rarfile-4.2


In [4]:
import os
import csv
import re
import rarfile

In [5]:
# Input and output locations on the mounted Google Drive.
RAR_FILE = "/content/drive/MyDrive/txt.rar"
META_FILE = "/content/drive/MyDrive/meta_06_25.tsv"
OUTPUT = "/content/drive/MyDrive/1_Kabard_NoOCR.txt"


# File names whose metadata row carries one of the accepted source values
# (i.e. the texts this notebook treats as produced without OCR).
no_ocr_files = set()

# newline="" hands line-ending handling to the csv module, as its
# documentation requires for file objects. utf-8-sig reads plain UTF-8
# unchanged but drops a leading BOM, which would otherwise become part of the
# first column name and make row.get() miss that column.
with open(META_FILE, encoding="utf-8-sig", newline="") as f:
    reader = csv.DictReader(f, delimiter="\t")
    for row in reader:
        # The source value is looked up under several candidate column
        # headers; the first non-empty one wins.
        source = row.get("ocr") or row.get("формат") or row.get("source")
        if source in ("Издательский формат", "электронное издание"):
            filename = row.get("filename")
            if filename:
                no_ocr_files.add(filename)

print("Файлов без OCR:", len(no_ocr_files))


def read_text_any_encoding(file_obj):
    """Read everything from *file_obj* and decode it to ``str``.

    The encoding is guessed in this order:

    1. UTF-16, when the data starts with a UTF-16 byte-order mark.
    2. UTF-8, with a leading BOM (if any) removed.
    3. cp1251 (a.k.a. windows-1251).
    4. UTF-16 without a BOM, then Mac Roman.

    Args:
        file_obj: Binary file-like object; ``read()`` must return ``bytes``.

    Returns:
        The decoded text. If every candidate encoding fails, the data is
        decoded as UTF-8 with undecodable bytes dropped.
    """
    raw = file_obj.read()

    # A UTF-16 BOM has to be honoured before the single-byte codecs get a
    # chance: cp1251 maps almost every byte value, so it would "succeed" on
    # UTF-16 data and silently return mojibake.
    if raw.startswith((b"\xff\xfe", b"\xfe\xff")):
        try:
            return raw.decode("utf-16")
        except UnicodeDecodeError:
            pass  # Not valid UTF-16 after all; try the regular candidates.

    # utf-8-sig decodes BOM-less UTF-8 exactly like utf-8 and additionally
    # strips a leading BOM, so a separate "utf-8" attempt is unnecessary.
    # "windows-1251" is an alias of cp1251 and is therefore not repeated.
    for enc in ("utf-8-sig", "cp1251", "utf-16", "macroman"):
        try:
            return raw.decode(enc)
        except UnicodeDecodeError:
            continue
    return raw.decode("utf-8", errors="ignore")


def normalize_text(text):
    """Flatten *text* onto one line with single-space separation.

    Runs of carriage returns, newlines and tabs become one space, any run of
    two or more whitespace characters becomes one space, and leading and
    trailing whitespace is removed.
    """
    substitutions = (
        (r"[\r\n\t]+", " "),
        (r"\s{2,}", " "),
    )
    # Order matters: line breaks are turned into spaces first so that the
    # second pass can merge them with neighbouring whitespace.
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Normalised contents of every selected archive member, in sorted name order.
texts = []

with rarfile.RarFile(RAR_FILE) as rf:
    for member_name in sorted(rf.namelist()):
        # Directory entries have no content to read.
        if rf.getinfo(member_name).is_dir():
            continue

        # Selection is done on the bare file name, without the archive path.
        if os.path.basename(member_name) not in no_ocr_files:
            continue

        with rf.open(member_name) as member:
            decoded = read_text_any_encoding(member)

        # normalize_text strips its result, so empty and whitespace-only
        # files come back as "" and are left out.
        cleaned = normalize_text(decoded)
        if cleaned:
            texts.append(cleaned)

# Exactly one entry is stored per file that contributed text.
used_files = len(texts)
result = " ".join(texts)

with open(OUTPUT, "w", encoding="utf-8") as out:
    out.write(result)

# Size of the corpus as it is stored on disk (UTF-8 bytes), in mebibytes.
size_mb = len(result.encode("utf-8")) / 1024 / 1024

print("Файлов использовано:", used_files)
print("Итоговый размер (MB):", round(size_mb, 2))



Файлов без OCR: 20991
Файлов использовано: 20893
Итоговый размер (MB): 314.34
