In [None]:
# Delete cached DeepSeek-OCR-2 weights and cached remote-code modules from the Hugging Face cache.
!rm -rf ~/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-OCR-2*
!rm -rf ~/.cache/huggingface/modules/transformers_modules/deepseek-ai--DeepSeek-OCR-2*

# Uninstall the current packages first, then install the pinned versions below.
!pip uninstall -y transformers tokenizers flash-attn
!pip uninstall -y torchvision torchaudio
# torch / torchvision / torchaudio are pulled from the PyTorch cu118 wheel index.
!pip install --no-cache torch==2.6.0 --index-url https://download.pytorch.org/whl/cu118
!pip install --no-cache transformers==4.46.3 tokenizers==0.20.3
!pip install torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
# Small helper libraries (not version-pinned).
!pip install einops addict easydict

In [None]:
# Sanity check of the pinned installation: print the version of each core library.
import torch
import torchvision
import transformers

# Expected output, in order: 2.6.0 (torch), 0.21.0 (torchvision), 4.46.3 (transformers).
for installed_module in (torch, torchvision, transformers):
    print(installed_module.__version__)

In [None]:
# Extra dependencies of the pipeline cells: num2words and python-docx for the Word
# report, and openai, which is imported by the next cell but was not installed anywhere.
!pip install num2words python-docx openai

In [None]:
import os
import json
import logging
import torch
from num2words import num2words
from docx import Document
from docx.shared import Pt, Cm
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH
from openai import OpenAI
from transformers import AutoModel, AutoTokenizer
import time
import random
import gc
import re

# ================= SETTINGS =================
# Read the key from the OPENROUTER_API_KEY environment variable when it is set, so a
# real secret never has to be pasted into (and saved with) the notebook. The original
# placeholder stays as the fallback value.
API_KEY = os.environ.get("OPENROUTER_API_KEY") or "your-openrouter-api-key-here"
BASE_URL = "https://openrouter.ai/api/v1"
model_name = 'deepseek-ai/DeepSeek-OCR-2'

# CUDA / allocator environment variables, exported before the model is moved to the GPU below.
os.environ["CUDA_VISIBLE_DEVICES"] = '0,1'
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Each entry is a batch directory; every non-hidden sub-directory inside it is one case.
BATCH_FOLDERS = [
    "folder/path/one", 
    "folder/path/two",
    "folder/path/three"
]

OUTPUT_FILE = "GOTOVAYA_OPIS_FULL.docx"
TEMP_OUTPUT_DIR = "temp_ocr_results"
PROGRESS_FILE = "progress.json"  # File used to save progress between runs

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# ================= MODEL INITIALIZATION =================
logging.info("–ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ DeepSeek-OCR...")

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name, 
    trust_remote_code=True, 
    use_safetensors=True
)
# Inference mode, on the GPU, in bfloat16.
model = model.eval().cuda().to(torch.bfloat16)

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

In [None]:
def clean_ocr_text(text):
    """Strip special markup tokens from OCR output.

    Falsy input yields an empty string. Otherwise the function removes
    ``<|ref|>...<|/ref|>`` spans, ``<|det|>[[...]]<|/det|>`` spans, the
    ``<|grounding|>`` token and any remaining ``<|...|>`` token, collapses runs
    of three or more newlines into two, and trims surrounding whitespace.
    """
    if not text:
        return ""

    # Paired spans come first so that the text they enclose is removed with them;
    # the catch-all token pattern runs last.
    removal_patterns = (
        r'<\|ref\|>.*?<\|/ref\|>',
        r'<\|det\|>\[\[.*?\]\]<\|/det\|>',
        r'<\|grounding\|>',
        r'<\|.*?\|>',
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return re.sub(r'\n{3,}', '\n\n', cleaned).strip()


def collect_all_cases(batch_folders):
    """Collect the paths of all case folders found inside the batch folders.

    Args:
        batch_folders: iterable of directory paths. Entries that are missing or
            are not directories are logged and skipped.

    Returns:
        A list of paths, one per non-hidden sub-directory, in batch order and
        sorted by name within each batch.
    """
    all_cases = []
    for batch_name in batch_folders:
        # isdir rather than exists: a path pointing at a regular file would pass an
        # existence check and then make os.listdir raise NotADirectoryError.
        if not os.path.isdir(batch_name):
            logging.warning(f"–ü–∞–ø–∫–∞ '{batch_name}' –Ω–µ –Ω–∞–π–¥–µ–Ω–∞! –ü—Ä–æ–ø—É—Å–∫–∞—é.")
            continue
        for item in sorted(os.listdir(batch_name)):
            full_path = os.path.join(batch_name, item)
            if os.path.isdir(full_path) and not item.startswith('.'):
                all_cases.append(full_path)
    return all_cases


def run_ocr_on_folder(folder_path):
    """Run OCR on every image of one case folder and return the combined text.

    Images are processed in sorted filename order. For each image the model
    writes ``result.mmd`` into ``TEMP_OUTPUT_DIR``; that file is read, cleaned
    with ``clean_ocr_text`` and appended to the result between per-file marker
    lines. A failure on one image is logged and the loop moves on.

    Returns:
        ``None`` when the folder contains no image files, otherwise the
        concatenated text (possibly empty if nothing was recognised).
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    image_files = [
        name for name in sorted(os.listdir(folder_path))
        if name.lower().endswith(image_suffixes)
    ]

    if len(image_files) == 0:
        return None

    logging.info(f"--- –û–±—Ä–∞–±–æ—Ç–∫–∞: {folder_path} ({len(image_files)} —Ñ–æ—Ç–æ) ---")

    if not os.path.exists(TEMP_OUTPUT_DIR):
        os.makedirs(TEMP_OUTPUT_DIR)

    result_file = os.path.join(TEMP_OUTPUT_DIR, "result.mmd")
    collected_parts = []

    for filename in image_files:
        image_path = os.path.join(folder_path, filename)

        # Free cached GPU memory before each image.
        torch.cuda.empty_cache()
        gc.collect()

        # Drop the previous image's output so a stale file is never read back.
        if os.path.exists(result_file):
            os.remove(result_file)

        try:
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
            model.infer(tokenizer, prompt=prompt, image_file=image_path,
                        output_path=TEMP_OUTPUT_DIR, base_size=1024,
                        image_size=768, crop_mode=True, save_results=True)

            if not os.path.exists(result_file):
                logging.warning(f"‚úó {filename}: result.mmd –Ω–µ —Å–æ–∑–¥–∞–Ω")
                continue

            with open(result_file, 'r', encoding='utf-8') as f:
                text_content = clean_ocr_text(f.read())

            if not text_content:
                logging.warning(f"‚úó {filename}: –ø—É—Å—Ç–æ–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç –ø–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏")
                continue

            collected_parts.append(f"\n\n=== –§–ê–ô–õ: {filename} ===\n")
            collected_parts.append(text_content)
            collected_parts.append(f"\n=== –ö–û–ù–ï–¶ –§–ê–ô–õ–ê ===\n")
            logging.info(f"‚úì {filename}: –ø–æ–ª—É—á–µ–Ω–æ {len(text_content)} —Å–∏–º–≤–æ–ª–æ–≤")

        except Exception as e:
            logging.error(f"–û—à–∏–±–∫–∞ OCR {filename}: {e}")
            torch.cuda.empty_cache()
            gc.collect()

    return "".join(collected_parts)


def analyze_structure_with_api(ocr_text, max_retries=6):
    """Ask the chat-completions API to turn OCR text into one inventory record.

    The text is cleaned with ``clean_ocr_text``, truncated to 120000 characters
    and sent together with the system prompt. The reply is expected to be a JSON
    object; Markdown code fences and any text around the outermost ``{...}`` are
    stripped before parsing.

    Args:
        ocr_text: raw OCR text of one case folder.
        max_retries: number of attempts. Failed attempts are retried with
            exponential backoff and jitter.

    Returns:
        A dict parsed from the reply, or a placeholder record (error title,
        empty fields) when every attempt failed or ``max_retries`` is below 1.
        The return value is always a dict.
    """

    # Returned whenever no usable answer could be obtained.
    fallback = {
        "title": "–û—à–∏–±–∫–∞ –∞–Ω–∞–ª–∏–∑–∞",
        "date": "",
        "pages": "",
        "storage": "5 –ª–µ—Ç",
        "index": ""
    }

    ocr_text = clean_ocr_text(ocr_text)
    
    system_prompt = """
–¢—ã ‚Äî –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç—É–∞–ª—å–Ω—ã–π –∞—Ä—Ö–∏–≤–∞—Ä–∏—É—Å. –ù–∞ –≤—Ö–æ–¥–µ ‚Äî OCR —Ç–µ–∫—Å—Ç–∞ –ø–∞–ø–∫–∏/–¥–µ–ª–∞.

–¢–≤–æ—è –∑–∞–¥–∞—á–∞: —Å–æ—Å—Ç–∞–≤–∏—Ç—å –∑–∞–ø–∏—Å—å –¥–ª—è –æ–ø–∏—Å–∏. –í–µ—Ä–Ω–∏ –¢–û–õ–¨–ö–û JSON (–±–µ–∑ ```).

–ü–†–ê–í–ò–õ–ê:
1) –ï—Å–ª–∏ –µ—Å—Ç—å –ø—Ä–∏–∑–Ω–∞–∫–∏ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ ‚Äî –ù–ï–õ–¨–ó–Ø –ø–∏—Å–∞—Ç—å, —á—Ç–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ –Ω–µ—Ç.
2) –ï—Å–ª–∏ —ç—Ç–æ –Ω–∞–±–æ—Ä —Ä–∞–∑–Ω–æ—Ä–æ–¥–Ω—ã—Ö –º–∞—Ç–µ—Ä–∏–∞–ª–æ–≤ ‚Äî "–ü–æ–¥–±–æ—Ä–∫–∞ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ (‚Ä¶–≤–∏–¥—ã –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤‚Ä¶)".

–ü–û–õ–Ø JSON:
- "title": –∞—Ä—Ö–∏–≤–Ω–æ–µ –Ω–∞–∑–≤–∞–Ω–∏–µ + –≤ —Å–∫–æ–±–∫–∞—Ö –≤–∏–¥—ã –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤
- "date": –∫—Ä–∞–π–Ω–∏–µ –¥–∞—Ç—ã (—Ñ–æ—Ä–º–∞—Ç "2016" –∏–ª–∏ "2016-2017"). –ï—Å–ª–∏ –Ω–µ –Ω–∞–π–¥–µ–Ω–æ ‚Äî ""
- "pages": –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ª–∏—Å—Ç–æ–≤ (—á–∏—Å–ª–æ). –ï—Å–ª–∏ –Ω–µ –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–æ ‚Äî ""
- "storage": —Å—Ä–æ–∫ —Ö—Ä–∞–Ω–µ–Ω–∏—è. –ü–æ —É–º–æ–ª—á–∞–Ω–∏—é "5 –ª–µ—Ç"
- "index": –∏–Ω–¥–µ–∫—Å/–Ω–æ–º–µ—Ä –¥–µ–ª–∞. –ï—Å–ª–∏ –Ω–µ –Ω–∞–π–¥–µ–Ω ‚Äî ""

–§–æ—Ä–º–∞—Ç –æ—Ç–≤–µ—Ç–∞: –¢–û–õ–¨–ö–û JSON-–æ–±—ä–µ–∫—Ç, –±–µ–∑ –ø–æ—è—Å–Ω–µ–Ω–∏–π.
"""

    truncated_text = ocr_text[:120000]
    logging.info(f"–û—Ç–ø—Ä–∞–≤–ª—è–µ–º –≤ API —Ç–µ–∫—Å—Ç –¥–ª–∏–Ω–æ–π {len(truncated_text)} —Å–∏–º–≤–æ–ª–æ–≤")
    logging.info(f"–ü–µ—Ä–≤—ã–µ 500 —Å–∏–º–≤–æ–ª–æ–≤:\n{truncated_text[:500]}")

    base_delay = 1.5

    for attempt in range(1, max_retries + 1):
        try:
            response = client.chat.completions.create(
                model="xiaomi/mimo-v2-flash",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"–í–æ—Ç —Ä–∞—Å–ø–æ–∑–Ω–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç:\n\n{truncated_text}"},
                ],
                temperature=0.1,
                max_tokens=2500
            )

            content = response.choices[0].message.content
            logging.info(f"–û—Ç–≤–µ—Ç API: {content}")

            if not content:
                raise ValueError("empty response content")

            content = content.replace("```json", "").replace("```", "").strip()

            # Keep only the outermost {...} so explanatory text around the object
            # does not break parsing.
            start, end = content.find("{"), content.rfind("}")
            if start != -1 and end > start:
                content = content[start:end + 1]

            parsed = json.loads(content)
            # Callers use .get() on the result, so a list/str/number reply counts as
            # a failed attempt and goes through the retry path.
            if not isinstance(parsed, dict):
                raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
            return parsed

        except Exception as e:
            logging.error(f"[API ERROR] –ü–æ–ø—ã—Ç–∫–∞ {attempt}/{max_retries}: {e}")

            if attempt == max_retries:
                return fallback

            # Exponential backoff (1.5s, 3s, 6s, ...) with +/-20% jitter.
            delay = base_delay * (2 ** (attempt - 1))
            jitter = random.uniform(0.8, 1.2)
            time.sleep(delay * jitter)

    # Reached only when max_retries < 1, i.e. the loop body never ran.
    return fallback


# ================= SAVING / LOADING PROGRESS =================

def save_progress(table_rows, processed_paths):
    """Save the current progress to the JSON checkpoint ``PROGRESS_FILE``.

    The data is first written to a temporary file next to the checkpoint and
    then moved into place with ``os.replace``. If serialisation or writing
    fails, the previous checkpoint is left untouched instead of truncated.

    Args:
        table_rows: list of record dicts collected so far.
        processed_paths: iterable (list or set) of case-folder paths already handled.

    Raises:
        TypeError / OSError: propagated from ``json.dump`` or file operations.
    """
    data = {
        'table_rows': table_rows,
        # list(): sets are not JSON-serialisable.
        'processed_paths': list(processed_paths)
    }
    tmp_path = f"{PROGRESS_FILE}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, PROGRESS_FILE)
    finally:
        # After a successful replace the temp file is gone; this only cleans up
        # a partial file left by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    logging.info(f"üíæ –ü—Ä–æ–≥—Ä–µ—Å—Å —Å–æ—Ö—Ä–∞–Ω—ë–Ω: {len(table_rows)} –∑–∞–ø–∏—Å–µ–π")


def load_progress():
    """Load saved progress from the JSON checkpoint ``PROGRESS_FILE``.

    Returns:
        A ``(table_rows, processed_paths)`` tuple: the list of saved records and
        a set of already-processed folder paths. When the file does not exist or
        cannot be read/parsed, ``([], set())`` is returned (a read failure is
        logged as a warning).
    """
    if not os.path.exists(PROGRESS_FILE):
        return [], set()

    try:
        with open(PROGRESS_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logging.info(f"üìÇ –ó–∞–≥—Ä—É–∂–µ–Ω –ø—Ä–æ–≥—Ä–µ—Å—Å: {len(data.get('table_rows', []))} –∑–∞–ø–∏—Å–µ–π")
        saved_rows = data.get('table_rows', [])
        seen_paths = set(data.get('processed_paths', []))
        return saved_rows, seen_paths
    except Exception as e:
        logging.warning(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –ø—Ä–æ–≥—Ä–µ—Å—Å–∞: {e}")
        return [], set()


# ================= CREATING THE WORD DOCUMENT =================

def create_opis_document(table_rows, output_path):
    """Build the inventory Word document from scratch (no template) and save it.

    The document consists of a heading block, an approval block, a 7-column
    table (header row, column-number row, one row per record), a summary
    paragraph with the record count in digits and words, and a signature block.

    Args:
        table_rows: list of dicts; the keys read are 'num', 'index', 'title',
            'date', 'storage', 'pages' and 'note'. Missing or empty values fall
            back to an empty string (or the default title / storage term).
        output_path: path of the .docx file to write.
    """
    # Local import: WD_ORIENT is not among the notebook's top-level imports.
    from docx.enum.section import WD_ORIENT

    doc = Document()

    # Page setup (landscape A4). Width/height alone give the landscape size; the
    # orientation flag is set as well so the section is marked landscape.
    section = doc.sections[0]
    section.orientation = WD_ORIENT.LANDSCAPE
    section.page_width = Cm(29.7)
    section.page_height = Cm(21)
    section.left_margin = Cm(2)
    section.right_margin = Cm(1.5)
    section.top_margin = Cm(1.5)
    section.bottom_margin = Cm(1.5)

    # === HEADING ===
    def add_text(text, size=14, bold=False, center=False):
        """Append a paragraph with a single Times New Roman run."""
        para = doc.add_paragraph()
        if center:
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.add_run(text)
        run.font.size = Pt(size)
        run.font.name = 'Times New Roman'
        run.font.bold = bold
        return para

    def fill_cell(cell, text, size, bold=False, center=True):
        """Replace the content of a table cell's first paragraph with one run."""
        para = cell.paragraphs[0]
        para.clear()
        run = para.add_run(text)
        if bold:
            run.font.bold = True
        run.font.size = Pt(size)
        run.font.name = 'Times New Roman'
        if center:
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    add_text("–ì–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω–æ–µ –∞–≤—Ç–æ–Ω–æ–º–Ω–æ–µ —É—á—Ä–µ–∂–¥–µ–Ω–∏–µ", center=True)
    add_text("¬´–ü–†–ò–ú–ï–† –ü–†–ò–ú–ï–† –ü–†–ò–ú–ï–†", center=True)

    doc.add_paragraph()

    # Approval block (right-aligned)
    approve = doc.add_paragraph()
    approve.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    run_app = approve.add_run(
        "–£–¢–í–ï–†–ñ–î–ê–Æ\n"
        "–†–£–ö–û–í–û–î–ò–¢–ï–õ–¨ —Å–ª—É–∂–±—ã\n"
        "____________–ò.–ò. –ò–í–ê–ù–û–í\n"
        "¬´___¬ª_____________ 2026 –≥."
    )
    run_app.font.size = Pt(12)
    run_app.font.name = 'Times New Roman'

    doc.add_paragraph()

    add_text("–û–ø–∏—Å—å ‚Ññ ____", size=16, bold=True, center=True)
    add_text("–∑–∞ 2010-2026 –≥–æ–¥—ã", size=14, center=True)

    doc.add_paragraph()

    # === TABLE ===
    table = doc.add_table(rows=1, cols=7)
    table.style = 'Table Grid'
    table.alignment = WD_TABLE_ALIGNMENT.CENTER

    # Column headers
    headers = ['‚Ññ –ø/–ø', '–ò–Ω–¥–µ–∫—Å –¥–µ–ª–∞', '–ó–∞–≥–æ–ª–æ–≤–æ–∫ –¥–µ–ª–∞', '–î–∞—Ç–∞ –¥–µ–ª–∞', 
               '–°—Ä–æ–∫ —Ö—Ä–∞–Ω–µ–Ω–∏—è', '–ö–æ–ª-–≤–æ –ª–∏—Å—Ç–æ–≤', '–ü—Ä–∏–º.']

    # `row.cells` is fetched once per row rather than once per cell.
    for cell, header in zip(table.rows[0].cells, headers):
        fill_cell(cell, header, size=10, bold=True)

    # Column numbering row (1..7)
    for position, cell in enumerate(table.add_row().cells, 1):
        fill_cell(cell, str(position), size=9)

    # === DATA ===
    for row_data in table_rows:
        cells = table.add_row().cells

        # (text, centered?) for each of the seven columns. `or` fallbacks keep a
        # None value from being rendered as the literal text "None".
        values = [
            (str(row_data.get('num', '')), True),
            (str(row_data.get('index', '') or ''), True),
            (str(row_data.get('title') or '–ù–µ —Ä–∞—Å–ø–æ–∑–Ω–∞–Ω–æ'), False),
            (str(row_data.get('date', '') or ''), True),
            (str(row_data.get('storage', '5 –ª–µ—Ç') or '5 –ª–µ—Ç'), True),
            (str(row_data.get('pages', '') or ''), True),
            (str(row_data.get('note', '') or ''), False)
        ]

        for cell, (value, center) in zip(cells, values):
            fill_cell(cell, value, size=10, center=center)

    doc.add_paragraph()

    # === SUMMARY RECORD ===
    total = len(table_rows)
    try:
        total_words = num2words(total, lang='ru')
    except Exception:
        # Fall back to digits if the number cannot be spelled out.
        total_words = str(total)

    summary = doc.add_paragraph()
    run_sum = summary.add_run(
        f"–í –¥–∞–Ω–Ω—ã–π —Ä–∞–∑–¥–µ–ª –æ–ø–∏—Å–∏ –≤–Ω–µ—Å–µ–Ω–æ {total} ({total_words}) –¥–µ–ª "
        f"—Å ‚Ññ 1 –ø–æ ‚Ññ {total}, –≤ —Ç–æ–º —á–∏—Å–ª–µ:\n"
        f"–ª–∏—Ç–µ—Ä–Ω—ã–µ –Ω–æ–º–µ—Ä–∞: ‚Äî\n"
        f"–ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã–µ –Ω–æ–º–µ—Ä–∞: ‚Äî"
    )
    run_sum.font.size = Pt(12)
    run_sum.font.name = 'Times New Roman'

    doc.add_paragraph()
    doc.add_paragraph()

    # Signatures
    signatures = doc.add_paragraph()
    run_sign = signatures.add_run(
        "–ì–ª–∞–≤–Ω—ã–π —Å–ø–µ—Ü–∏–∞–ª–∏—Å—Ç _________________ –ò.–û.–§–∞–º–∏–ª–∏—è\n\n"
        "¬´_____¬ª___________ 2026 –≥.\n\n\n"
        "–ü–µ—Ä–µ–¥–∞–ª ______________________________________________ –µ–¥. —Ö—Ä.\n"
        "                    (—Ü–∏—Ñ—Ä–∞–º–∏ –∏ –ø—Ä–æ–ø–∏—Å—å—é)\n\n\n"
        "–ì–ª–∞–≤–Ω—ã–π —Å–ø–µ—Ü–∏–∞–ª–∏—Å—Ç –û–û–î _________________ –ò.–û.–§–∞–º–∏–ª–∏—è\n\n"
        "¬´_____¬ª___________ 2026 –≥.\n\n\n"
        "–ü—Ä–∏–Ω—è–ª ______________________________________________ –µ–¥. —Ö—Ä.\n"
        "                    (—Ü–∏—Ñ—Ä–∞–º–∏ –∏ –ø—Ä–æ–ø–∏—Å—å—é)\n\n"
        "–ù–∞–∏–º–µ–Ω–æ–≤–∞–Ω–∏–µ –¥–æ–ª–∂–Ω–æ—Å—Ç–∏ —Ä–∞–±–æ—Ç–Ω–∏–∫–∞ –∞—Ä—Ö–∏–≤–∞ _________________ –ò.–û.–§–∞–º–∏–ª–∏—è\n\n"
        "¬´_____¬ª___________ 2026 –≥."
    )
    run_sign.font.size = Pt(12)
    run_sign.font.name = 'Times New Roman'

    # Save
    doc.save(output_path)
    logging.info(f"‚úÖ Word –¥–æ–∫—É–º–µ–Ω—Ç —Å–æ—Ö—Ä–∞–Ω—ë–Ω: {output_path}")


# ================= MAIN FUNCTION =================

def main():
    """Run the whole pipeline: OCR each case folder, analyse it, build the report.

    Steps:
      1. Collect case folders from ``BATCH_FOLDERS``.
      2. Load saved progress and skip folders that were already processed.
      3. For every remaining folder: run OCR, dump the text to a debug file,
         query the API for a structured record, append it and checkpoint.
      4. Write all records to ``final_records.json`` and create the Word file.
    """
    logging.info("–°–∫–∞–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ –ø–∞–ø–æ–∫...")
    all_cases_paths = collect_all_cases(BATCH_FOLDERS)
    
    if not all_cases_paths:
        print("–ù–µ –Ω–∞–π–¥–µ–Ω–æ –Ω–∏ –æ–¥–Ω–æ–π –ø–∞–ø–∫–∏ —Å –¥–µ–ª–∞–º–∏!")
        return

    print(f"–í—Å–µ–≥–æ –Ω–∞–π–¥–µ–Ω–æ –¥–µ–ª –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏: {len(all_cases_paths)}")
    
    # Load saved progress, if any
    table_rows, processed_paths = load_progress()
    
    # Number the next record will get
    start_num = len(table_rows) + 1
    
    if processed_paths:
        print(f"üìÇ –ü—Ä–æ–¥–æ–ª–∂–∞–µ–º —Å –¥–µ–ª–∞ #{start_num} (—É–∂–µ –æ–±—Ä–∞–±–æ—Ç–∞–Ω–æ: {len(processed_paths)})")

    for folder_path in all_cases_paths:
        
        # Skip folders handled in a previous run
        if folder_path in processed_paths:
            continue
        
        torch.cuda.empty_cache()
        gc.collect()
        
        markdown_text = run_ocr_on_folder(folder_path)
        
        if not markdown_text or len(markdown_text.strip()) < 50:
            logging.warning(f"–ü—É—Å—Ç–∞—è –ø–∞–ø–∫–∞ –∏–ª–∏ –Ω–µ—Ç –¥–∞–Ω–Ω—ã—Ö: {folder_path}")
            processed_paths.add(folder_path)
            # Checkpoint the skip too, so a resumed run does not OCR this folder again.
            save_progress(table_rows, list(processed_paths))
            continue
        
        # Keep the raw OCR text for debugging
        debug_file = os.path.join(TEMP_OUTPUT_DIR, f"debug_{len(table_rows)+1}.txt")
        with open(debug_file, 'w', encoding='utf-8') as f:
            f.write(markdown_text)
            
        data = analyze_structure_with_api(markdown_text)
        
        row = {
            'num': len(table_rows) + 1,
            'index': data.get('index', ''),
            # The API may return null or a non-string title; normalise to str so
            # the slice in the progress line below cannot raise.
            'title': str(data.get('title') or '–ù–µ —Ä–∞—Å–ø–æ–∑–Ω–∞–Ω–æ'),
            'date': data.get('date', ''),
            'storage': data.get('storage', '5 –ª–µ—Ç'),
            'pages': data.get('pages', ''),
            'note': ''
        }
        table_rows.append(row)
        processed_paths.add(folder_path)
        
        print(f"--> [{len(table_rows)}/{len(all_cases_paths)}] –ì–æ—Ç–æ–≤–æ: {row['title'][:80]}...")
        
        # Autosave after every case
        save_progress(table_rows, list(processed_paths))
        
        torch.cuda.empty_cache()
        gc.collect()

    # Final save
    if table_rows:
        # Save the records as JSON
        final_json = "final_records.json"
        with open(final_json, 'w', encoding='utf-8') as f:
            json.dump(table_rows, f, ensure_ascii=False, indent=2)
        print(f"üíæ JSON —Å–æ—Ö—Ä–∞–Ω—ë–Ω: {final_json}")
        
        # Create the Word document; a failure here must not lose the JSON above
        try:
            create_opis_document(table_rows, OUTPUT_FILE)
            print(f"\n{'='*50}")
            print(f"‚úÖ –£–°–ü–ï–•! –û–ø–∏—Å—å —Å–æ–∑–¥–∞–Ω–∞: {OUTPUT_FILE}")
            print(f"üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ –¥–µ–ª: {len(table_rows)}")
            print(f"{'='*50}")
        except Exception as e:
            logging.error(f"–û—à–∏–±–∫–∞ —Å–æ–∑–¥–∞–Ω–∏—è Word: {e}")
            print(f"‚ö†Ô∏è Word –Ω–µ —Å–æ–∑–¥–∞–Ω, –Ω–æ –¥–∞–Ω–Ω—ã–µ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ {final_json}")
    else:
        print("‚ùå –ù–µ—Ç –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è")


In [None]:
# Run the pipeline: scan the batch folders, OCR and analyse each case, write the JSON and Word outputs.
main()

In [None]:
# === RESTORE FROM JSON (if the Word file needs to be recreated) ===

def recreate_word_from_json(json_path="final_records.json", output_path="OPIS_RESTORED.docx"):
    """Recreate the Word document from previously saved JSON.

    Two file layouts are accepted:
      * a bare list of record dicts (as in ``final_records.json``);
      * a progress checkpoint, i.e. a dict whose ``'table_rows'`` key holds
        that list (as in ``progress.json``).

    Args:
        json_path: path of the JSON file to read.
        output_path: path of the .docx file to create.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        loaded = json.load(f)

    # A checkpoint wraps the rows in a dict; passing that dict on unchanged would
    # make the document builder iterate over its keys instead of the records.
    if isinstance(loaded, dict):
        table_rows = loaded.get('table_rows', [])
    else:
        table_rows = loaded

    print(f"üìÇ –ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(table_rows)} –∑–∞–ø–∏—Å–µ–π –∏–∑ {json_path}")
    create_opis_document(table_rows, output_path)
    print(f"‚úÖ Word —Å–æ–∑–¥–∞–Ω: {output_path}")

# Uncomment if needed:
# recreate_word_from_json("progress.json", "OPIS_FROM_PROGRESS.docx")