In [1]:
# ============================================
# CELL 0 — CONFIG (заполни под себя)
# ============================================
import os, sys, subprocess, json, shutil, time, pathlib

# --- GitHub репозиторий проекта ---
REPO_URL        = "https://github.com/STrachov/OCRlty.git"
REPO_BRANCH     = "main"
PROJECT_DIR_NAME= "projects"                                 # папка с кодом на томе: $VROOT/project/<repo>

# --- Arctic-TILT (HF) ---
ARCTIC_TILT_ID  = "Snowflake/snowflake-arctic-tilt-v1.3"

# --- OCR настройки ---
PADDLE_OCR_LANG = "en"   # 'en'/'latin'/'eslav' и т.д.

# --- Экономия бюджета ---
AUTO_STOP_MIN   = 30     # авто-стоп пода через N минут (0 = выключено)

# --- Поведение wheelhouse ---
FORCE_REBUILD_WHEELHOUSE = False  # True → пересобрать колёса в Colab даже если папка не пуста



In [2]:
# ============================================
# CELL 1 — ENV DETECTION & ROOTS
# ============================================
def in_colab() -> bool:
    try:
        import google.colab  # noqa
        return True
    except Exception:
        return False

def in_runpod() -> bool:
    return bool(os.environ.get("RUNPOD_POD_ID") or os.environ.get("RUNPOD_UID"))

COLAB  = in_colab()
RUNPOD = in_runpod()

# Единый корень данных/моделей/кешей
if "RUNPOD_VOLUME_ROOT" in os.environ:
    VROOT = os.environ["RUNPOD_VOLUME_ROOT"]
else:
    if RUNPOD:
        VROOT = "/workspace"  # Pod + Network Volume
    else:
        if COLAB:
            from google.colab import drive
            drive.mount("/content/drive")
            VROOT = '/content/drive/MyDrive/master/OCRlty/runpod'
        else:
            VROOT = os.path.expanduser("~/runpod")

pathlib.Path(VROOT).mkdir(parents=True, exist_ok=True)
print(f"[ENV] COLAB={COLAB} RUNPOD={RUNPOD}")
print(f"[PATH] RUNPOD_VOLUME_ROOT={VROOT}")


Mounted at /content/drive
[ENV] COLAB=True RUNPOD=False
[PATH] RUNPOD_VOLUME_ROOT=/content/drive/MyDrive/master/OCRlty/runpod


In [3]:
# ============================================
# CELL 2 — PATHS, CACHES, PIP CACHE
# ============================================
HF_HOME     = os.path.join(VROOT, ".cache", "huggingface")
PPOCR_HOME  = os.path.join(VROOT, ".cache", "paddleocr")
PIP_CACHE   = os.path.join(VROOT, ".cache", "pip")
WHEELHOUSE  = os.path.join(VROOT, "wheelhouse")
PROJECT_DIR = os.path.join(VROOT, PROJECT_DIR_NAME)
MODELS_DIR  = os.path.join(VROOT, "models")
DATA_DIR    = os.path.join(VROOT, "data")

for p in [HF_HOME, PPOCR_HOME, PIP_CACHE, WHEELHOUSE, PROJECT_DIR, MODELS_DIR, DATA_DIR]:
    pathlib.Path(p).mkdir(parents=True, exist_ok=True)

os.environ["RUNPOD_VOLUME_ROOT"] = VROOT        # унификация путей внутри кода проекта
os.environ["HF_HOME"]            = HF_HOME
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["PPOCR_HOME"]         = PPOCR_HOME
os.environ["PIP_CACHE_DIR"]      = PIP_CACHE

print(json.dumps({
    "HF_HOME": HF_HOME,
    "PPOCR_HOME": PPOCR_HOME,
    "PIP_CACHE_DIR": PIP_CACHE,
    "WHEELHOUSE": WHEELHOUSE,
    "PROJECT_DIR": PROJECT_DIR,
    "MODELS_DIR": MODELS_DIR,
    "DATA_DIR": DATA_DIR
}, indent=2))


{
  "HF_HOME": "/content/drive/MyDrive/master/OCRlty/runpod/.cache/huggingface",
  "PPOCR_HOME": "/content/drive/MyDrive/master/OCRlty/runpod/.cache/paddleocr",
  "PIP_CACHE_DIR": "/content/drive/MyDrive/master/OCRlty/runpod/.cache/pip",
  "WHEELHOUSE": "/content/drive/MyDrive/master/OCRlty/runpod/wheelhouse",
  "PROJECT_DIR": "/content/drive/MyDrive/master/OCRlty/runpod/projects",
  "MODELS_DIR": "/content/drive/MyDrive/master/OCRlty/runpod/models",
  "DATA_DIR": "/content/drive/MyDrive/master/OCRlty/runpod/data"
}


In [4]:
subprocess.check_output(["git","config","--global","user.name"], text=True)

CalledProcessError: Command '['git', 'config', '--global', 'user.name']' returned non-zero exit status 1.

In [5]:
# ============================================
# CELL 3 — GIT: CLONE / PULL (внутрь тома!)
# ============================================
def run(cmd, check=True, env=None):
    print("[RUN]", " ".join(cmd))
    return subprocess.run(cmd, check=check, text=True, capture_output=False, env=env)

def ensure_git_identity():
    # если не настроено — выставим «технические» значения
    try:
        subprocess.check_output(["git","config","--global","user.name"], text=True)
    except subprocess.CalledProcessError:
        run(["git","config","--global","user.name","Your Name"])
        run(["git","config","--global","user.email","you@example.com"])

def clone_or_update_repo():
    ensure_git_identity()
    repo_name = REPO_URL.rstrip("/").split("/")[-1].replace(".git","")
    dst = os.path.join(PROJECT_DIR, repo_name)
    if not os.path.isdir(dst):
        run(["git","clone","--branch",REPO_BRANCH, REPO_URL, dst])
    else:
        run(["git","-C",dst,"fetch","origin"])
        run(["git","-C",dst,"checkout",REPO_BRANCH])
        run(["git","-C",dst,"pull","origin",REPO_BRANCH])
    return dst

REPO_DIR = clone_or_update_repo()
print(f"[OK] Repo at: {REPO_DIR}")



[RUN] git config --global user.name Your Name
[RUN] git config --global user.email you@example.com
[RUN] git clone --branch main https://github.com/STrachov/OCRlty.git /content/drive/MyDrive/master/OCRlty/runpod/projects/OCRlty


CalledProcessError: Command '['git', 'clone', '--branch', 'main', 'https://github.com/STrachov/OCRlty.git', '/content/drive/MyDrive/master/OCRlty/runpod/projects/OCRlty']' returned non-zero exit status 128.

In [6]:
sys.executable

'/usr/bin/python3'

In [7]:
# ============================================
# CELL 4.0 — PYTHON ENV (venv на Pod/локально)
# ============================================
PYTHON_BIN = sys.executable
VENV_DIR   = os.path.join(VROOT, "venv")
USE_VENV   = RUNPOD or (not COLAB)

if USE_VENV:
    if not os.path.isdir(VENV_DIR):
        run([sys.executable,"-m","venv",VENV_DIR])
    PYTHON_BIN = os.path.join(VENV_DIR,"bin","python")
    run([PYTHON_BIN,"-m","pip","install","--upgrade","pip","setuptools","wheel"])
    print("Environment was created successfully")

print(f"[ENV] Python: {PYTHON_BIN}")


[RUN] /usr/bin/python3 -m venv /content/drive/MyDrive/master/OCRlty/runpod/venv


CalledProcessError: Command '['/usr/bin/python3', '-m', 'venv', '/content/drive/MyDrive/master/OCRlty/runpod/venv']' returned non-zero exit status 1.

In [None]:

# ============================================
# CELL 4.1 — (NEW) BUILD WHEELHOUSE IN COLAB
#  - Собираем .whl заранее, чтобы на Pod ставить мгновенно
# ============================================
REQ_TXT = os.path.join(REPO_DIR, "requirements.txt")
REQ_DEV = os.path.join(REPO_DIR, "requirements-dev.txt")

def is_dir_nonempty(p):
    return os.path.isdir(p) and any(pathlib.Path(p).iterdir())

def build_wheelhouse(requirements_txt):
    if not os.path.isfile(requirements_txt):
        return
    # в Colab чаще используем системный pip (не из venv)
    py = sys.executable if COLAB and not USE_VENV else PYTHON_BIN
    # подсказка: добавь в requirements версии под целевую CUDA/torch/paddle
    run([py,"-m","pip","download","-r",requirements_txt,"-d",WHEELHOUSE])

if COLAB:
    needs_build = FORCE_REBUILD_WHEELHOUSE or (not is_dir_nonempty(WHEELHOUSE))
    if needs_build:
        print("[WHEELHOUSE] Building wheelhouse in Colab...")
        build_wheelhouse(REQ_TXT)
        build_wheelhouse(REQ_DEV)
        print("[WHEELHOUSE] Done.")
    else:
        print("[WHEELHOUSE] Exists — skip build.")
else:
    print("[WHEELHOUSE] Not Colab → skip build here (ожидаем wheelhouse уже на томе).")


In [None]:

# ============================================
# CELL 4.2 — INSTALL REQUIREMENTS (из wheelhouse если есть)
# ============================================
def pip_install(requirements_txt):
    if not os.path.isfile(requirements_txt):
        return
    if is_dir_nonempty(WHEELHOUSE):
        run([PYTHON_BIN,"-m","pip","install",
             "--no-index",f"--find-links={WHEELHOUSE}",
             "-r",requirements_txt])
    else:
        run([PYTHON_BIN,"-m","pip","install","-r",requirements_txt])

pip_install(REQ_TXT)
pip_install(REQ_DEV)

# Базовые пакеты, если их нет в requirements (подстраховка)
run([PYTHON_BIN,"-m","pip","install",
     "huggingface_hub>=0.23",
     "paddleocr>=2.7.0.3",
     "opencv-python>=4.9.0.80"])



In [None]:
# ============================================
# CELL 5 — DOWNLOAD ARCTIC-TILT WEIGHTS (HF)
# ============================================
from huggingface_hub import snapshot_download

ARCTIC_DIR = os.path.join(MODELS_DIR, "snowflake-arctic-tilt-v1.3")
if not os.path.isdir(ARCTIC_DIR) or not any(pathlib.Path(ARCTIC_DIR).iterdir()):
    print(f"[DL] Arctic-TILT → {ARCTIC_DIR}")
    hf_token = os.getenv("HF_TOKEN")  # опционально, если нужно
    snapshot_download(
        repo_id=ARCTIC_TILT_ID,
        local_dir=ARCTIC_DIR,
        local_dir_use_symlinks=False,
        token=hf_token,
        allow_patterns=["*"]
    )
else:
    print("[DL] Arctic-TILT already present, skip.")



In [None]:
# ============================================
# CELL 6 — WARMUP: PADDLE OCR (GPU если доступен)
# ============================================
import numpy as np

def warmup_paddle_ocr(lang=PADDLE_OCR_LANG):
    from paddleocr import PaddleOCR
    print(f"[WARMUP] PaddleOCR(lang={lang}) init …")
    ocr = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=True, show_log=False)
    dummy = (np.zeros((512,512,3), dtype=np.uint8))
    _ = ocr.ocr(dummy, cls=True)
    print("[OK] OCR warmup done.")
    return ocr

try:
    ocr_instance = warmup_paddle_ocr()
except Exception as e:
    print("[WARN] OCR warmup failed (continue):", e)
    ocr_instance = None



In [None]:
# ============================================
# CELL 7 — WARMUP: ARCTIC-TILT (best-effort)
#  (подключи свою обёртку/вызов инференса тут)
# ============================================
def try_warmup_arctic_tilt(arctic_dir=ARCTIC_DIR):
    print("[WARMUP] Arctic-TILT best-effort …")
    try:
        files = list(pathlib.Path(arctic_dir).glob("**/*"))
        total = sum(f.stat().st_size for f in files if f.is_file())
        print(f"[OK] Arctic-TILT files: {len(files)} | size ≈ {total/1e9:.2f} GB")
        print("[NOTE] Подключи реальную инициализацию модели в своём app/tilt/model.py")
    except Exception as e:
        print("[WARN] Arctic warmup skipped:", e)

try_warmup_arctic_tilt()



In [None]:
# ============================================
# CELL 8 — DOCTOR: VERSIONS, CUDA, GPU
# ============================================
def doctor():
    import platform
    info = {"python": platform.python_version()}
    try:
        import torch
        info.update({
            "torch": torch.__version__,
            "torch.cuda.is_available": torch.cuda.is_available(),
            "torch.cuda.device_count": torch.cuda.device_count()
        })
    except Exception:
        info["torch"] = None
    try:
        import paddle
        info.update({
            "paddle": paddle.__version__,
            "paddle.is_compiled_with_cuda": paddle.is_compiled_with_cuda()
        })
    except Exception:
        info["paddle"] = None

    print(json.dumps(info, indent=2))
    try:
        out = subprocess.check_output(["nvidia-smi","-L"], text=True)
        print("[GPU]", out.strip())
    except Exception:
        print("[GPU] nvidia-smi not available.")

doctor()



In [None]:
# ============================================
# CELL 9 — SMOKE TEST (tiny)
# ============================================
# Положи 1–2 тестовых файла в $DATA_DIR (PNG/JPG/PDF)
samples = []
for ext in ("*.png","*.jpg","*.jpeg","*.pdf"):
    samples += list(pathlib.Path(DATA_DIR).glob(ext))

if samples:
    print("[SMOKE] Samples:", [str(p) for p in samples[:3]])
    # OCR — только для картинок (PDF пропусти или разбери отдельно)
    img_candidates = [p for p in samples if p.suffix.lower() in (".png",".jpg",".jpeg")]
    if img_candidates and ocr_instance is not None:
        import cv2
        test_img = str(img_candidates[0])
        img = cv2.imread(test_img)
        res = ocr_instance.ocr(img, cls=True)
        print("[SMOKE] OCR result (first line):", res[0][:1] if res else None)
    else:
        print("[SMOKE] (no images or OCR not initialized)")
else:
    print(f"[SMOKE] Put test files into: {DATA_DIR} (e.g., invoice.jpg)")



In [None]:
# ============================================
# CELL 10 — OPTIONAL: AUTO-STOP POD (экономия)
# ============================================
def which(bin_name: str) -> bool:
    try:
        subprocess.check_output(["bash","-lc", f"command -v {bin_name} >/dev/null && echo OK"], text=True)
        return True
    except Exception:
        return False

def schedule_pod_autostop(minutes=AUTO_STOP_MIN):
    if not RUNPOD or minutes <= 0:
        print("[AUTO-STOP] Skip (not Pod or disabled).")
        return
    if which("runpodctl"):
        cmd = f"bash -lc 'sleep {int(minutes*60)}; runpodctl stop pod $RUNPOD_POD_ID || true' &"
    else:
        # fallback на REST (если захочешь — добавь API_KEY и вызов curl)
        cmd = f"bash -lc 'sleep {int(minutes*60)}; echo AUTOSTOP elapsed (install runpodctl for real stop)' &"
    print(f"[AUTO-STOP] Scheduled in {minutes} min.")
    subprocess.Popen(cmd, shell=True)

schedule_pod_autostop()
print("[READY] Env prepared. Use your scripts in the repo for training/inference.")
