In [1]:
import os, time

# Local Kaggle working directory (models and outputs are stored under it).
WORK_ROOT = "/kaggle/working/SEM_SAM2"

# Named locations used by the later cells.
DIRS = {"root": WORK_ROOT}
DIRS.update(
    models=f"{WORK_ROOT}/models",   # weights / config are saved here when downloaded
    output=f"{WORK_ROOT}/output",   # result masks and manifests are saved here
)

# Create every location up front; re-running the cell is harmless.
for _location in DIRS.values():
    os.makedirs(_location, exist_ok=True)

# Where do the images come from?
# Option 1 (simple): search all of /kaggle/input, i.e. every dataset attached to the notebook.
INPUT_DIRS = ["/kaggle/input"]

# Option 2 (recommended when many datasets are attached): list the datasets explicitly.
# INPUT_DIRS = ["/kaggle/input/your-sem-dataset", "/kaggle/input/another-ds"]

# Root for relative paths in the manifest (purely cosmetic, for path display).
MAP_ROOT = "/kaggle"

print("Images will be read from:", INPUT_DIRS)
print("Outputs will be written under:", DIRS["output"])


Images will be read from: ['/kaggle/input']
Outputs will be written under: /kaggle/working/SEM_SAM2/output


In [2]:
!pip -q install opencv-python matplotlib supervision
!pip -q install git+https://github.com/facebookresearch/segment-anything-2.git

import torch, cv2, numpy as np, glob, os
print("CUDA available:", torch.cuda.is_available())


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.2/207.2 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m:00:01[

In [3]:
# Fetch httpbin's /ip endpoint and print the first 80 bytes of the reply
# (presumably a quick check that outbound network access works — confirm intent).
# NOTE(review): the saved output of this cell contains the VM's public IP address.
!wget -qO- https://httpbin.org/ip | head -c 80


{
  "origin": "34.151.136.75"
}


In [4]:
import os, urllib.request

# File names of the SAM-2 config and checkpoint used by this notebook.
CFG_NAME = "sam2_hiera_l.yaml"
CKPT_NAME = "sam2_hiera_large.pt"

# Make sure the models directory exists before anything is written into it.
os.makedirs(DIRS["models"], exist_ok=True)

# Local destinations inside the models directory.
CFG_PATH, CKPT_PATH = (os.path.join(DIRS["models"], _name) for _name in (CFG_NAME, CKPT_NAME))

# Remote sources; both files are served from the same Hugging Face space.
# NOTE(review): this is a third-party space, not an official release location — confirm
# the files are trusted before loading the checkpoint.
_HF_SPACE = "https://huggingface.co/spaces/SkalskiP/segment-anything-model-2/resolve/main"
CFG_URL  = f"{_HF_SPACE}/configs/{CFG_NAME}"
CKPT_URL = f"{_HF_SPACE}/checkpoints/{CKPT_NAME}"

def download(url, dst):
    """Download ``url`` to the local file ``dst`` and print its size.

    The payload is first written to ``<dst>.part`` and only renamed onto ``dst``
    once the transfer has finished. An interrupted or failed download therefore
    never leaves a truncated file at ``dst`` (callers in this notebook treat
    "``dst`` exists" as "download complete").

    Args:
        url: Source URL (anything ``urllib.request.urlretrieve`` accepts).
        dst: Destination file path.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on network / HTTP errors;
        ``OSError`` if the file cannot be written or renamed.
    """
    print(f"Downloading -> {dst}")
    partial = f"{dst}.part"
    try:
        urllib.request.urlretrieve(url, partial)
        # Atomic on the same filesystem: dst is either absent/old or complete.
        os.replace(partial, dst)
    finally:
        # Only present here if the download or the rename did not complete.
        if os.path.exists(partial):
            os.remove(partial)
    sz = os.path.getsize(dst) / (1024*1024)
    print(f"Saved {dst} ({sz:.1f} MB)")

# Fetch each model file only when it is not already on disk; otherwise report it.
for _exists_msg, _url, _path in (
    ("Config exists:", CFG_URL, CFG_PATH),
    ("Weights exist:", CKPT_URL, CKPT_PATH),   # checkpoint is ~900MB
):
    if os.path.exists(_path):
        print(_exists_msg, _path)
    else:
        download(_url, _path)

print("Ready.")


Downloading -> /kaggle/working/SEM_SAM2/models/sam2_hiera_l.yaml
Saved /kaggle/working/SEM_SAM2/models/sam2_hiera_l.yaml (0.0 MB)
Downloading -> /kaggle/working/SEM_SAM2/models/sam2_hiera_large.pt
Saved /kaggle/working/SEM_SAM2/models/sam2_hiera_large.pt (856.4 MB)
Ready.


In [5]:
# Keyword arguments forwarded to SAM2AutomaticMaskGenerator when it is built.
SAM2_PARAMS = {
    "points_per_side": 32,
    "points_per_batch": 64,
    "pred_iou_thresh": 0.7,
    "stability_score_thresh": 0.92,
    "stability_score_offset": 0.7,
    "crop_n_layers": 1,
    "box_nms_thresh": 0.7,
}

# Overlay blend weight (same value as blend_overlay's default alpha).
ALPHA = 0.45
# Optional cap on the number of images processed, e.g. 5 for a quick check; None = no cap.
MAX_IMAGES = None
print("SAM2 params:", SAM2_PARAMS)


SAM2 params: {'points_per_side': 32, 'points_per_batch': 64, 'pred_iou_thresh': 0.7, 'stability_score_thresh': 0.92, 'stability_score_offset': 0.7, 'crop_n_layers': 1, 'box_nms_thresh': 0.7}


In [6]:
import numpy as np, cv2

def _to_uint8(arr):
    """Reduce an image array of any numeric dtype to ``np.uint8``.

    ``uint8`` is returned unchanged; ``uint16`` keeps its high byte; any other
    dtype is min-max stretched onto 0..255 (a constant image becomes all zeros).
    """
    if arr.dtype == np.uint8:
        return arr
    if arr.dtype == np.uint16:
        return (arr >> 8).astype(np.uint8)
    if arr.size == 0:
        return arr.astype(np.uint8)
    values = arr.astype(np.float64)
    lo, hi = float(values.min()), float(values.max())
    if not hi > lo:
        return np.zeros(arr.shape, dtype=np.uint8)
    return np.rint((values - lo) * (255.0 / (hi - lo))).astype(np.uint8)


def to_rgb(img):
    """Convert an image array in OpenCV channel order to contiguous 8-bit RGB.

    The batch cell loads images with ``cv2.IMREAD_UNCHANGED``, so the array may
    be grayscale, BGR or BGRA and may not be 8-bit. The original version passed
    non-``uint8`` data through unchanged; this version always returns ``uint8``.

    Args:
        img: ``(H, W)`` grayscale, ``(H, W, 1)``, ``(H, W, 3)`` BGR or
            ``(H, W, 4)`` BGRA array.

    Returns:
        A C-contiguous ``(H, W, 3)`` ``np.uint8`` array in RGB order. Grayscale is
        replicated to three channels; an alpha channel is dropped.

    Raises:
        ValueError: if ``img`` has an unsupported number of dimensions/channels.
    """
    if img.ndim == 2:
        planes = img[:, :, None]
    elif img.ndim == 3 and img.shape[2] in (1, 3, 4):
        # Alpha (4th channel) is discarded before any intensity scaling.
        planes = img[:, :, :3]
    else:
        raise ValueError(f"unsupported image shape: {img.shape}")

    planes = _to_uint8(planes)
    if planes.shape[2] == 1:
        return np.repeat(planes, 3, axis=2)
    # Reverse BGR -> RGB and copy, so the result has no negative strides.
    return np.ascontiguousarray(planes[:, :, ::-1])

def colorize_instances_sam2(masks, h, w):
    """Paint every mask in ``masks`` with its own hue on a black RGB canvas.

    Args:
        masks: Sequence of mask records; each is indexed with ``["segmentation"]``
            and that value is used to index the ``(h, w)`` canvas
            (assumes a boolean ``(h, w)`` array — TODO confirm against the generator).
        h: Canvas height in pixels.
        w: Canvas width in pixels.

    Returns:
        ``(h, w, 3)`` ``np.uint8`` RGB image. Pixels covered by no mask stay black;
        where masks overlap, the later one in ``masks`` wins. Hues are spread over
        OpenCV's 0..179 range by mask position, so they can repeat when there are
        more than 180 masks.
    """
    canvas = np.zeros((h, w, 3), dtype=np.uint8)
    hue_steps = max(len(masks), 1)   # avoids division by zero for an empty list
    for position, record in enumerate(masks):
        region = record["segmentation"]
        # One HSV pixel (fixed saturation 200, value 255) converted via OpenCV.
        hsv_pixel = np.uint8([[[int(180 * position / hue_steps), 200, 255]]])
        bgr_pixel = cv2.cvtColor(hsv_pixel, cv2.COLOR_HSV2BGR)[0, 0, :]
        canvas[region] = bgr_pixel[::-1]   # BGR -> RGB
    return canvas

def blend_overlay(rgb, overlay_rgb, alpha=0.45):
    """Alpha-blend ``overlay_rgb`` onto ``rgb``.

    Args:
        rgb: Base image array.
        overlay_rgb: Overlay array, broadcast-compatible with ``rgb``.
        alpha: Weight of the overlay; the base image gets ``1 - alpha``.

    Returns:
        The weighted sum cast to ``np.uint8`` (fractional parts are truncated).
        The blend is applied to every pixel, including those where the overlay
        is black.
    """
    base_part = rgb * (1 - alpha)
    tint_part = overlay_rgb * alpha
    mixed = base_part + tint_part
    return mixed.astype(np.uint8)


In [7]:
from sam2.build_sam import build_sam2
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
import sam2, shutil

# Use the GPU when torch can see one, otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# build_sam2 is given the config *name*: Hydra expects a name, not a filesystem path.
CFG_NAME_FOR_HYDRA = "sam2_hiera_l.yaml"

# If the YAML is not shipped inside the installed sam2 package, copy the downloaded
# one next to the package's __init__ once (best effort — a failure is only reported).
SAM2_PKG_DIR = os.path.dirname(sam2.__file__)
PKG_CFG = os.path.join(SAM2_PKG_DIR, CFG_NAME_FOR_HYDRA)
if os.path.exists(CFG_PATH) and not os.path.exists(PKG_CFG):
    try:
        shutil.copy(CFG_PATH, PKG_CFG)
    except Exception as copy_err:
        print("Note:", copy_err)
    else:
        print("Copied YAML into package:", PKG_CFG)

# Build the model from the checkpoint, then wrap it in the automatic mask generator
# configured with SAM2_PARAMS.
sam2_model = build_sam2(CFG_NAME_FOR_HYDRA, CKPT_PATH, device=device)
mask_generator = SAM2AutomaticMaskGenerator(model=sam2_model, **SAM2_PARAMS)

print("SAM2 loaded on", device)


SAM2 loaded on cuda


In [8]:
# === SAM2 batch run — saves only the colour-coded segmentation mask (no overlay) + a ZIP per batch ===
import os, glob, csv, time, cv2, random, shutil
from datetime import datetime

# Names this cell expects from earlier cells:
#   INPUT_DIRS      - list[str] of directories searched for images
#   DIRS            - dict with the keys "root" and "output"
#   mask_generator  - SAM-2 object whose .generate(RGB ndarray) returns a list of mask dicts
#   to_rgb          - converts a loaded image array to RGB
#   colorize_instances_sam2(masks, h, w) - paints the instances (no blending with the source)

# --- Batch parameters ---------------------------------------------------------
BATCH_SIZE   = 100
SHUFFLE      = False
RESUME       = True     # skip images whose output file already exists

# First / last batch to run (1-based); END_BATCH = None means "through the last batch".
START_BATCH  = 40
END_BATCH    = None

# Create a ZIP of each batch when it finishes (so there is always something to download).
AUTO_ZIP_BATCH = True

# Flush the manifests to disk every this many files.
FLUSH_EVERY = 10

# --- File discovery -----------------------------------------------------------
# Recursive search of every input directory for the supported extensions,
# in lower-case and upper-case spelling.
_IMAGE_EXTS = ("png", "jpg", "jpeg", "tif", "tiff", "bmp")
PATTERNS = [f"**/*.{ext}" for ext in _IMAGE_EXTS] + [f"**/*.{ext.upper()}" for ext in _IMAGE_EXTS]

# De-duplicated and sorted, so the batch an image falls into is stable across runs.
all_paths = sorted({
    hit
    for base in INPUT_DIRS
    for pat in PATTERNS
    for hit in glob.glob(os.path.join(base, pat), recursive=True)
})

# Optional cap; MAX_IMAGES may be undefined if its cell was not run.
try:
    image_cap = MAX_IMAGES
except NameError:
    image_cap = None
if image_cap is not None:
    all_paths = all_paths[:image_cap]

if SHUFFLE:
    random.seed(42)
    random.shuffle(all_paths)

# --- Batch range --------------------------------------------------------------
total = len(all_paths)
num_batches = -(-total // BATCH_SIZE)   # ceiling division
start_b = max(START_BATCH - 1, 0)       # 0-based index of the first batch to run
end_b   = min(END_BATCH, num_batches) if END_BATCH is not None else num_batches

print(f"Found {total} images under {INPUT_DIRS}")
print(f"Will run batches {start_b+1} .. {end_b} (of {num_batches})")

def open_manifest(path):
    """Open a CSV manifest for writing, appending when it already has content.

    Args:
        path: Manifest file path.

    Returns:
        ``(file, write_header)``: the open text file (``utf-8-sig``, ``newline=""``)
        and a bool that is True when the file was missing or empty, i.e. the caller
        still has to write the header row. The caller must close the file.
    """
    has_content = os.path.exists(path) and os.path.getsize(path) > 0
    mode = "a" if has_content else "w"
    handle = open(path, mode, newline="", encoding="utf-8-sig")
    return handle, not has_content

# Each run writes its own timestamped root manifest; per-batch manifests are appended to.
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
os.makedirs(DIRS["output"], exist_ok=True)
root_manifest = os.path.join(DIRS["output"], f"manifest_{ts}.csv")

# Base directory for the relative paths written to the manifests. Resolved once:
# the common-path fallback walks every input path and must not be recomputed per image.
if "root" in DIRS:
    manifest_rel_root = DIRS["root"]
else:
    manifest_rel_root = os.path.commonpath(all_paths) if all_paths else "."


def _segment_to_mask_file(src_path, dst_path):
    """Segment one image and write its colour-coded instance mask to ``dst_path``.

    Returns:
        ``(status, n_masks)`` where status is one of
        ``"ok"``, ``"read_error"`` (image could not be loaded),
        ``"write_error"`` (OpenCV reported that the mask was not written) or
        ``"error:<ExceptionName>"``; ``n_masks`` is ``""`` when no masks were produced.
    """
    img_bgr = cv2.imread(src_path, cv2.IMREAD_UNCHANGED)
    if img_bgr is None:
        return "read_error", ""
    try:
        img_rgb = to_rgb(img_bgr)
        h, w = img_rgb.shape[:2]
        masks = mask_generator.generate(img_rgb)

        # Paint the instances onto an RGB mask only (no blending with the source image).
        mask_rgb = colorize_instances_sam2(masks, h, w)

        # OpenCV writes BGR. imwrite signals failure by returning False, not by raising.
        if not cv2.imwrite(dst_path, cv2.cvtColor(mask_rgb, cv2.COLOR_RGB2BGR)):
            return "write_error", len(masks)
        return "ok", len(masks)
    except Exception as exc:
        # Best effort: one bad image must not abort the whole run.
        return f"error:{type(exc).__name__}", ""


with open(root_manifest, "w", newline="", encoding="utf-8-sig") as mf:
    writer = csv.writer(mf)
    writer.writerow(["batch_id","idx_in_batch","input_path_rel","output_path_rel","n_masks","status"])

    for b in range(start_b, end_b):
        batch_id   = f"batch_{b+1:03d}"
        batch_dir  = os.path.join(DIRS["output"], batch_id)
        os.makedirs(batch_dir, exist_ok=True)

        first = b * BATCH_SIZE
        last  = min(first + BATCH_SIZE, total)
        batch_paths = all_paths[first:last]

        # Per-batch manifest (appended to if it already exists). The with-block
        # guarantees the handle is closed even if the run is interrupted.
        batch_manifest = os.path.join(batch_dir, "manifest.csv")
        bf, need_header = open_manifest(batch_manifest)
        with bf:
            bwriter = csv.writer(bf)
            if need_header:
                bwriter.writerow(["idx_in_batch","input_path_rel","output_path_rel","n_masks","status"])

            print(f"\n=== {batch_id} ({first+1}-{last}/{total}) ===")

            # Output names already claimed in this batch. Inputs that share a stem
            # (same name in different folders, or a.png next to a.tif) would otherwise
            # map to one file and the later ones would be reported as "skipped_exists".
            claimed_outputs = set()

            for j, p in enumerate(batch_paths, 1):
                base     = os.path.splitext(os.path.basename(p))[0]
                out_path = os.path.join(batch_dir, f"{base}_mask.png")
                if out_path in claimed_outputs:
                    # The first occurrence keeps the plain name (compatible with earlier
                    # runs); repeats get the in-batch index, which is stable because
                    # all_paths is sorted.
                    out_path = os.path.join(batch_dir, f"{base}__{j:03d}_mask.png")
                claimed_outputs.add(out_path)

                if RESUME and os.path.exists(out_path):
                    status, n_masks = "skipped_exists", ""
                else:
                    status, n_masks = _segment_to_mask_file(p, out_path)

                rel_in_root  = os.path.relpath(p,        manifest_rel_root)
                rel_out_root = os.path.relpath(out_path, manifest_rel_root)

                # Record the image in both the root manifest and the batch manifest.
                writer.writerow([batch_id, j, rel_in_root, rel_out_root, n_masks, status])
                bwriter.writerow([j, rel_in_root, rel_out_root, n_masks, status])

                # Small keep-alive output, and persist the manifests.
                if (j % FLUSH_EVERY) == 0 or j == len(batch_paths):
                    print(f"  progress: {j}/{len(batch_paths)}")
                    mf.flush(); bf.flush()

        # Automatic ZIP of this batch (the *_mask.png files plus its manifest).
        if AUTO_ZIP_BATCH:
            zip_path = batch_dir + ".zip"
            try:
                if os.path.exists(zip_path):
                    os.remove(zip_path)  # rebuild so newly added files are included
                shutil.make_archive(batch_dir, "zip", batch_dir)
                print(f"Zipped: {zip_path}")
            except Exception as zip_exc:
                print("ZIP error:", zip_exc)

print("\n✔️ Done.")
print("Root manifest:", root_manifest)
print("Per-batch manifests & ZIPs are under:", DIRS["output"])


Found 5376 images under ['/kaggle/input']
Will run batches 40 .. 54 (of 54)

=== batch_040 (3901-4000/5376) ===
  progress: 10/100
  progress: 20/100
  progress: 30/100
  progress: 40/100
  progress: 50/100
  progress: 60/100
  progress: 70/100
  progress: 80/100
  progress: 90/100
  progress: 100/100
Zipped: /kaggle/working/SEM_SAM2/output/batch_040.zip

=== batch_041 (4001-4100/5376) ===


KeyboardInterrupt: 

In [None]:
# Optional (disabled): pack the whole output tree into one timestamped .tgz under
# /kaggle/working and list it. Uncomment both lines to use.
# !tar -C /kaggle -czf /kaggle/working/sem_sam2_output_$(date +%Y%m%d_%H%M%S).tgz working/SEM_SAM2/output
# !ls -lh /kaggle/working | grep tgz