# Разбор решения хакатона
## Дообучение SAM2

In [None]:
!git clone https://github.com/facebookresearch/segment-anything-2
%cd /content/segment-anything-2
!pip install -q -e .

In [None]:
!wget -O sam2_hiera_tiny.pt "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt"
!wget -O sam2_hiera_small.pt "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt"
!wget -O sam2_hiera_base_plus.pt "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_base_plus.pt"
!wget -O sam2_hiera_large.pt "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt"

In [None]:
%cd /content/segment-anything-2

In [None]:
!wget https://storage.yandexcloud.net/ds-ods/files/files/edd91118/train1.zip
!wget https://storage.yandexcloud.net/ds-ods/files/files/e5a98368/train2.zip
!wget https://storage.yandexcloud.net/ds-ods/files/files/a95eacf7/train3.zip
!wget https://storage.yandexcloud.net/ds-ods/files/files/a9a3642d/train4.zip
!wget https://storage.yandexcloud.net/ds-ods/files/files/adacb253/val.zip
!wget https://storage.yandexcloud.net/ds-ods/files/files/b5ac09fa/annotations.zip

In [None]:
!unzip train1.zip
!unzip train2.zip
!unzip train3.zip
!unzip train4.zip
!unzip val.zip
!unzip annotations.zip

In [None]:
!mkdir train

In [None]:
import shutil
import os

# Merge the four unpacked training shards into one ``train`` directory.
train_path = "train"
# Make sure the destination is an existing directory: otherwise shutil.move
# would rename the first file to "train" instead of moving it inside.
os.makedirs(train_path, exist_ok=True)
for folder_name in ["train1", "train2", "train3", "train4"]:
    for file_name in os.listdir(folder_name):
        src = os.path.join(folder_name, file_name)
        shutil.move(src, train_path)

In [None]:

import os
import pandas as pd
import cv2
import torch
import torch.nn.utils
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.model_selection import train_test_split
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
import json

import random
import torch.nn.functional as F

# Load the validation annotations (a dict; its 'images' list is inspected below).
with open("val_annotations.json", "r") as f:
  val_data = json.load(f)

# Load the training annotations that read_batch samples from during fine-tuning.
with open("train_annotations.json", "r") as f:
  train_data = json.load(f)

In [None]:
from pycocotools import _mask

In [None]:
val_data['images'][0]

Фиксируем seed для воспроизводимости обучения.

In [None]:
def set_seeds(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and, when available,
    all CUDA devices) and switches cuDNN to deterministic behaviour.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # These are plain flags and harmless without a GPU. The benchmark
    # auto-tuner may pick different kernels between runs, so it has to be
    # disabled for the "deterministic" setting to be meaningful.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seeds()

## Считывание данных для обучения

In [None]:
import cv2
def read_batch(data, data_path):
    """Sample one random annotation and build a single-point training example.

    Args:
        data: COCO-style dict with an 'annotations' list (each entry holding
            'image_id' and a polygon 'segmentation') and an 'images' list
            (each entry holding 'id' and 'file_name').
        data_path: Directory that contains the image files.

    Returns:
        Tuple ``(image, mask, points, num_masks)``:
            image: RGB image array of shape (H, W, 3).
            mask: uint8 array of shape (1, H, W), 1 inside the polygons.
            points: one random (x, y) prompt inside the eroded mask, shape
                (1, 1, 2); an empty array when the eroded mask has no pixels.
            num_masks: always 1.
        When the image file cannot be read, ``(None, None, None, 0)`` is
        returned so that the caller can skip the sample.

    Raises:
        IndexError: If no entry in ``data['images']`` matches the annotation.
        ValueError: If a polygon has an odd number of coordinates.
    """
    # Select a random entry
    annotations = data['annotations']
    ent = annotations[np.random.randint(len(annotations))]

    image_info = next(
        (img for img in data['images'] if img['id'] == ent['image_id']),
        None,
    )
    if image_info is None:
        raise IndexError(f"No image entry with id {ent['image_id']!r}")
    image_path = os.path.join(data_path, image_info['file_name'])

    bgr = cv2.imread(image_path)
    if bgr is None:
        # cv2.imread signals a missing/corrupt file by returning None.
        return None, None, None, 0
    Img = bgr[..., ::-1]  # Convert BGR to RGB

    # Size the mask from the image itself instead of a hard-coded resolution,
    # so it always matches the prediction that is compared against it.
    binary_mask = np.zeros(Img.shape[:2], dtype=np.uint8)
    for pts in ent['segmentation']:
        # The flat [x0, y0, x1, y1, ...] list becomes an (N, 2) int32 vertex
        # array; the cast truncates fractional coordinates.
        polygon = np.array(pts, dtype=np.float64).reshape(-1, 2).astype(np.int32)
        cv2.fillPoly(binary_mask, [polygon], color=1)

    # Erode the combined binary mask to avoid boundary points
    eroded_mask = cv2.erode(binary_mask, np.ones((5, 5), np.uint8), iterations=1)

    points = []
    # Get all coordinates inside the eroded mask and choose one at random
    coords = np.argwhere(eroded_mask > 0)
    if len(coords) > 0:
        row, col = coords[np.random.randint(len(coords))]
        points.append([col, row])  # [x, y] format (col, row)

    points = np.expand_dims(np.array(points), axis=1)
    binary_mask = binary_mask[np.newaxis, ...]  # shape (1, H, W)

    # Return the image, binarized mask, points, and number of masks
    return Img, binary_mask, points, 1

## Подготовка кода для обучения

In [None]:
# Checkpoint file and model config; both selections should refer to the same
# model size (here the "small" variant). The trailing "# @param" comments are
# Colab form annotations and must stay on these lines.
sam2_checkpoint = "sam2_hiera_small.pt"  # @param ["sam2_hiera_tiny.pt", "sam2_hiera_small.pt", "sam2_hiera_base_plus.pt", "sam2_hiera_large.pt"]
model_cfg = "sam2_hiera_s.yaml" # @param ["sam2_hiera_t.yaml", "sam2_hiera_s.yaml", "sam2_hiera_b+.yaml", "sam2_hiera_l.yaml"]

# Build the model on the GPU and wrap it in the image predictor that the
# training loop drives.
sam2_model = build_sam2(model_cfg, sam2_checkpoint, device="cuda")
predictor = SAM2ImagePredictor(sam2_model)

In [None]:
# Put the mask decoder into training mode.
predictor.model.sam_mask_decoder.train(True)

# Put the prompt encoder into training mode.
predictor.model.sam_prompt_encoder.train(True)

# Configure optimizer. Note that it receives every parameter of
# predictor.model, not only those of the two sub-modules switched to
# training mode above. The trailing comment presumably records previously
# tried hyper-parameters.
optimizer=torch.optim.AdamW(params=predictor.model.parameters(),lr=0.0001,weight_decay=1e-4) #1e-5, weight_decay = 4e-5

# Gradient scaler for mixed-precision training (paired with autocast in the
# training loop).
scaler = torch.cuda.amp.GradScaler()

# No. of steps to train the model.
NO_OF_STEPS = 3000 # @param

# Prefix of the checkpoint files written by the training loop.
FINE_TUNED_MODEL_NAME = "fine_tuned_sam2"

In [None]:
!pwd

In [None]:
# Initialize scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=500, gamma=0.2)  # 500 , 250, gamma = 0.1
accumulation_steps = 4  # Number of micro-batches accumulated per optimizer update

# Running IoU estimate. Defined before the loop so that a skipped first
# sample cannot leave it undefined.
mean_iou = 0
# Micro-batches back-propagated since the last optimizer update.
pending_micro_batches = 0


def _forward_random_sample():
    """Run one forward pass on a random training sample.

    Uses the notebook globals ``train_data``, ``predictor`` and ``read_batch``.

    Returns:
        ``(loss, iou)`` tensors, or ``None`` when the sample is unusable
        (unreadable image, no prompt point, or empty prepared prompts).
    """
    image, mask, input_point, num_masks = read_batch(train_data, 'train')
    if image is None or mask is None or num_masks == 0:
        return None
    if not isinstance(input_point, np.ndarray) or input_point.size == 0:
        return None
    input_label = np.ones((num_masks, 1))

    # read_batch returns a channel-reversed view, so hand over a real copy.
    predictor.set_image(image.copy())
    _, unnorm_coords, labels, _ = predictor._prep_prompts(
        input_point, input_label, box=None, mask_logits=None, normalize_coords=True
    )
    if unnorm_coords is None or labels is None or unnorm_coords.shape[0] == 0 or labels.shape[0] == 0:
        return None

    sparse_embeddings, dense_embeddings = predictor.model.sam_prompt_encoder(
        points=(unnorm_coords, labels), boxes=None, masks=None,
    )

    batched_mode = unnorm_coords.shape[0] > 1
    high_res_features = [feat_level[-1].unsqueeze(0) for feat_level in predictor._features["high_res_feats"]]
    low_res_masks, prd_scores, _, _ = predictor.model.sam_mask_decoder(
        image_embeddings=predictor._features["image_embed"][-1].unsqueeze(0),
        image_pe=predictor.model.sam_prompt_encoder.get_dense_pe(),
        sparse_prompt_embeddings=sparse_embeddings,
        dense_prompt_embeddings=dense_embeddings,
        multimask_output=True,
        repeat_image=batched_mode,
        high_res_features=high_res_features,
    )
    prd_masks = predictor._transforms.postprocess_masks(low_res_masks, predictor._orig_hw[-1])

    # Segmentation loss: pixel-wise binary cross-entropy on the first mask.
    gt_mask = torch.tensor(mask.astype(np.float32)).cuda()
    prd_mask = torch.sigmoid(prd_masks[:, 0])
    seg_loss = (-gt_mask * torch.log(prd_mask + 0.000001) - (1 - gt_mask) * torch.log((1 - prd_mask) + 0.00001)).mean()

    # Score loss: the predicted score should match the IoU of the
    # thresholded prediction.
    inter = (gt_mask * (prd_mask > 0.5)).sum(1).sum(1)
    iou = inter / (gt_mask.sum(1).sum(1) + (prd_mask > 0.5).sum(1).sum(1) - inter)
    score_loss = torch.abs(prd_scores[:, 0] - iou).mean()

    return seg_loss + score_loss * 0.05, iou


for step in range(1, NO_OF_STEPS + 1):
    # Only the forward pass and the loss run under autocast; backward and
    # the optimizer update happen outside of it.
    with torch.cuda.amp.autocast():
        sample = _forward_random_sample()

    if sample is not None:
        loss, iou = sample
        # Apply gradient accumulation
        scaler.scale(loss / accumulation_steps).backward()
        pending_micro_batches += 1
        mean_iou = mean_iou * 0.99 + 0.01 * np.mean(iou.cpu().detach().numpy())

    # An unusable sample no longer skips the bookkeeping below, so the
    # optimizer update, LR schedule and checkpoints stay on their step grid.
    if step % accumulation_steps == 0 and pending_micro_batches > 0:
        # Gradients must be unscaled before clipping, otherwise max_norm is
        # compared against loss-scaled values.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(predictor.model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        predictor.model.zero_grad()
        pending_micro_batches = 0

    # Update scheduler (kept per loop step, so step_size counts micro-steps
    # exactly as before).
    scheduler.step()

    if step % 500 == 0:
        FINE_TUNED_MODEL = FINE_TUNED_MODEL_NAME + "_" + str(step) + ".torch"
        torch.save(predictor.model.state_dict(), FINE_TUNED_MODEL)

    if step % 100 == 0:
        print("Step " + str(step) + ":\t", "Accuracy (IoU) = ", mean_iou)

## Установка необходимых библиотек для инференса MegaDetector

In [None]:
!sudo apt-get update -y
!sudo apt-get install python3.8 python3.8-dev python3.8-distutils libpython3.8-dev

#change alternatives
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2

#Check that it points at the right location
!python3 --version

In [None]:
# install pip
!curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
!python3 get-pip.py --force-reinstall

#install colab's dependencies
!python3 -m pip install ipython ipython_genutils ipykernel jupyter_console prompt_toolkit httplib2 astor

In [None]:
# link to the old google package
!ln -s /usr/local/lib/python3.10/dist-packages/google \
       /usr/local/lib/python3.8/dist-packages/google

!sed -i "s/from IPython.utils import traitlets as _traitlets/import traitlets as _traitlets/" /usr/local/lib/python3.8/dist-packages/google/colab/*.py
!sed -i "s/from IPython.utils import traitlets/import traitlets/" /usr/local/lib/python3.8/dist-packages/google/colab/*.py

In [None]:
#Install PytorchWildlife
!pip install pytorchwildlife

In [None]:
import os
import torch
from PytorchWildlife.models import detection as pw_detection
from PytorchWildlife import utils as pw_utils

In [None]:
# Pick the compute device: the first GPU when CUDA is available, else the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
    torch.cuda.set_device(0)
else:
    DEVICE = "cpu"

## Инициализация MegaDetector

In [None]:
# Instantiate MegaDetectorV6 (version "MDV6-yolov10-e") with pretrained
# weights on the selected device.
detection_model = pw_detection.MegaDetectorV6(device=DEVICE, pretrained=True, version="MDV6-yolov10-e")

In [None]:
! wget https://storage.yandexcloud.net/ds-ods/files/files/2ad601fd/test.zip
! unzip test.zip

In [None]:
# Folder with the test images (presumably created by unpacking test.zip).
tgt_folder_path = "test"
# Run the detector over that folder. The later cells read each result's
# 'img_id', 'detections' and 'normalized_coords' entries.
results_md = detection_model.batch_image_detection(tgt_folder_path, batch_size=16)

In [None]:
results_md[0]

In [None]:
from pycocotools import mask as coco_mask
def masks_to_rle(binary_mask):
    """Encode one binary mask as a JSON-friendly COCO RLE dict.

    Args:
        binary_mask: Mask array, expected to be 2-D (H, W); its values are
            cast to uint8 before encoding.

    Returns:
        dict: The RLE produced by ``pycocotools.mask.encode`` with keys
        'size' and 'counts', where 'counts' is decoded to ``str``.
    """
    binary_mask = binary_mask.astype(np.uint8)

    # Encode mask using COCO RLE format; the array is passed in Fortran
    # (column-major) order.
    rle = coco_mask.encode(np.asfortranarray(binary_mask))

    # Ensure COCO compliance: 'counts' comes back as bytes, which json cannot
    # serialise and which str() would render as "b'...'".
    counts = rle['counts']
    if isinstance(counts, bytes):
        rle['counts'] = counts.decode('ascii')

    return rle

In [None]:
results_md[0]['detections']

In [None]:
# Rebuild the base network, then overwrite its weights with the fine-tuned
# checkpoint saved by the training loop.
sam2_model = build_sam2(model_cfg, sam2_checkpoint, device="cuda")
predictor = SAM2ImagePredictor(sam2_model)

FINE_TUNED_MODEL_WEIGHTS = "/content/segment-anything-2/fine_tuned_sam2_3000.torch"
fine_tuned_state = torch.load(FINE_TUNED_MODEL_WEIGHTS)
predictor.model.load_state_dict(fine_tuned_state)

# Quick check on the first detection of the first image: prompt the
# predictor with the centre of the detected bounding box.
first_result = results_md[0]
image_path = first_result['img_id']
Img = cv2.imread(image_path)[..., ::-1]

with torch.no_grad():
    image = Img.copy()
    predictor.set_image(image)
    first_box = first_result['detections'].xyxy[0]
    cx = (first_box[0] + first_box[2]) / 2
    cy = (first_box[1] + first_box[3]) / 2
    input_points = np.array([[cx, cy]])
    masks, scores, logits = predictor.predict(
        point_coords=[input_points],
        point_labels=np.ones([input_points.shape[0], 1])
    )

In [None]:

import numpy as np

# Build the submission: one entry per detected box, segmented by prompting
# the predictor with the centre of that box.
predictions = []
for result in results_md:
    # Images without detections contribute nothing.
    if len(result['normalized_coords']) == 0:
        continue

    image_path = result['img_id']
    bgr = cv2.imread(image_path)
    if bgr is None:
        # cv2.imread returns None for a missing/corrupt file; skip it
        # instead of aborting the whole run.
        print(f"Skipping unreadable image: {image_path}")
        continue
    Img = bgr[..., ::-1]
    file_name = image_path.split('/')[-1]

    # Perform inference and predict masks
    with torch.no_grad():
        image = Img.copy()
        predictor.set_image(image)
        for detection in result['detections'].xyxy:
            cx = (detection[0] + detection[2]) / 2
            cy = (detection[1] + detection[3]) / 2
            input_points = np.array([[cx, cy]])
            masks, scores, logits = predictor.predict(
                point_coords=[input_points],
                point_labels=np.ones([input_points.shape[0], 1])
            )
            rle = masks_to_rle(masks[0])

            # RLE counts may arrive as bytes; str(bytes) would write
            # "b'...'" into the submission, so decode explicitly.
            counts = rle["counts"]
            if isinstance(counts, bytes):
                counts = counts.decode("ascii")

            predictions.append({
                "image_name": file_name,
                "category_id": "0",  # Category ID
                # Plain Python floats: NumPy arrays/scalars are not JSON
                # serialisable.
                "bbox": [float(v) for v in detection],  # Box as x1, y1, x2, y2
                "score": float(scores[0]),  # Confidence score
                "segmentation": {
                    "size": [int(v) for v in rle["size"]],
                    "counts": str(counts)
                }  # Segmentation in RLE format
            })

In [None]:
import os
import json

def _to_builtin(obj):
    """Fallback for ``json.dump``: turn NumPy arrays/scalars into Python values."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


output_folder = "./"
# Save the predictions as JSON. The fallback keeps the dump from failing when
# an entry still holds a NumPy array (bbox) or NumPy scalar (score).
predictions_file = os.path.join(output_folder, "submission.json")
with open(predictions_file, "w") as f:
    json.dump(predictions, f, indent=4, default=_to_builtin)

print(f"Обработка завершена. Результаты сохранены в папке: {output_folder}")