In [None]:
from ultralytics import YOLO
import cv2
import torch
import os
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# === 1. YOLOv8 training ===
# Start from the pretrained YOLOv8-small checkpoint and fine-tune it on the
# dataset described by data.yaml.
# NOTE(review): the dataset folder and the run name say "yolov11" while the
# weights are YOLOv8s — confirm which architecture is actually intended.
model = YOLO("yolov8s.pt")

# Train on the first CUDA device when one is available, otherwise on the CPU.
# A hard-coded device=0 aborts on machines without a GPU, whereas the rest of
# this script (BLIP setup) already falls back to the CPU.
train_device = 0 if torch.cuda.is_available() else "cpu"

results = model.train(
    data="Video.v9i.yolov11/data.yaml",
    epochs=4,
    imgsz=640,
    batch=16,
    name="yolov11_video_aug",
    device=train_device,
    # Geometric augmentation.
    degrees=10,
    translate=0.1,
    scale=0.5,
    shear=10,
    perspective=0.0005,
    flipud=0.2,
    fliplr=0.5,
    # Colour (HSV) augmentation.
    hsv_h=0.015,
    hsv_s=0.7,
    hsv_v=0.4,
    # Sample-mixing augmentation.
    mosaic=1.0,
    mixup=0.2,
    copy_paste=0.1,
)

# Validate the trained model and summarise the box-detection metrics.
metrics = model.val()
box_metrics = metrics.box

mp, mr = box_metrics.mp, box_metrics.mr
map50, map5095 = box_metrics.map50, box_metrics.map
# Harmonic mean of mean precision and mean recall; the small epsilon keeps the
# division defined when both are zero.
f1 = 2 * mp * mr / (mp + mr + 1e-6)

for label, value in (
    ("Precision (mean)", mp),
    ("Recall (mean)", mr),
    ("mAP@0.5", map50),
    ("mAP@0.5:0.95", map5095),
    ("F1-score", f1),
):
    print(f"{label}: {value:.3f}")

# === 2. BLIP captioning model setup ===
blip_model_id = "Salesforce/blip-image-captioning-base"

# Run BLIP on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor prepares images for the model and decodes generated token ids.
processor = BlipProcessor.from_pretrained(blip_model_id)

# Load the captioning weights, then move the model to the selected device.
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_id)
blip_model = blip_model.to(device)

# === 3. Video processing: YOLO (bounding boxes) + BLIP (caption) ===
# Reads the input video frame by frame, draws YOLO detections on each frame,
# captions the raw frame with BLIP, overlays the combined text and writes the
# annotated frame to the output video.
video_path = "4_1.MOV"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail fast: otherwise width/height/fps are all 0, the loop never runs and
    # the script reports success while producing an empty output file.
    cap.release()
    raise OSError(f"Could not open input video: {video_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # CAP_PROP_FPS may come back as 0 (or NaN) when the backend cannot report
    # a frame rate; a writer cannot be created with it, so use a sane default.
    fps = 30.0
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter("output_yolo_blip.mp4", fourcc, fps, (width, height))
if not out.isOpened():
    cap.release()
    out.release()
    raise OSError("Could not open output video for writing: output_yolo_blip.mp4")

frame_count = 0

print("▶️ Запуск анализа видео...")

try:
    while cap.isOpened():
        ret, frame = cap.read()
        # Stop at end of stream, or once the processing cap is exceeded
        # (frames with index 0..1000 are processed).
        if not ret or frame_count > 1000:
            break

        # === YOLO predictions and bounding-box rendering ===
        yolo_results = model.predict(frame, imgsz=640, conf=0.25, verbose=False)
        annotated_frame = yolo_results[0].plot()  # copy of the frame with boxes drawn
        yolo_classes = yolo_results[0].names
        # Unique class names detected in this frame.
        yolo_detected = {
            yolo_classes[int(cls_id)] for cls_id in yolo_results[0].boxes.cls
        }

        # === BLIP caption ===
        # OpenCV frames are BGR; PIL/BLIP expect RGB.
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)
        inputs = processor(images=pil_image, return_tensors="pt").to(device)
        caption_ids = blip_model.generate(**inputs, max_length=30)
        caption_text = processor.decode(caption_ids[0], skip_special_tokens=True)

        # === Combined text ===
        # Sort the labels so the overlay/log text is stable between runs
        # (set iteration order is not).
        yolo_text = ", ".join(sorted(yolo_detected)) if yolo_detected else "None"
        combined_text = f"YOLO: {yolo_text} | BLIP: {caption_text}"
        print(f"[Frame {frame_count}] {combined_text}")

        # === Text overlay on the video ===
        # Truncate to 110 characters before drawing the overlay line.
        cv2.putText(annotated_frame, combined_text[:110], (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2, cv2.LINE_AA)
        out.write(annotated_frame)
        frame_count += 1
finally:
    # Always release the reader and finalise the output file, even when
    # inference fails part-way through.
    cap.release()
    out.release()

cv2.destroyAllWindows()
print("✅ Анализ завершён. Сохранено в: output_yolo_blip.mp4")
