In [None]:
!pip install ultralytics transformers torch torchvision opencv-python-headless

import time
from datetime import datetime

import cv2
import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from ultralytics import YOLO

# -----------------------------
# Load YOLOv8 model (object detection)
# -----------------------------
yolo_model = YOLO("yolov8n.pt")  # lightweight YOLOv8 model

# -----------------------------
# Load CLIP model (for classification)
# -----------------------------
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
# Run CLIP on the GPU when one is available; inputs are moved to the same
# device inside classify_clip.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model.to(device)

# -----------------------------
# Define labels
# -----------------------------
# Animal names offered to CLIP as zero-shot text prompts.
animal_classes = [
    "tiger", "lion", "leopard", "cheetah", "wolf", "bear", "deer",
    "elephant", "monkey", "snake", "rabbit", "wild boar", "bird", "peacock"
]
# Subset of animal_classes that triggers the "dangerous animal" alert.
dangerous_animals = {"tiger", "lion", "leopard", "cheetah", "wolf", "bear", "snake"}

# Water bodies / landscapes for CLIP zero-shot.
# Every entry must be unique: a repeated prompt would split that class's
# softmax probability between two identical labels and under-report it.
water_landscape_classes = [
    "river", "stream", "lake", "pond", "waterfall", "cliff", "mud",
    "poisonous plants", "rocks", "fallen tree", "dense forest", "trail", "volcano"
]

# Combined CLIP classes (the index into this list is what classify_clip returns
# the label for).
clip_classes = animal_classes + ["person"] + water_landscape_classes

# -----------------------------
# CLIP classification function
# -----------------------------
def classify_clip(crop_img):
    """Zero-shot classify an OpenCV image against ``clip_classes`` with CLIP.

    Args:
        crop_img: Image as a NumPy array. Three-channel arrays are taken to be
            in OpenCV's BGR channel order (as produced by
            ``cv2.VideoCapture.read``) and are converted to RGB before being
            handed to CLIP. Arrays with any other layout are passed through
            unchanged.

    Returns:
        A ``(label, probability)`` tuple where ``label`` is the entry of
        ``clip_classes`` with the highest softmax score and ``probability``
        is that score as a Python float.
    """
    # OpenCV stores colour frames as BGR, while PIL (and therefore the CLIP
    # processor) interprets the array as RGB. Without this swap red and blue
    # are exchanged, which degrades the classification.
    if crop_img.ndim == 3 and crop_img.shape[2] == 3:
        crop_img = cv2.cvtColor(crop_img, cv2.COLOR_BGR2RGB)

    image = Image.fromarray(crop_img)
    inputs = clip_processor(
        text=clip_classes, images=image, return_tensors="pt", padding=True
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = clip_model(**inputs)
        # One image in the batch -> probs has shape (1, len(clip_classes)).
        probs = outputs.logits_per_image.softmax(dim=1)

    best_idx = probs.argmax().item()
    return clip_classes[best_idx], float(probs[0, best_idx])

# -----------------------------
# Main video stream processing
# -----------------------------
def process_stream(url, fps=2, reconnect_delay=1.0):
    """Continuously analyse a video stream and print detection alerts.

    Frames are sampled at roughly ``fps`` frames per second. Each sampled
    frame is run through YOLO; crops of YOLO animal detections are refined
    with CLIP, and the full frame is classified with CLIP to spot water
    bodies / landscape hazards. Alerts are printed to stdout.

    The loop runs until it is interrupted (e.g. ``KeyboardInterrupt``); when
    reading fails the stream is reopened after ``reconnect_delay`` seconds.

    Args:
        url: Anything accepted by ``cv2.VideoCapture`` (stream URL, file path,
            device index).
        fps: Target number of frames to analyse per second. Must be positive.
        reconnect_delay: Seconds to wait before reopening the stream after a
            failed read, so a dead stream does not cause a busy loop.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError("fps must be positive")

    # YOLO labels that are treated as "some animal" and refined with CLIP.
    # Built once here instead of once per detected box.
    yolo_animal_labels = {
        "cat", "dog", "bird", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe",
    }

    cap = cv2.VideoCapture(url)
    if not cap.isOpened():
        cap.release()
        print("❌ Could not open stream")
        return

    # Determine frame interval. Clamp to at least 1: when the stream's fps is
    # lower than the requested fps the integer division yields 0, which would
    # raise ZeroDivisionError in the modulo below.
    stream_fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = max(1, int(stream_fps / fps)) if stream_fps > 0 else 15
    frame_count = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Stream ended or failed. Reconnecting...")
                # Release the old handle before opening a new one, and pause
                # so a permanently failing stream does not spin the CPU.
                cap.release()
                time.sleep(reconnect_delay)
                cap = cv2.VideoCapture(url)
                continue

            frame_count += 1
            if frame_count % frame_interval != 0:
                continue

            detected_animals = []
            detected_person = False
            detected_water = []
            frame_h, frame_w = frame.shape[:2]

            # Run YOLO detection
            results = yolo_model(frame, verbose=False)
            for r in results:
                for box in r.boxes:
                    cls_id = int(box.cls[0].item())
                    label = yolo_model.names[cls_id]

                    if label == "person":
                        detected_person = True
                        continue
                    if label not in yolo_animal_labels:
                        continue

                    x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                    # Clamp to the frame: a negative start index would make
                    # the slice wrap around and crop the wrong region.
                    x1, y1 = max(0, x1), max(0, y1)
                    x2, y2 = min(frame_w, x2), min(frame_h, y2)
                    crop = frame[y1:y2, x1:x2]
                    if crop.size == 0:
                        continue

                    animal_name, prob = classify_clip(crop)
                    # classify_clip scores against all CLIP classes, so the
                    # best label may be "person" or a landscape term; only
                    # genuine animal labels are reported as animals.
                    if animal_name in animal_classes:
                        detected_animals.append((animal_name, prob))

            # Run CLIP full-frame classification for water bodies / landscapes
            full_label, full_prob = classify_clip(frame)
            if full_label in water_landscape_classes:
                detected_water.append((full_label, full_prob))

            # -----------------------------
            # Generate alerts
            # -----------------------------
            for animal, prob in detected_animals:
                if animal in dangerous_animals:
                    print(f"⚠ {datetime.now()} Dangerous animal detected: {animal} (confidence: {prob:.2f})")
                else:
                    print(f"✅ {datetime.now()} Animal detected: {animal} (confidence: {prob:.2f})")

            if detected_person:
                print(f"👤 {datetime.now()} Person detected nearby.")

            for water, prob in detected_water:
                print(f"💧 {datetime.now()} Water body / hazard detected: {water} (confidence: {prob:.2f})")
    finally:
        # Runs on KeyboardInterrupt or any error so the capture handle is
        # always released. This function opens no GUI windows, so there is
        # nothing else to tear down.
        cap.release()

# -----------------------------
# Run
# -----------------------------
# Replace the placeholder with a real stream URL (or video file path) before running.
url = "YOUR_VIDEO_STREAM_URL_HERE"
# Starts the detection loop with the default sampling rate.
process_stream(url=url)




Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

👤 2025-09-20 05:58:12.821911 Person detected nearby.
💧 2025-09-20 05:58:12.822303 Water body / hazard detected: stream (confidence: 0.41)
💧 2025-09-20 05:58:13.187130 Water body / hazard detected: stream (confidence: 0.81)
✅ 2025-09-20 05:58:13.815247 Animal detected: rabbit (confidence: 0.73)
✅ 2025-09-20 05:58:13.815370 Animal detected: rabbit (confidence: 0.80)
⚠ 2025-09-20 05:58:14.222573 Dangerous animal detected: lion (confidence: 0.74)
⚠ 2025-09-20 05:58:14.800707 Dangerous animal detected: lion (confidence: 0.99)
⚠ 2025-09-20 05:58:14.800809 Dangerous animal detected: bear (confidence: 0.32)
⚠ 2025-09-20 05:58:15.320883 Dangerous animal detected: lion (confidence: 0.98)
⚠ 2025-09-20 05:58:15.321037 Dangerous animal detected: lion (confidence: 0.98)
✅ 2025-09-20 05:58:15.321061 Animal detected: person (confidence: 0.37)
⚠ 2025-09-20 05:58:15.792095 Dangerous animal detected: lion (confidence: 0.69)
✅ 2025-09-20 05:58:15.792198 Animal detected: monkey (confidence: 0.67)
⚠ 2025-09

KeyboardInterrupt: 