<a href="https://colab.research.google.com/github/AfifaMasood/AfifaMasood/blob/main/testing4_yolo%2Bresnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that the model weights and videos
# under /content/drive/MyDrive used later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
pip install ultralytics pandas opencv-python deep_sort_realtime


Collecting ultralytics
  Downloading ultralytics-8.3.75-py3-none-any.whl.metadata (35 kB)
Collecting deep_sort_realtime
  Downloading deep_sort_realtime-1.3.2-py3-none-any.whl.metadata (12 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.meta

In [None]:
import tensorflow as tf
import torch
import torch.nn as nn
import cv2
import numpy as np
from PIL import Image
from torchvision import transforms, models
from ultralytics import YOLO


# --- Configuration ---------------------------------------------------------
# Target size that preprocess_frame resizes each frame to.
IMAGE_HEIGHT = 256
IMAGE_WIDTH = 256

# Label names, indexed by the predicted class id.
CLASSES_LIST = ["Normal", "Shoplifting"]

# A detected person is kept only if its bounding box is taller than this (pixels).
HEIGHT_THRESHOLD = 250

def preprocess_frame(frame):
    """Resize a frame to IMAGE_HEIGHT x IMAGE_WIDTH and divide pixel values by 255.

    Args:
        frame: Image array (e.g. a frame read from ``cv2.VideoCapture``).

    Returns:
        A float array with IMAGE_HEIGHT rows and IMAGE_WIDTH columns whose
        values are the resized pixel values divided by 255.0 (i.e. in [0, 1]
        for 8-bit input).
    """
    # cv2.resize expects dsize as (width, height); the array it returns then
    # has shape (height, width, ...). Passing (height, width) only worked
    # because both constants happen to be equal.
    resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
    return resized_frame / 255.0

# Load the frame-level classifier (TensorFlow/Keras) from Google Drive.
# process_video calls frame_model.predict on each frame and reads output
# index 0 as the "Normal" score and index 1 as the "Shoplifting" score.
frame_model = tf.keras.models.load_model('/content/drive/MyDrive/60_model_bs_256_frame2.h5')

def is_clear_image(image, sharpness_threshold=50, brightness_threshold=40):
    """Return True when a BGR image is both sharp enough and bright enough.

    Sharpness is the variance of the Laplacian of the grayscale image;
    brightness is the mean grayscale intensity.

    Args:
        image: BGR image array. ``None`` or an empty array (e.g. a degenerate
            crop) is treated as "not clear".
        sharpness_threshold: Laplacian variance must be strictly greater
            than this value.
        brightness_threshold: Mean gray level must be strictly greater than
            this value.

    Returns:
        bool: True if both criteria are met, otherwise False.
    """
    # An empty crop would make cv2.cvtColor raise; reject it up front.
    if image is None or getattr(image, "size", 0) == 0:
        return False

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
    mean_intensity = np.mean(gray)

    # bool() so callers get a plain Python bool rather than numpy.bool_.
    return bool(laplacian_var > sharpness_threshold and mean_intensity > brightness_threshold)

def process_video(video_path, output_video_path,
                  yolo_weights="yolov8n.pt",
                  resnet_weights="/content/drive/MyDrive/resnet34_shoplifting.pth"):
    """Annotate a video with frame-level and person-level shoplifting predictions.

    For every frame:
      1. The global ``frame_model`` scores the whole frame as Normal/Shoplifting.
      2. YOLO detects objects; class-0 (person) boxes taller than
         ``HEIGHT_THRESHOLD`` pixels are kept.
      3. If the frame is "Normal", every kept person gets a green "Normal" box.
         Otherwise only the tallest kept person is cropped and, if the crop
         passes ``is_clear_image``, classified by the fine-tuned ResNet-34;
         the box is red for "Shoplifting" and green for "Normal".
      4. The frame-level prediction text is drawn and the frame is written.

    Args:
        video_path: Path of the input video.
        output_video_path: Path of the annotated output video (mp4v codec).
        yolo_weights: Weights passed to ``YOLO(...)``.
        resnet_weights: Path to the ResNet-34 ``state_dict`` with a 2-class head.

    Raises:
        IOError: If the input video cannot be opened or the output writer
            cannot be created (e.g. the output directory does not exist).
    """
    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        video_capture.release()
        raise IOError(f"Could not open input video: {video_path}")

    out = None
    try:
        # Keep the fractional frame rate (e.g. 29.97) so the output duration
        # matches the input; fall back to 30 when the container reports none.
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 30.0
        frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
        if not out.isOpened():
            raise IOError(f"Could not open output video for writing: {output_video_path}")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        yolov8_model = YOLO(yolo_weights)

        # Every parameter is overwritten by load_state_dict (strict by
        # default), so there is no need to download ImageNet weights first.
        resnet_model = models.resnet34(weights=None)
        resnet_model.fc = nn.Linear(resnet_model.fc.in_features, 2)
        # map_location lets GPU-saved weights load on a CPU-only runtime.
        resnet_model.load_state_dict(torch.load(resnet_weights, map_location=device))
        resnet_model.to(device)
        resnet_model.eval()

        transform = transforms.Compose([
            transforms.Resize((64, 64)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            # --- Frame-level prediction ---
            processed_frame = preprocess_frame(frame)
            input_data = np.expand_dims(processed_frame, axis=0)
            predictions = frame_model.predict(input_data, verbose=0)[0]

            # Compare probabilities at 2-decimal precision.
            normal_probability = round(predictions[0], 2)
            shoplifting_probability = round(predictions[1], 2)

            # Ties and anything >= 0.50 are forced to "Shoplifting".
            if normal_probability == shoplifting_probability or shoplifting_probability >= 0.50:
                predicted_class = 1
            else:
                predicted_class = int(np.argmax(predictions))
            predicted_label = CLASSES_LIST[predicted_class]
            text = f"Prediction: {predicted_label} ({predictions[1]:.2f})"

            # --- Person detection (on the clean frame, before any drawing) ---
            results = yolov8_model(frame, verbose=False)  # no per-frame log spam
            detections = results[0].boxes

            persons = []
            for box, cls in zip(detections.xyxy, detections.cls):
                if int(cls) != 0:  # Only process persons
                    continue
                x1, y1, x2, y2 = map(int, box.cpu().numpy())
                box_height = y2 - y1
                if box_height > HEIGHT_THRESHOLD:
                    persons.append((box_height, x1, y1, x2, y2))

            if predicted_label == "Normal":
                # Draw green bounding boxes for all kept persons.
                for _, x1, y1, x2, y2 in persons:
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(frame, "Normal", (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            elif persons:
                # Use the tallest bounding box as the "closest" person.
                _, x1, y1, x2, y2 = max(persons, key=lambda p: p[0])

                person_image = frame[y1:y2, x1:x2]
                # Skip degenerate (empty) crops as well as blurry/dark ones.
                if person_image.size and is_clear_image(person_image):
                    person_rgb = cv2.cvtColor(person_image, cv2.COLOR_BGR2RGB)
                    person_tensor = transform(Image.fromarray(person_rgb)).unsqueeze(0).to(device)

                    with torch.no_grad():
                        output = resnet_model(person_tensor)
                    _, predicted = torch.max(output, 1)

                    person_label = "Shoplifting" if predicted.item() == 1 else "Normal"
                    color = (0, 0, 255) if person_label == "Shoplifting" else (0, 255, 0)

                    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                    cv2.putText(frame, person_label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            # Draw the frame-level text last so it never ends up in the pixels
            # fed to YOLO or to the ResNet crop.
            cv2.putText(frame, text, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            out.write(frame)
    finally:
        # Always release the capture/writer, even if inference raised.
        video_capture.release()
        if out is not None:
            out.release()

    print(f"Processing complete. Output saved as {output_video_path}")

# Run the pipeline on one sample clip and save the annotated video to Drive.
process_video(
    video_path="/content/drive/MyDrive/FYP Dataset/Shoplifting/Shoplifting-72.mp4",
    output_video_path="/content/drive/MyDrive/bounding_box_folder2/Shoplifting-72_output_video.mp4",
)





0: 384x640 4 persons, 141.3ms
Speed: 6.9ms preprocess, 141.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 132.4ms
Speed: 4.8ms preprocess, 132.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 215.2ms
Speed: 5.5ms preprocess, 215.2ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 201.8ms
Speed: 4.5ms preprocess, 201.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 223.6ms
Speed: 6.6ms preprocess, 223.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 200.5ms
Speed: 4.5ms preprocess, 200.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 212.5ms
Speed: 7.1ms preprocess, 212.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 198.3ms
Speed: 7.0ms preprocess, 198.3ms inference, 1.5ms postprocess per 