In [3]:
import os
import json
import csv
import wave
import cv2
import numpy as np
import pandas as pd
import ffmpeg
import joblib
import tensorflow as tf
import mediapipe as mp
import torch  # For YOLO object detection
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import InputLayer as KerasInputLayer  # For custom layer

# Define a custom InputLayer to address the 'batch_shape' issue during model loading
class CustomInputLayer(KerasInputLayer):
    def _init_(self, **kwargs):
        if 'batch_shape' in kwargs:
            kwargs['batch_input_shape'] = kwargs.pop('batch_shape')
        super(CustomInputLayer, self)._init_(**kwargs)

# Scene detection imports
from scenedetect import open_video, SceneManager
from scenedetect.detectors import ContentDetector, ThresholdDetector

import vosk
import torch
import torch.nn as nn

class Custom3DCNN(nn.Module):
    def __init__(self, num_classes):
        super(Custom3DCNN, self).__init__()
        self.conv1 = nn.Conv3d(in_channels=3, out_channels=32, kernel_size=(3, 3, 3), stride=1, padding=1)
        self.bn1 = nn.BatchNorm3d(32)
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv2 = nn.Conv3d(32, 64, kernel_size=(3, 3, 3), stride=1, padding=1)
        self.bn2 = nn.BatchNorm3d(64)
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), stride=1, padding=1)
        self.bn3 = nn.BatchNorm3d(128)
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        # Adjust the flattening dimension based on your input size.
        self.fc1 = nn.Linear(128 * 3 * 14 * 14, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.pool1(self.relu(self.bn1(self.conv1(x))))
        x = self.pool2(self.relu(self.bn2(self.conv2(x))))
        x = self.pool3(self.relu(self.bn3(self.conv3(x))))
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x



In [4]:
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import InputLayer as KerasInputLayer
print("Imports succeeded!")


Imports succeeded!


In [None]:
import os
import json
import csv
import wave
import cv2
import numpy as np
import pandas as pd
import ffmpeg
import joblib
import tensorflow as tf
import mediapipe as mp
import torch  # For YOLO object detection
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import InputLayer as KerasInputLayer  # For custom layer

# -------------------- Global Model Loading --------------------
# Load Face Recognition Models
vgg_face_descriptor = load_model('vgg_face_descriptor.h5', custom_objects={'InputLayer': KerasInputLayer})
scaler = joblib.load('scaler.pkl')
pca = joblib.load('pca.pkl')
clf = joblib.load('svm_classifier.pkl')
le = joblib.load('label_encoder.pkl')

# Load YOLOv5 model (using the x variant for high accuracy & many classes)
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5x', pretrained=True)

# Load Action Recognition Model
action_model = torch.load('actionCNN.pth', map_location=torch.device('cpu'))
action_model.eval()

# Define a mapping from action class indices to human-readable labels
action_labels = {
    0: "Walking",
    1: "Running",
    2: "Jumping",
}

# -------------------- Action Detection Function --------------------
def detect_action_from_frame(frame_path):
    image = cv2.imread(frame_path, cv2.IMREAD_COLOR)
    if image is None:
        print("[LOG] detect_action_from_frame: Could not read image from", frame_path)
        return "NoActionDetected"
    
    # Preprocess: resize to 112x112 (as expected by the training), convert BGR->RGB, normalize
    image = cv2.resize(image, (112, 112))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = image.astype(np.float32) / 255.0
    
    # Convert image to a torch tensor with shape (3, 112, 112)
    tensor_image = torch.tensor(image).permute(2, 0, 1)
    
    # Create a time dimension by replicating the frame 24 times.
    time_dim = 24
    # First, add a batch dimension (resulting shape: (1, 3, 112, 112)),
    # then insert a time dimension and repeat along it:
    input_tensor = tensor_image.unsqueeze(0).unsqueeze(2).repeat(1, 1, time_dim, 1, 1)
    # Now input_tensor has shape: (1, 3, 24, 112, 112)
    
    # Print the shape to verify
    print("Input tensor shape for action model:", input_tensor.shape)
    
    with torch.no_grad():
        output = action_model(input_tensor)
    
    predicted_idx = torch.argmax(output, dim=1).item()
    predicted_action = action_labels.get(predicted_idx, "Unknown")
    print(f"[LOG] detect_action_from_frame: Detected action '{predicted_action}'.")
    return predicted_action



# -------------------- Pipeline Functions --------------------
# Function 1: Detect Scenes using scenedetect
def detect_scenes(video_path, content_threshold=50.0, threshold_val=20, min_scene_len=15):
    from scenedetect import open_video, SceneManager
    from scenedetect.detectors import ContentDetector, ThresholdDetector

    video = open_video(video_path)
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=content_threshold))
    scene_manager.add_detector(ThresholdDetector(threshold=threshold_val, min_scene_len=min_scene_len))
    scene_manager.detect_scenes(video=video)
    scene_list = scene_manager.get_scene_list()
    print(f"[LOG] detect_scenes: Detected {len(scene_list)} scene(s) in '{video_path}'.")
    return scene_list

# Function 2: Trim Video Clip using ffmpeg-python
def trim_clip(input_video, start_time, end_time, clip_index, max_duration=4.0, output_dir='clips'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    duration = end_time - start_time
    if duration > max_duration:
        end_time = start_time + max_duration  # Truncate to max_duration
    clip_name = f"squid_Scene_{clip_index:03d}.mp4"
    output_path = os.path.join(output_dir, clip_name)
    
    try:
        ffmpeg.input(input_video, ss=start_time, t=(end_time - start_time)) \
              .output(output_path, vcodec='libx264', acodec='aac') \
              .run(overwrite_output=True)
    except ffmpeg.Error as e:
        print("Error trimming clip:", e)
    
    print(f"[LOG] trim_clip: Trimmed clip {clip_name} from {start_time}s to {end_time}s (max {max_duration}s).")
    return output_path

# Function 3: Extract Key Frame using OpenCV
def extract_key_frame(clip_path, frame_index='middle', output_dir='frames'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    cap = cv2.VideoCapture(clip_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = frame_count / fps if fps > 0 else 0
    
    # Determine target time: use the middle of the clip by default
    target_time = duration / 2 if frame_index == 'middle' else 0
    target_frame = int(target_time * fps)
    
    cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame)
    ret, frame = cap.read()
    if ret:
        frame_file_name = os.path.basename(clip_path).split('.')[0] + "_keyframe.jpg"
        frame_path = os.path.join(output_dir, frame_file_name)
        cv2.imwrite(frame_path, frame)
        print(f"[LOG] extract_key_frame: Extracted key frame at {target_time:.2f}s -> {frame_path}")
        cap.release()
        return frame_path
    else:
        cap.release()
        print("[LOG] extract_key_frame: Failed to extract frame.")
        return None

# Function 4: Speech-to-Text using Vosk (with ffmpeg for audio extraction)
def speech_to_text(clip_path, model_path="model"):
    temp_audio = "temp_audio.wav"
    
    try:
        ffmpeg.input(clip_path).output(temp_audio, ac=1, ar='16k').run(overwrite_output=True)
    except ffmpeg.Error as e:
        print("Error extracting audio:", e)
        return ""
    
    wf = wave.open(temp_audio, "rb")
    print("[LOG] speech_to_text: Checking audio format...")
    print("    Channels:", wf.getnchannels())
    print("    Sample Width:", wf.getsampwidth())
    print("    Frame Rate:", wf.getframerate())
    
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() != 16000:
        wf.close()
        os.remove(temp_audio)
        raise ValueError("Audio must be WAV mono PCM at 16kHz.")
    
    import vosk
    model_instance = vosk.Model(lang="en-us") if model_path == "model" else vosk.Model(model_path)
    rec = vosk.KaldiRecognizer(model_instance, wf.getframerate())
    
    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result_json = json.loads(rec.Result())
            results.append(result_json)
    
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)
    wf.close()
    os.remove(temp_audio)
    
    recognized_texts = [r["text"] for r in results if "text" in r]
    full_text = " ".join(recognized_texts)
    print(f"[LOG] speech_to_text: Transcribed {len(full_text.split())} word(s).")
    return full_text

# Function 5: Face Detection & Recognition using MediaPipe
def recognize_face_mediapipe(frame_path):
    mp_face_detection = mp.solutions.face_detection
    face_detection = mp_face_detection.FaceDetection(model_selection=0, min_detection_confidence=0.5)
    
    image = cv2.imread(frame_path)
    if image is None:
        print("[LOG] recognize_face_mediapipe: Could not read image from", frame_path)
        return "NoFaceDetected"
    
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = face_detection.process(image_rgb)
    
    if not results.detections:
        print("[LOG] recognize_face_mediapipe: No face detected in the frame.")
        return "NoFaceDetected"
    
    detection = results.detections[0]
    bboxC = detection.location_data.relative_bounding_box
    ih, iw, _ = image.shape
    x_min = max(0, int(bboxC.xmin * iw))
    y_min = max(0, int(bboxC.ymin * ih))
    width = int(bboxC.width * iw)
    height = int(bboxC.height * ih)
    face_snippet = image[y_min:y_min+height, x_min:x_min+width]
    
    identity = recognize_face_from_snippet(face_snippet)
    return identity

def recognize_face_from_snippet(face_snippet):
    global vgg_face_descriptor, scaler, pca, clf, le
    img = cv2.cvtColor(face_snippet, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = img.astype(np.float32) / 255.0
    img_expanded = np.expand_dims(img, axis=0)
    
    embedding_vector = vgg_face_descriptor.predict(img_expanded)[0]
    embedding_vector = scaler.transform([embedding_vector])
    embedding_vector = pca.transform(embedding_vector)
    y_pred = clf.predict(embedding_vector)
    predicted_name = le.inverse_transform(y_pred)[0]
    return predicted_name

# Function 6: Object Detection using YOLOv5
def detect_objects_yolo(image_path, conf_threshold=0.5):
    image = cv2.imread(image_path)
    if image is None:
        print(f"[LOG] detect_objects_yolo: Could not read image from {image_path}")
        return []
    
    results = yolo_model(image)
    df = results.pandas().xyxy[0]
    df = df[df['confidence'] >= conf_threshold]
    
    img_area = image.shape[0] * image.shape[1]
    detections = []
    for index, row in df.iterrows():
        x1, y1, x2, y2 = row['xmin'], row['ymin'], row['xmax'], row['ymax']
        bbox_area = (x2 - x1) * (y2 - y1)
        prominence = bbox_area / img_area
        detections.append({
            "class": row['name'],
            "confidence": float(row['confidence']),
            "prominence": prominence
        })
    
    print(f"[LOG] detect_objects_yolo: Found {len(detections)} high-confidence objects.")
    return detections

def recognize_text(image_path):
    try:
        # Open the image using PIL and run pytesseract OCR
        img = Image.open(image_path)
        text = pytesseract.image_to_string(img)
        print(f"[LOG] recognize_text: OCR found {len(text.split())} words.")
        return text.strip()
    except Exception as e:
        print("[LOG] recognize_text: Error processing OCR:", e)
        return ""
    
                           
# -------------------- Main Processing Pipeline --------------------
def process_video(video_path):
    print(f"[LOG] process_video: Starting processing for '{video_path}'...")
    
    # 1) Scene Detection
    scenes = detect_scenes(video_path, content_threshold=10.0, threshold_val=20, min_scene_len=15)
    print(f"[DEBUG] process_video: Scenes -> {scenes}")
    if not scenes:
        print("[DEBUG] process_video: No scenes detected! The CSV will likely be empty.")
    
    results = []
    for idx, scene in enumerate(scenes, start=1):
        start_sec = scene[0].get_seconds()
        end_sec = scene[1].get_seconds()
        print(f"[LOG] process_video: Processing scene #{idx} | Start: {start_sec:.2f}s, End: {end_sec:.2f}s.")
        
        # 2) Trim the scene
        clip_path = trim_clip(video_path, start_sec, end_sec, idx, max_duration=4.0, output_dir='clips')
        
        # 3) Extract key frame
        frame_path = extract_key_frame(clip_path, frame_index='middle', output_dir='frames')
        
        # 4) Speech-to-Text from the clip
        recognized_speech = speech_to_text(clip_path)
        
        # 5) Face detection & recognition on the key frame
        face_identity = recognize_face_mediapipe(frame_path)
        
        # 6) Object detection on the key frame
        object_detections = detect_objects_yolo(frame_path)
        
        # 7) Action detection on the key frame
        action_detected = detect_action_from_frame(frame_path)

        ocr_text = recognize_text(frame_path)
        
        # 8) Collect results for this scene
        record = {
            "clip_name": os.path.basename(clip_path),
            "frame_name": os.path.basename(frame_path) if frame_path else "None",
            "face_detected": face_identity,
            "caption": recognized_speech,
            "object_detection": json.dumps(object_detections),
            "ocr_text": ocr_text,
            "action_detected": action_detected,
            "timestamp_start": start_sec,
            "timestamp_end": end_sec
        }
        results.append(record)
        print(f"[LOG] process_video: Scene #{idx} processing complete.\n")
    
    print(f"[DEBUG] process_video: Final results -> {results}")
    
    # Write results to CSV
    csv_filename = "final_results.csv"
    fieldnames = ["clip_name", "frame_name", "face_detected", "caption", "object_detection", "ocr_text" ,"action_detected", "timestamp_start", "timestamp_end"]
    with open(csv_filename, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    
    print(f"[LOG] process_video: All scenes processed. Results saved to '{csv_filename}'.")
    print("Final Aggregated Results:")
    for record in results:
        print(record)

# -------------------- Main Guard --------------------
if __name__ == "__main__":
    print("[DEBUG] Starting main pipeline process...")
    input_video = "video.mp4"
    process_video(input_video)


Using cache found in C:\Users\acer/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-3-2 Python-3.10.13 torch-2.0.1+cpu CPU

Fusing layers... 
YOLOv5x summary: 444 layers, 86705005 parameters, 0 gradients
Adding AutoShape... 
INFO:pyscenedetect:Detecting scenes...


[DEBUG] Starting main pipeline process...
[LOG] process_video: Starting processing for 'video.mp4'...
[LOG] detect_scenes: Detected 83 scene(s) in 'video.mp4'.
[DEBUG] process_video: Scenes -> [(00:00:00.000 [frame=0, fps=30.000], 00:00:03.133 [frame=94, fps=30.000]), (00:00:03.133 [frame=94, fps=30.000], 00:00:05.533 [frame=166, fps=30.000]), (00:00:05.533 [frame=166, fps=30.000], 00:00:07.633 [frame=229, fps=30.000]), (00:00:07.633 [frame=229, fps=30.000], 00:00:08.333 [frame=250, fps=30.000]), (00:00:08.333 [frame=250, fps=30.000], 00:00:10.633 [frame=319, fps=30.000]), (00:00:10.633 [frame=319, fps=30.000], 00:00:13.833 [frame=415, fps=30.000]), (00:00:13.833 [frame=415, fps=30.000], 00:00:19.400 [frame=582, fps=30.000]), (00:00:19.400 [frame=582, fps=30.000], 00:00:22.933 [frame=688, fps=30.000]), (00:00:22.933 [frame=688, fps=30.000], 00:00:26.433 [frame=793, fps=30.000]), (00:00:26.433 [frame=793, fps=30.000], 00:00:30.833 [frame=925, fps=30.000]), (00:00:30.833 [frame=925, fps=