In [None]:
import os
import cv2
import numpy as np
from keras.models import load_model
import time
from IPython.display import display
import PIL.Image
import io
import tensorflow as tf
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
from torchvision.transforms import functional as F

# Load the trained Keras action-recognition model; the path is relative, so the
# .h5 file must be in the kernel's current working directory.
model = load_model('WTMRNet2.h5')

# Action class names. process_video1 indexes this list with the argmax of the
# model's prediction, so the order presumably has to match the label order used
# when the model was trained — TODO confirm against the training code before
# reordering or editing entries.
categories = ['wave', 'walk', 'turn', 'throw', 'talk', 'stand', 'smile', 'situp', 'sit', 'shake_hands', 'run', 'push', 'punch', 'pour',
              'pick', 'laugh', 'jump', 'hug', 'hit', 'handstand', 'fall_floor', 'eat', 'drink', 'dribble', 'climb_stairs', 'climb', 'clap',
              'chew', 'catch', 'brush_hair']

In [None]:
# Haar cascade used by detect_person to find full-body person boxes.
haar_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_fullbody.xml')
# CascadeClassifier does not raise when the XML file cannot be read; it simply
# stays empty and only fails later inside detectMultiScale with an opaque
# assertion. Fail fast here so the cause is obvious.
if haar_cascade.empty():
    raise IOError("Failed to load 'haarcascade_fullbody.xml' from cv2.data.haarcascades")

# Run the object detector on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Faster R-CNN (ResNet-50 FPN backbone) with the COCO_V1 pretrained weights.
object_detector = fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.COCO_V1).to(device)
# Switch to inference mode; detect_objects relies on the model returning
# predictions (labels/scores/boxes) rather than training losses.
object_detector.eval()

In [None]:
def preprocess_frame(frame):
    """Turn a BGR image into the single-sample tensor fed to the action model.

    The image is converted to grayscale, resized to 64x64, divided by 255.0
    and reshaped to (1, 64, 64, 1): one sample, one channel.

    Args:
        frame: BGR image array as produced by OpenCV.

    Returns:
        Array of shape (1, 64, 64, 1) holding the scaled grayscale pixels.
    """
    target_size = (64, 64)
    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    scaled = cv2.resize(grayscale, target_size) / 255.0
    # Leading axis is the batch dimension, trailing axis the single channel.
    return scaled.reshape(1, *target_size, 1)

In [None]:
def detect_person(frame):
    """Find people in a BGR frame with the module-level Haar cascade.

    Args:
        frame: BGR image array.

    Returns:
        Whatever ``haar_cascade.detectMultiScale`` returns for the grayscale
        frame; process_video1 unpacks each entry as an (x, y, w, h) box.
    """
    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # Same tuning as before, spelled out by name: 1.1 scale step, 4 neighbours.
    return haar_cascade.detectMultiScale(grayscale, scaleFactor=1.1, minNeighbors=4)

In [None]:
def detect_objects(frame):
    """Detect COCO objects in a frame with the module-level Faster R-CNN model.

    The frame is shrunk to 320x240 before inference to keep detection cheap.
    Only detections with a score above 0.5 are kept.

    Class names are taken from the category list bundled with the COCO_V1
    weights. The previous implementation looked them up in ``COCO_LABELS``,
    a name that is never defined in this notebook, so the first confident
    detection raised ``NameError``.

    Args:
        frame: BGR image array.

    Returns:
        List of ``(label_name, box)`` tuples. ``box`` is the detector's box
        converted to a NumPy array; it is expressed in the coordinates of the
        resized 320x240 image, not of the original frame.
    """
    # Label ids returned by the detector index straight into this list.
    category_names = FasterRCNN_ResNet50_FPN_Weights.COCO_V1.meta["categories"]

    small_frame = cv2.resize(frame, (320, 240))
    image = cv2.cvtColor(small_frame, cv2.COLOR_BGR2RGB)
    image_tensor = F.to_tensor(image).unsqueeze(0).to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        predictions = object_detector(image_tensor)[0]

    detected_objects = []
    for label, score, box in zip(predictions['labels'], predictions['scores'], predictions['boxes']):
        if score > 0.5:
            label_id = label.item()
            if 0 <= label_id < len(category_names):
                label_name = category_names[label_id]
            else:
                # Keep the original fallback text for ids outside the list.
                label_name = f"Unknown ({label_id})"
            detected_objects.append((label_name, box.cpu().numpy()))

    return detected_objects

In [None]:
def extract_action_frames(video_path, start_time, end_time, fps):
    """Grab up to three frames of an action: at its start, midpoint and end.

    Args:
        video_path: Path of the video to open.
        start_time: Action start, in seconds.
        end_time: Action end, in seconds.
        fps: Frame rate used to convert the times to frame indices.

    Returns:
        List of the frames that could be read, in start/middle/end order.
        Positions whose read fails are skipped, so the list may hold fewer
        than three frames.
    """
    capture = cv2.VideoCapture(video_path)

    midpoint = (start_time + end_time) / 2
    positions = [int(moment * fps) for moment in (start_time, midpoint, end_time)]

    frames = []
    for position in positions:
        # Seek to the wanted frame, then decode it.
        capture.set(cv2.CAP_PROP_POS_FRAMES, position)
        ok, image = capture.read()
        if not ok:
            continue
        frames.append(image)

    capture.release()
    return frames

In [None]:
def _report_person_actions(video_path, actions, fps):
    """Print each recorded action of one person, display its sample frames, and return the summary dict."""
    summary = {}
    for action, details in actions.items():
        start_time, end_time = details['start_time'], details['end_time']
        duration = end_time - start_time
        extracted_frames = extract_action_frames(video_path, start_time, end_time, fps)

        summary[action] = {
            'count': details['count'], 'start_time': start_time, 'end_time': end_time,
            'duration': duration, 'frames': extracted_frames, 'context': details['context']
        }

        print(f"- {action} → Timestamp: {start_time:.2f}s | Duration: {duration:.2f}s ({start_time:.2f}s - {end_time:.2f}s)")
        print(f"Context: {details['context']}")
        for frame in extracted_frames:
            # Round-trip through JPEG so PIL displays the BGR frame with correct colours.
            encoded, buffer = cv2.imencode('.jpg', frame)
            if not encoded:
                continue
            display(PIL.Image.open(io.BytesIO(buffer)))
    return summary


def _print_action_extremes(summary):
    """Print the most/least frequent and the longest/shortest action of one person."""
    if not summary:
        return

    max_action = max(summary, key=lambda x: summary[x]['count'])
    min_action = min(summary, key=lambda x: summary[x]['count'])
    longest_action = max(summary, key=lambda x: summary[x]['duration'])
    shortest_action = min(summary, key=lambda x: summary[x]['duration'])

    print(f"Most Frequent Action: {max_action} ({summary[max_action]['count']} times)")
    print(f"Least Frequent Action: {min_action} ({summary[min_action]['count']} times)")
    print(f"Action with Longest Duration: {longest_action} ({summary[longest_action]['duration']:.2f}s)")
    print(f"Action with Shortest Duration: {shortest_action} ({summary[shortest_action]['duration']:.2f}s)")


def process_video1(video_path, confidence_threshold=0.2, object_detection_interval=10):
    """Analyse a video frame by frame and print a per-person action report.

    For every frame, people are located with ``detect_person``; each crop is
    classified by the module-level ``model`` and the prediction is recorded
    when its confidence exceeds ``confidence_threshold``. Scene objects are
    refreshed with ``detect_objects`` every ``object_detection_interval``
    frames and stored as the "context" of an action the first time that
    action is seen for a person.

    Person ids are the position of the box in that frame's detection list, so
    identities are not tracked across frames. An action's start/end are the
    first and last timestamps at which it was predicted for that id.

    Args:
        video_path: Path of the video file to analyse.
        confidence_threshold: Minimum model confidence for a prediction to be
            recorded (default 0.2, the value previously hard-coded).
        object_detection_interval: Run object detection every N-th frame
            (default 10, the value previously hard-coded).

    Returns:
        None. The report is printed and sample frames are displayed.

    Raises:
        IOError: If the video cannot be opened.
        ValueError: If the video reports a non-positive frame rate, which
            would make every timestamp undefined.
    """
    cap = cv2.VideoCapture(video_path)
    # Previously an unopenable file fell through to a ZeroDivisionError at the
    # duration computation; report the real cause instead.
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_path}")

    frame_count = 0
    persons_info = {}
    detected_objects = []

    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            raise ValueError(f"Video reports an invalid frame rate ({fps}): {video_path}")

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            person_boxes = detect_person(frame)

            # Object detection is the expensive step, so reuse the last result in between.
            if frame_count % object_detection_interval == 0:
                detected_objects = detect_objects(frame)

            environment_context = ", ".join([obj[0] for obj in detected_objects]) if detected_objects else "Unknown"
            timestamp = frame_count / fps

            for idx, (x, y, w, h) in enumerate(person_boxes):
                cropped_person = frame[y:y + h, x:x + w]
                if cropped_person.size == 0:
                    # A degenerate box cannot be resized for the classifier.
                    continue

                input_frame = preprocess_frame(cropped_person)
                predictions = model.predict(input_frame, verbose=0)
                predicted_label = categories[np.argmax(predictions)]
                confidence = np.max(predictions)

                if confidence > confidence_threshold:
                    person_actions = persons_info.setdefault(idx, {})
                    record = person_actions.get(predicted_label)
                    if record is None:
                        record = person_actions[predicted_label] = {
                            'count': 0, 'timestamps': [], 'start_time': timestamp,
                            'end_time': timestamp, 'context': environment_context
                        }

                    record['timestamps'].append(timestamp)
                    record['count'] += 1
                    record['end_time'] = timestamp

            frame_count += 1
    finally:
        # Release the capture even if detection or prediction raised.
        cap.release()

    total_persons = len(persons_info)
    total_actions = sum(len(actions) for actions in persons_info.values())
    total_duration = frame_count / fps

    # The person count line is only shown when it is not exactly one person.
    if total_persons != 1:
        print(f"Total Persons Detected: {total_persons}")
    print(f"Total Actions Detected: {total_actions}")
    print(f"Total Duration of video: {total_duration:.2f}s")

    for person_id, actions in persons_info.items():
        print(f"\nPerson {person_id + 1}:")
        summary = _report_person_actions(video_path, actions, fps)
        _print_action_extremes(summary)

In [None]:
# Analyse the first sample clip (relative path); prints the action report and displays sample frames.
process_video1('walking and sitting.mp4')

In [None]:
# Analyse the second sample clip (relative path); prints the action report and displays sample frames.
process_video1('standup.avi')

In [None]:
# Analyse the third sample clip (relative path); prints the action report and displays sample frames.
process_video1('SQA1.mp4')