In [3]:
import imageio
import os
import imagehash
from PIL import Image
import numpy as np
import cv2
import json
import concurrent.futures

def extract_frames(video_path):
    """Read every frame of a video and return them as RGB arrays.

    Frames are decoded sequentially until the reader reports that no more
    data is available. No sub-sampling is performed, so all frames of the
    video are held in memory at once.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.

    Returns:
        A list of frames in read order, each converted from BGR to RGB
        channel order. The list is empty when the very first read fails.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # No more frames are available
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Always free the capture handle, even if decoding or the colour
        # conversion raises part-way through the video.
        cap.release()

    return frames

def calculate_hashes(frames):
    """Return the average-hash of every frame, in the same order as *frames*.

    Each frame is wrapped in a PIL image via ``Image.fromarray`` and then
    passed to ``imagehash.average_hash``.
    """
    hashes = []
    for frame in frames:
        pil_image = Image.fromarray(frame)
        hashes.append(imagehash.average_hash(pil_image))
    return hashes

def compare_hashes(query_hashes, candidate_hashes):
    """Find the best alignment of the query hashes inside a candidate video.

    The query sequence is slid over the candidate sequence one frame at a
    time. For every offset the per-frame Hamming distances (number of
    differing hash bits) are summed, and the smallest sum is returned.

    Args:
        query_hashes: Sequence of hash objects exposing a ``.hash`` array of
            bits (as ``imagehash`` hashes do). Must not be empty.
        candidate_hashes: Sequence of hash objects of the same hash shape.

    Returns:
        The minimum total number of differing bits over all windows of
        ``len(query_hashes)`` consecutive candidate hashes, or
        ``float('inf')`` when the candidate is shorter than the query (no
        window exists, including an empty candidate).

    Raises:
        ValueError: If ``query_hashes`` is empty, or if the query and
            candidate hashes do not share the same bit-array shape.
    """
    n = len(query_hashes)
    if n == 0:
        raise ValueError("query_hashes must contain at least one hash")

    window_count = len(candidate_hashes) - n + 1
    if window_count <= 0:
        # Candidate is too short to contain the query: nothing to compare.
        return float('inf')

    # Stack the per-frame bit matrices into (frames, rows, cols) boolean arrays.
    query_bits = np.stack([np.asarray(h.hash, dtype=bool) for h in query_hashes])
    candidate_bits = np.stack([np.asarray(h.hash, dtype=bool) for h in candidate_hashes])

    if query_bits.shape[1:] != candidate_bits.shape[1:]:
        raise ValueError(
            "query and candidate hashes have different shapes: "
            f"{query_bits.shape[1:]} vs {candidate_bits.shape[1:]}"
        )

    min_diff = float('inf')
    for offset in range(window_count):
        window = candidate_bits[offset:offset + n]
        # Total Hamming distance of this alignment: count of unequal bits.
        current_diff = np.count_nonzero(query_bits != window)
        if current_diff < min_diff:
            min_diff = current_diff
            if min_diff == 0:
                break  # A perfect match cannot be improved upon.

    return min_diff


def find_source_video(query_video_path, json_file_path='perceptualHash.json'):
    """Identify which pre-hashed candidate video a query clip most likely comes from.

    The query video is decoded and hashed frame by frame, then compared
    against every candidate's precomputed hash sequence. The candidate with
    the smallest difference reported by ``compare_hashes`` wins.

    Args:
        query_video_path: Path of the query video file.
        json_file_path: Path of a JSON file mapping each candidate video
            name to a list of hex-encoded hash strings. Defaults to
            ``'perceptualHash.json'`` in the current working directory.

    Returns:
        The key (video name) of the best-matching candidate, or ``None``
        when no candidate produced a finite difference (for example, when
        the JSON file contains no entries).

    Raises:
        ValueError: If no frames could be read from the query video.
    """
    query_frames = extract_frames(query_video_path)
    query_hashes = calculate_hashes(query_frames)

    print(len(query_frames))
    print(len(query_hashes))

    if not query_hashes:
        # Fail with a clear message instead of an obscure error further down.
        raise ValueError(f"No frames could be read from {query_video_path!r}")

    with open(json_file_path, 'r', encoding='utf-8') as file:
        precomputed_hashes = json.load(file)

    best_match = None
    smallest_diff = float('inf')

    # Compare the query hashes against each candidate video's stored hashes.
    for video_file, candidate_hash_strings in precomputed_hashes.items():
        print("Checking " + video_file)
        candidate_hashes = [imagehash.hex_to_hash(h_str) for h_str in candidate_hash_strings]

        diff = compare_hashes(query_hashes, candidate_hashes)
        # Strict '<' keeps the first candidate encountered on ties.
        if diff < smallest_diff:
            smallest_diff = diff
            best_match = video_file

    return best_match



# Not used by this cell; presumably the folder holding the original videos.
# videos_folder = '../../dataset/originals'
# Query clip to identify. The lookup below prints progress for each candidate
# and the final line reports the best-matching candidate name.
query_video_path = '../../dataset/Tests/video7_1_modified.mp4'
source_video = find_source_video(query_video_path)
print(f"The query video is most likely from: {source_video}")


600
600
Checking video1.mp4
Checking video10.mp4
Checking video11.mp4
Checking video12.mp4
Checking video13.mp4
Checking video14.mp4
Checking video15.mp4
Checking video16.mp4
Checking video17.mp4
Checking video18.mp4
Checking video19.mp4
Checking video2.mp4
Checking video20.mp4
Checking video3.mp4
Checking video4.mp4
Checking video5.mp4
Checking video6.mp4
Checking video7.mp4
Checking video8.mp4
Checking video9.mp4
The query video is most likely from: video7.mp4
