In [8]:
import os
import shutil
import cv2
import numpy as np
import statistics

In [None]:
def get_video_hash(video_path, num_frames=10):
    """
    Generate a perceptual hash of the video by sampling frames.
    
    Args:
    video_path (str): Path to the video file
    num_frames (int): Number of frames to sample for hashing
    
    Returns:
    str: Perceptual hash of the video
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None
    
    # Get total number of frames
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    # Sample frames evenly across the video
    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    
    # Collect sampled frames
    sampled_frames = []
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            # Convert to grayscale and resize
            gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            resized_frame = cv2.resize(gray_frame, (8, 8), interpolation=cv2.INTER_AREA)
            
            # Compute the mean of the frame
            avg = np.mean(resized_frame)
            
            # Create binary fingerprint based on comparison to mean
            binary_frame = (resized_frame > avg).astype(int)
            
            sampled_frames.append(binary_frame.tobytes())
    
    cap.release()
    
    # Generate a hash based on sampled frames
    return hash(tuple(sampled_frames))

def find_unique_videos(videos_dir, performance_dir):
    # Collect all video files
    video_files = [f for f in os.listdir(videos_dir) if f.lower().endswith(('.mp4', '.avi', '.mov'))]
    
    # Dictionary to track unique videos
    unique_videos = {}
    
    # Compute hashes and collect performance scores
    for video_file in video_files:
        video_path = os.path.join(videos_dir, video_file)
        video_hash = get_video_hash(video_path)
        
        # Try to read performance score
        try:
            with open(os.path.join(performance_dir, video_file), 'r') as f:
                score = float(f.read().strip())
        except (FileNotFoundError, ValueError):
            print(f"No performance score found for {video_file}")
            continue
        
        # Track unique videos and their performances
        if video_hash not in unique_videos:
            unique_videos[video_hash] = {
                'files': [video_file],
                'performances': [score]
            }
        else:
            unique_videos[video_hash]['files'].append(video_file)
            unique_videos[video_hash]['performances'].append(score)
    
    return unique_videos

def process_unique_videos(base_path):
    # Define input and output directories
    videos_dir = os.path.join(base_path, 'videos')
    performance_dir = os.path.join(base_path, 'performance')
    
    # Create output directories
    unique_videos_dir = os.path.join(base_path, 'unique_videos')
    unique_performance_dir = os.path.join(base_path, 'unique_performances')
    os.makedirs(unique_videos_dir, exist_ok=True)
    os.makedirs(unique_performance_dir, exist_ok=True)
    
    # Find unique videos
    unique_videos = find_unique_videos(videos_dir, performance_dir)
    
    # Process unique videos
    for video_hash, video_data in unique_videos.items():
        # Choose first file as representative
        representative_file = video_data['files'][0]
        
        # Calculate average performance
        avg_performance = round(statistics.mean(video_data['performances']), 2)
        
        # Copy unique video to unique_videos directory
        src_video_path = os.path.join(videos_dir, representative_file)
        dst_video_path = os.path.join(unique_videos_dir, representative_file)
        shutil.copy2(src_video_path, dst_video_path)
        
        # Write average performance to unique_performances directory
        performance_path = os.path.join(unique_performance_dir, representative_file)
        with open(performance_path, 'w') as f:
            f.write(str(avg_performance))
        
        # Print information about unique video
        print(f"Unique Video: {representative_file}")
        print(f"  Duplicate files: {video_data['files']}")
        print(f"  Average Performance: {avg_performance}")
        print("---")

def main():
    # Use current directory as base path
    base_path = os.getcwd()
    process_unique_videos(base_path)
    print("Unique video processing complete.")

if __name__ == "__main__":
    main()

In [2]:
import os
import cv2
import numpy as np
import hashlib
from pathlib import Path
from collections import defaultdict

# Define paths
BASE_PATH = os.getcwd()
VIDEOS_FOLDER = BASE_PATH + "/videos"
PERFORMANCE_FOLDER = BASE_PATH + "/performance"
UNIQUE_VIDEOS_FOLDER = BASE_PATH + "/unique_videos"
UNIQUE_PERFORMANCE_FOLDER = BASE_PATH + "/unique_performances"

# Create output folders if they don't exist
os.makedirs(UNIQUE_VIDEOS_FOLDER, exist_ok=True)
os.makedirs(UNIQUE_PERFORMANCE_FOLDER, exist_ok=True)

# Helper function to compute hash for frames
def compute_video_hash(video_path):
    cap = cv2.VideoCapture(video_path)
    hasher = hashlib.md5()  # Hash function
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Resize frame for consistent hashing
        resized_frame = cv2.resize(frame, (256, 256), interpolation=cv2.INTER_AREA)
        hasher.update(resized_frame.tobytes())
    cap.release()
    return hasher.hexdigest()

# Map to store video hash and performance scores
video_hash_map = defaultdict(list)  # {hash: [(video_path, score)]}

# Process videos and performance files
for video_file in os.listdir(VIDEOS_FOLDER):
    video_path = os.path.join(VIDEOS_FOLDER, video_file)
    performance_file = os.path.join(PERFORMANCE_FOLDER, f"{Path(video_file).stem}.txt")
    
    # Skip if performance file doesn't exist
    if not os.path.exists(performance_file):
        print(f"Performance file missing for {video_file}. Skipping...")
        continue
    
    # Read performance score
    with open(performance_file, "r") as pf:
        score = float(pf.read().strip())
    
    # Compute hash for the video
    video_hash = compute_video_hash(video_path)
    
    # Add to hash map
    video_hash_map[video_hash].append((video_path, score))

# Handle unique videos and performance scores
for video_hash, videos in video_hash_map.items():
    # If multiple videos share the same hash
    if len(videos) > 1:
        print(f"Duplicate videos found: {[v[0] for v in videos]}")
        # Take the first video and average performance scores
        first_video, _ = videos[0]
        avg_score = np.mean([score for _, score in videos])
        
        # Copy the video to unique_videos
        unique_video_path = os.path.join(UNIQUE_VIDEOS_FOLDER, Path(first_video).name)
        if not os.path.exists(unique_video_path):
            os.rename(first_video, unique_video_path)
        
        # Save average performance score
        unique_performance_path = os.path.join(UNIQUE_PERFORMANCE_FOLDER, f"{Path(first_video).stem}.txt")
        with open(unique_performance_path, "w") as upf:
            upf.write(f"{avg_score:.2f}")
    else:
        # Single video is unique
        video_path, score = videos[0]
        
        # Copy the video to unique_videos
        unique_video_path = os.path.join(UNIQUE_VIDEOS_FOLDER, Path(video_path).name)
        if not os.path.exists(unique_video_path):
            os.rename(video_path, unique_video_path)
        
        # Save performance score
        unique_performance_path = os.path.join(UNIQUE_PERFORMANCE_FOLDER, f"{Path(video_path).stem}.txt")
        with open(unique_performance_path, "w") as upf:
            upf.write(f"{score:.2f}")


Duplicate videos found: ['/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-906909830682725', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-924585008982148', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-891661008828094', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-689363059404444', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-987232088920289', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-999607261342550', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-872222594086011', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-612555930

In [2]:
import os
import cv2
import numpy as np
import hashlib
from pathlib import Path
from collections import defaultdict
import shutil  # For copying files

# Define paths
BASE_PATH = os.getcwd()
VIDEOS_FOLDER = BASE_PATH + "/videos"
PERFORMANCE_FOLDER = BASE_PATH + "/performance"
UNIQUE_VIDEOS_FOLDER = BASE_PATH + "/unique_videos"
UNIQUE_PERFORMANCE_FOLDER = BASE_PATH + "/unique_performances"

# Create output folders if they don't exist
os.makedirs(UNIQUE_VIDEOS_FOLDER, exist_ok=True)
os.makedirs(UNIQUE_PERFORMANCE_FOLDER, exist_ok=True)

# Helper function to compute hash for frames
def compute_video_hash(video_path):
    cap = cv2.VideoCapture(video_path)
    hasher = hashlib.md5()  # Hash function
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Resize frame for consistent hashing
        resized_frame = cv2.resize(frame, (256, 256), interpolation=cv2.INTER_AREA)
        hasher.update(resized_frame.tobytes())
    cap.release()
    return hasher.hexdigest()

# Map to store video hash and performance scores
video_hash_map = defaultdict(list)  # {hash: [(video_path, score)]}

# Process videos and performance files
for video_file in os.listdir(VIDEOS_FOLDER):
    video_path = os.path.join(VIDEOS_FOLDER, video_file)
    performance_file = os.path.join(PERFORMANCE_FOLDER, f"{Path(video_file).stem}.txt")
    
    # Skip if performance file doesn't exist
    if not os.path.exists(performance_file):
        print(f"Performance file missing for {video_file}. Skipping...")
        continue
    
    # Read performance score
    with open(performance_file, "r") as pf:
        score = float(pf.read().strip())
    
    # Compute hash for the video
    video_hash = compute_video_hash(video_path)
    
    # Add to hash map
    video_hash_map[video_hash].append((video_path, score))

# Handle unique videos and performance scores
for video_hash, videos in video_hash_map.items():
    # If multiple videos share the same hash
    if len(videos) > 1:
        print(f"Duplicate videos found: {[v[0] for v in videos]}")
        # Take the first video and average performance scores
        first_video, _ = videos[0]
        avg_score = np.mean([score for _, score in videos])
        
        # Copy the video to unique_videos
        unique_video_path = os.path.join(UNIQUE_VIDEOS_FOLDER, Path(first_video).name)
        shutil.copy2(first_video, unique_video_path)
        
        # Save average performance score
        unique_performance_path = os.path.join(UNIQUE_PERFORMANCE_FOLDER, f"{Path(first_video).stem}.txt")
        with open(unique_performance_path, "w") as upf:
            upf.write(f"{avg_score:.2f}")
    else:
        # Single video is unique
        video_path, score = videos[0]
        
        # Copy the video to unique_videos
        unique_video_path = os.path.join(UNIQUE_VIDEOS_FOLDER, Path(video_path).name)
        shutil.copy2(video_path, unique_video_path)
        
        # Save performance score
        unique_performance_path = os.path.join(UNIQUE_PERFORMANCE_FOLDER, f"{Path(video_path).stem}.txt")
        with open(unique_performance_path, "w") as upf:
            upf.write(f"{score:.2f}")


Duplicate videos found: ['/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-954832972830686', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-797024995612413']
Duplicate videos found: ['/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-791572172961239', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-424802420355057', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-8008073409211713', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-803990118540094']
Duplicate videos found: ['/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer-Face-Metrics/Jupyter_notebooks/videos/hd-686083626588321', '/home/vishist/coding/Startup_Assignments/FuelGrowth/Influencer