<a href="https://colab.research.google.com/github/Sibikrish3000/video-mining/blob/master/compress_videos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
#@markdown <br><center><img src='https://upload.wikimedia.org/wikipedia/commons/thumb/d/da/Google_Drive_logo.png/600px-Google_Drive_logo.png' height="50" alt="Gdrive-logo"/></center>
#@markdown <center><h3>Mount Gdrive to /content/drive</h3></center><br>
MODE = "MOUNT" #@param ["MOUNT", "UNMOUNT"]
# Mount or unmount Google Drive at /content/drive, depending on MODE.
from google.colab import drive
# NOTE(review): this sets a private attribute on drive.mount; presumably it
# silences debug output -- confirm against the google.colab version in use.
drive.mount._DEBUG = False
if MODE == "MOUNT":
  # force_remount=True requests a fresh mount even if /content/drive is already mounted.
  drive.mount('/content/drive', force_remount=True)
elif MODE == "UNMOUNT":
  try:
    drive.flush_and_unmount()
  except ValueError:
    # Deliberate best-effort: a ValueError from the unmount is ignored
    # (presumably raised when nothing is mounted -- TODO confirm).
    pass
  # Delete the DriveFS configuration directory through the shell after unmounting.
  get_ipython().system_raw("rm -rf /root/.config/Google/DriveFS")

Mounted at /content/drive


In [15]:
#@title 2. Configure Your Video Folder and Compression Settings
import os
from pathlib import Path

#@markdown ---
#@markdown ### 📁 **Target Folder Path**
#@markdown Enter the path to the folder inside your Google Drive that contains the videos.
DRIVE_FOLDER_PATH = "My Drive/PENDING_REVIEW_FOLDER_5" #@param {type:"string"}

#@markdown ---
#@markdown ### ⚙️ **Compression Settings**
#@markdown Set the target file size for your compressed videos.
TARGET_SIZE_MB = 5 #@param {type:"number"}

#@markdown ---
#@markdown ### 🚀 **Performance Settings**
#@markdown `MAX_WORKERS` controls how many videos are processed at the same time. A good starting point is 2 times the number of CPU cores.
MAX_WORKERS = 8 #@param {type:"slider", min:1, max:16, step:1}


# --- Don't edit below this line ---
def resolve_drive_folder(folder_path: str) -> Path:
    """Turn the user-supplied folder string into a path under /content/drive.

    Args:
        folder_path: Value of the ``DRIVE_FOLDER_PATH`` form field. Anything up
            to and including the first ``"My Drive/"`` is treated as the Drive
            root; any other value is joined onto ``/content/drive`` as-is.

    Returns:
        The resolved directory path. Its existence is not checked here.
    """
    mount_point = Path("/content/drive")
    cleaned = folder_path.strip()
    # partition() never raises, unlike split(...)[1], which failed with an
    # IndexError for inputs such as "My Drive" (no trailing slash).
    _before, marker, relative_path = cleaned.partition("My Drive/")
    if marker:
        return mount_point / "My Drive" / relative_path.lstrip("/")
    return mount_point / cleaned


if TARGET_SIZE_MB <= 0:
    # A zero or negative target would produce a zero/negative bitrate downstream.
    raise ValueError(f"TARGET_SIZE_MB must be greater than 0, got {TARGET_SIZE_MB}")

TARGET_DIR = resolve_drive_folder(DRIVE_FOLDER_PATH)

# Files larger than this many bytes are selected for compression.
FILE_SIZE_THRESHOLD = TARGET_SIZE_MB * 1024 * 1024
# Aim slightly (2%) below the requested size when computing the bitrate.
TARGET_SIZE_BUFFER_MB = TARGET_SIZE_MB * 0.98

print("✅ Configuration Loaded:")
print(f"   - Target Directory: {TARGET_DIR}")
if not TARGET_DIR.is_dir():
    print("   - ❌ ERROR: The specified directory does not exist. Please check the path.")
else:
    print("   - ✅ Directory found.")
print(f"   - Compressing files larger than: {TARGET_SIZE_MB} MB")
print(f"   - Parallel Workers: {MAX_WORKERS}")

✅ Configuration Loaded:
   - Target Directory: /content/drive/My Drive/PENDING_REVIEW_FOLDER_5
   - ✅ Directory found.
   - Compressing files larger than: 5 MB
   - Parallel Workers: 8


In [3]:
#@title 3. Run the Batch Compressor (GPU ACCELERATED - MAX PERFORMANCE)
#@markdown This version offloads encoding to the GPU for a massive speed increase.
#@markdown **IMPORTANT:** Ensure you have enabled the T4 GPU via "Runtime" -> "Change runtime type".
import concurrent.futures
import glob
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

import cv2

def get_video_duration(video_path: Path) -> float:
    """Return the duration of a video in seconds, or 0.0 if it is unknown.

    The duration is computed as frame count divided by frames per second, both
    read through OpenCV's ``VideoCapture``.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        The duration in seconds, or ``0.0`` when the file cannot be opened,
        reports a non-positive FPS or frame count, or any exception occurs.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(str(video_path))
        if not cap.isOpened():
            return 0.0
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Written as "not (a > 0 and b > 0)" so NaN values also fall through to 0.0.
        if not (fps > 0 and frame_count > 0):
            return 0.0
        return frame_count / fps
    except Exception:
        # Best-effort probe: callers treat 0.0 as "could not determine duration".
        return 0.0
    finally:
        # Always release the capture handle, including on the early returns above.
        if cap is not None:
            cap.release()

def compress_video_gpu(video_path_on_drive: Path):
    """
    Compress a SINGLE video file with the GPU hardware encoder and replace the original.

    The file is copied into a private scratch directory under /content, encoded
    with two ffmpeg invocations (``-pass 1`` then ``-pass 2``) using
    ``h264_nvenc`` at a bitrate derived from ``TARGET_SIZE_BUFFER_MB`` and the
    video duration, and the result is moved over ``video_path_on_drive``.

    Each job gets its own scratch directory so that parallel workers handling
    files with the same name (from different sub-folders) or the same stem
    (e.g. ``clip.mp4`` and ``clip.mov``) cannot overwrite each other's files.

    Args:
        video_path_on_drive: Path of the source video; it is overwritten on success.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions are
        caught and printed rather than propagated.
    """
    thread_name = f"Worker for {video_path_on_drive.name}"
    print(f"[{thread_name}] Starting GPU job.")

    try:
        # The scratch directory (input copy, output, ffmpeg pass logs) is removed
        # automatically when the ``with`` block exits, on success or failure.
        with tempfile.TemporaryDirectory(prefix="gpu_compress_", dir="/content") as scratch:
            work_dir = Path(scratch)

            # STEP 1: Copy to local storage (still important for I/O speed)
            local_video_path = work_dir / video_path_on_drive.name
            shutil.copyfile(video_path_on_drive, local_video_path)

            duration = get_video_duration(local_video_path)
            if duration <= 0:
                print(f"[{thread_name}] ⚠️  Skipping: Could not determine duration.")
                return

            # Whole size budget (in bits) spread over the duration -> bits per second.
            target_bits = TARGET_SIZE_BUFFER_MB * 1024 * 1024 * 8
            target_bitrate = int(target_bits / duration)

            local_output_path = work_dir / f"{local_video_path.stem}_compressed.mp4"
            log_file_prefix = str(work_dir / f"{local_video_path.stem}_ffmpeg_log")

            # STEP 2: Process locally using the GPU hardware encoder (h264_nvenc)
            encoder = 'h264_nvenc'
            # NVENC presets are different. 'p1' is fastest, 'p7' is slowest/best quality. 'p2' or 'p3' is great.
            preset = 'p2'

            # NOTE(review): '-an' drops the audio track from the final file, and the
            # output is always an MP4 container even when the original name ends in
            # .mov/.mkv/.avi/.webm -- confirm both are intended.
            print(f"   - [{thread_name}] Starting GPU Pass 1 (encoder: {encoder}, preset: {preset})...")
            pass1_command = [
                'ffmpeg', '-y', '-nostdin', '-i', str(local_video_path),
                '-c:v', encoder, '-b:v', str(target_bitrate),
                '-pass', '1', '-preset', preset, '-an', '-f', 'mp4',
                '-passlogfile', log_file_prefix,
                os.devnull
            ]
            result1 = subprocess.run(pass1_command, capture_output=True, text=True, check=False)
            if result1.returncode != 0:
                print(f"   - [{thread_name}] ❌ FAIL: FFmpeg GPU Pass 1 failed. Error: {result1.stderr}")
                return

            print(f"   - [{thread_name}] Starting GPU Pass 2...")
            # '-y' and '-nostdin' keep ffmpeg from ever waiting on an interactive
            # prompt, which would hang a background worker indefinitely.
            pass2_command = [
                'ffmpeg', '-y', '-nostdin', '-i', str(local_video_path),
                '-c:v', encoder, '-b:v', str(target_bitrate),
                '-pass', '2', '-preset', preset, '-an',
                '-passlogfile', log_file_prefix,
                str(local_output_path)
            ]
            result2 = subprocess.run(pass2_command, capture_output=True, text=True, check=False)
            if result2.returncode != 0:
                print(f"   - [{thread_name}] ❌ FAIL: FFmpeg GPU Pass 2 failed. Error: {result2.stderr}")
                return

            # STEP 3: Replace the file on Google Drive
            if local_output_path.exists():
                compressed_size = local_output_path.stat().st_size
                shutil.move(local_output_path, video_path_on_drive)
                print(f"[{thread_name}] ✅ SUCCESS: New size is {compressed_size / 1024**2:.2f} MB. Original replaced.")
            else:
                print(f"[{thread_name}] ❌ FAIL: Compressed file was not created.")

    except Exception as e:
        # Keep one failing video from aborting the whole batch.
        print(f"[{thread_name}] ❌ An unexpected error occurred: {e}")


In [4]:
# --- Main execution logic with GPU Check ---
def run_batch_compression():
    """Find oversized videos under ``TARGET_DIR`` and compress them in parallel.

    Steps:
        1. Run ``nvidia-smi`` to confirm a GPU is available; stop if it is not.
        2. Recursively scan ``TARGET_DIR`` for video files (matched by
           extension, case-insensitively) larger than ``FILE_SIZE_THRESHOLD``.
        3. Hand every match to ``compress_video_gpu`` on a thread pool of
           ``MAX_WORKERS`` workers.

    Returns:
        None. All status is reported with ``print``.
    """
    # === CHECK FOR GPU BEFORE STARTING ===
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        print("❌ NVIDIA GPU NOT FOUND! ❌")
        print("Please enable a GPU runtime via 'Runtime' -> 'Change runtime type' -> 'T4 GPU' and run this cell again.")
        return

    print("✅ NVIDIA GPU Detected. Hardware acceleration will be used.")
    # Line index 8 of the nvidia-smi table is printed to show the GPU details.
    # The layout can differ between driver versions, so guard the index instead
    # of letting an IndexError abort the run.
    gpu_details_line = 8
    smi_lines = result.stdout.split('\n')
    if len(smi_lines) > gpu_details_line:
        print(smi_lines[gpu_details_line])

    if not TARGET_DIR.is_dir():
        print(f"ERROR: The target directory '{TARGET_DIR}' was not found. Please check your path in Cell 2.")
        return

    print(f"\n🚀 Starting batch compression in '{TARGET_DIR}' using {MAX_WORKERS} parallel workers...")
    print("-" * 50)
    video_extensions = (".mp4", ".mov", ".mkv", ".avi", ".webm")
    video_files_to_process = []
    print("Scanning for large files...")
    # One traversal with a case-insensitive extension check: this also picks up
    # names such as CLIP.MP4 and avoids walking the tree once per extension.
    for video_path in TARGET_DIR.rglob("*"):
        if not video_path.name.lower().endswith(video_extensions):
            continue
        try:
            if video_path.stat().st_size > FILE_SIZE_THRESHOLD:
                video_files_to_process.append(video_path)
        except Exception as e:
            print(f"Could not stat file {video_path.name}. Error: {e}")
    if not video_files_to_process:
        print("No video files larger than the threshold were found.")
        return
    print(f"Found {len(video_files_to_process)} videos to compress. Starting parallel GPU processing...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # list() drains the iterator so every job has finished before continuing.
        list(executor.map(compress_video_gpu, video_files_to_process))

    print("-" * 50)
    print("🎉 Batch compression complete! All GPU jobs have been processed.")

In [16]:
# Run the main function: it checks for a GPU, scans TARGET_DIR for files above
# FILE_SIZE_THRESHOLD and compresses them using MAX_WORKERS parallel workers.
run_batch_compression()

✅ NVIDIA GPU Detected. Hardware acceleration will be used.
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |

🚀 Starting batch compression in '/content/drive/My Drive/PENDING_REVIEW_FOLDER_5' using 8 parallel workers...
--------------------------------------------------
Scanning for large files...
No video files larger than the threshold were found.
