<a href="https://colab.research.google.com/github/TanmayAmte/FYP/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
pip install mediapipe


Collecting mediapipe
  Downloading mediapipe-0.10.21-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting numpy<2 (from mediapipe)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.3-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of jax to determine which version is compatible with other requirements. This could take a while.
Collecting jax (from mediapipe)
  Downloading jax-0.8.0-py3-none-any.whl.metadata (13 kB)
Collecting jaxlib (from mediapipe)
  Downloading jaxlib-0.8.0-cp312-cp312-manylinux_2_27_x86_64.whl.metadata (1.3 kB)
Collecting jax (from mediapipe)
  Do

In [2]:
pip install -U openai-whisper

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m399.4/803.2 kB[0m [31m12.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=c1adc307577c5a5f092bd7c170eee8659ac3dc7a3ccfd123722524738612b641
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d1

In [3]:
import cv2
import moviepy.editor as mp
import librosa
import numpy as np
import mediapipe as mp_solutions
import whisper
import os

# Module-level shortcuts to the two MediaPipe solutions used below
mp_face_mesh, mp_pose = (
    mp_solutions.solutions.face_mesh,
    mp_solutions.solutions.pose,
)

class Preprocessor:
    """Splits one video file into the raw inputs used for later analysis.

    The class can extract the audio track to a WAV file, load that audio as a
    numpy array, transcribe it with Whisper, and collect MediaPipe face/pose
    landmarks from sampled video frames.
    """

    def __init__(self, video_path, audio_path="temp_audio.wav"):
        """
        Opens the video and remembers where the temporary audio will be written.

        Args:
            video_path: Path of the video file to process.
            audio_path: Destination of the extracted audio file. Defaults to
                "temp_audio.wav" in the current working directory.

        Raises:
            ValueError: If OpenCV cannot open the video file.
        """
        self.video_path = video_path
        self.audio_path = audio_path
        self.video_capture = cv2.VideoCapture(video_path)

        # Video valid check
        if not self.video_capture.isOpened():
            # Free the handle before reporting the failure.
            self.video_capture.release()
            raise ValueError("Error opening video file")

    def extract_audio(self):
        """
        Writes the video's audio track to ``self.audio_path``.

        Returns:
            The audio file path on success, or None if extraction failed
            (the failure reason is printed, not raised).
        """
        print(f"Processing Audio from: {self.video_path}...")
        video_clip = None
        try:
            video_clip = mp.VideoFileClip(self.video_path)

            # A clip without an audio track has audio set to None; report that
            # explicitly instead of surfacing an opaque attribute error.
            if video_clip.audio is None:
                print("Audio extraction failed: video has no audio track")
                return None

            # Write audio to a temporary file (16-bit PCM WAV is standard for ML)
            video_clip.audio.write_audiofile(self.audio_path, verbose=False, logger=None)
            print("Audio extracted successfully.")
            return self.audio_path
        except Exception as e:
            # Best effort by design: callers check for a None return.
            print(f"Audio extraction failed: {e}")
            return None
        finally:
            # Always release the reader resources held by the clip.
            if video_clip is not None:
                video_clip.close()

    def load_audio_data(self):
        """
        Loads the extracted audio file into a numpy array for analysis.
        Returns:
            y: Audio time series (None if the audio file does not exist)
            sr: Sampling rate (None if the audio file does not exist)
        """
        if not os.path.exists(self.audio_path):
            print("Audio file not found. Run extract_audio() first.")
            return None, None

        # Librosa loads audio as a float array. sr=None preserves original sampling rate.
        y, sr = librosa.load(self.audio_path, sr=None)
        return y, sr

    def transcribe_audio(self, model_size="base"):
        """
        Uses OpenAI Whisper to transcribe the audio at ``self.audio_path``.

        Args:
            model_size (str): Name of the Whisper model to load.

        Returns : dictionary with the full transcript under "text" and
        Whisper's timestamped segments under "segments".
        """
        print(f"Transcribing audio using Whisper ({model_size} model)...")
        model = whisper.load_model(model_size)

        # Transcribe with word_timestamps=True for pace analysis later
        result = model.transcribe(self.audio_path, word_timestamps=True)

        print("Transcription complete.")
        return {
            "text": result["text"],
            "segments": result["segments"] # Timestamped segments from Whisper
        }

    def process_video_frames(self, frame_skip=5):
        """
        Extracts visual landmarks from video frames.

        The capture is released when this method finishes (even on error) and
        is reopened automatically if the method is called again.

        Args:
            frame_skip (int): Process every Nth frame to save computation time.
                Must be at least 1.

        Returns:
            List of dictionaries, one per processed frame, with the keys
            "frame_index", "timestamp" (seconds), "face_landmarks" and
            "pose_landmarks" (each None when nothing was detected).

        Raises:
            ValueError: If frame_skip is less than 1, or the video cannot be
                reopened.
        """
        # Validate first: a zero step would otherwise crash with a
        # ZeroDivisionError in the modulo below.
        if frame_skip < 1:
            raise ValueError("frame_skip must be a positive integer")

        print("Processing video frames for landmarks...")

        # The capture is released at the end of every run, so reopen it when
        # this method is called again instead of silently returning no frames.
        if not self.video_capture.isOpened():
            self.video_capture = cv2.VideoCapture(self.video_path)
            if not self.video_capture.isOpened():
                raise ValueError("Error opening video file")

        frame_data = []
        frame_count = 0

        try:
            # Setup MediaPipe instances
            with mp_face_mesh.FaceMesh(
                static_image_mode=False,
                max_num_faces=1,
                refine_landmarks=True,
                min_detection_confidence=0.5
            ) as face_mesh, \
            mp_pose.Pose(
                static_image_mode=False,
                min_detection_confidence=0.5
            ) as pose_detector:

                while True:
                    ret, frame = self.video_capture.read()
                    if not ret: #Video end
                        break

                    # Skip frames to optimize speed
                    if frame_count % frame_skip != 0:
                        frame_count += 1
                        continue

                    # Convert BGR (OpenCV default) to RGB (MediaPipe requirement)
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                    # 1. Detect Face Landmarks
                    face_results = face_mesh.process(frame_rgb)

                    # 2. Detect Body Pose
                    pose_results = pose_detector.process(frame_rgb)

                    # Every sampled frame gets an entry; landmark fields stay
                    # None when the detector found nothing.
                    frame_info = {
                        "frame_index": frame_count,
                        # Capture position in milliseconds, converted to seconds
                        "timestamp": self.video_capture.get(cv2.CAP_PROP_POS_MSEC) / 1000.0,
                        "face_landmarks": None,
                        "pose_landmarks": None
                    }

                    if face_results.multi_face_landmarks:
                        # We only take the first face detected
                        frame_info["face_landmarks"] = face_results.multi_face_landmarks[0]

                    if pose_results.pose_landmarks:
                        frame_info["pose_landmarks"] = pose_results.pose_landmarks

                    frame_data.append(frame_info)
                    frame_count += 1
        finally:
            # Release the video handle even if a detector raised mid-run.
            self.video_capture.release()

        print(f"Video processing complete. Analyzed {len(frame_data)} frames.")
        return frame_data

    def cleanup(self):
        """Deletes the temporary audio file if it exists."""
        if os.path.exists(self.audio_path):
            os.remove(self.audio_path)
            print("Temporary files cleaned up.")


if __name__ == "__main__":
    # Replace with your actual video file path
    VIDEO_FILE = "input_video.mp4"

    # Without an input video there is nothing to process; just tell the user.
    if not os.path.exists(VIDEO_FILE):
        print(f"Please provide a valid video file at {VIDEO_FILE}")
    else:
        processor = Preprocessor(VIDEO_FILE)
        try:
            # 1. Extract Audio (returns None when extraction fails)
            if processor.extract_audio() is not None:
                # 2. Get Audio Data (for Librosa analysis later)
                y, sr = processor.load_audio_data()
                if y is not None:
                    print(f"Audio Data Shape: {y.shape}, Sample Rate: {sr}")

                # 3. Get Transcript (Text Data)
                transcript_data = processor.transcribe_audio()
                print(f"Transcript: {transcript_data['text'][:100]}...") # Print first 100 chars
            else:
                # The audio steps all depend on the extracted file, so skip them.
                print("Skipping audio analysis and transcription: no audio was extracted.")

            # 4. Get Visual Landmarks (Video Data)
            visual_data = processor.process_video_frames(frame_skip=10) # Process every 10th frame
            if visual_data:
                print(f"First frame data keys: {visual_data[0].keys()}")
            else:
                print("No video frames were processed.")
        finally:
            # Cleanup runs even if one of the steps above raised.
            processor.cleanup()

Please provide a valid video file at input_video.mp4
