In [1]:
pip install transformers torchaudio ffmpeg-python moviepy pytesseract opencv-python numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install torchvision



In [None]:
# Check and install Tesseract OCR
import os
import subprocess

def check_install_tesseract():
    try:
        # Check if tesseract is installed
        subprocess.run(['tesseract', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print("Tesseract OCR is already installed")
    except FileNotFoundError:
        print("Tesseract OCR is not installed. Installing...")
        
        # Install for Linux
        try:
            # Check package manager
            if subprocess.run(['which', 'apt-get'], stdout=subprocess.PIPE, stderr=subprocess.PIPE).returncode == 0:
                # Debian/Ubuntu based
                os.system('sudo apt-get update && sudo apt-get install -y tesseract-ocr')
            elif subprocess.run(['which', 'yum'], stdout=subprocess.PIPE, stderr=subprocess.PIPE).returncode == 0:
                # RedHat/CentOS based
                os.system('sudo yum install -y tesseract')
            elif subprocess.run(['which', 'pacman'], stdout=subprocess.PIPE, stderr=subprocess.PIPE).returncode == 0:
                # Arch Linux based
                os.system('sudo pacman -S tesseract')
            else:
                print("Unsupported Linux distribution. Please install Tesseract OCR manually.")
        except Exception as e:
            print(f"Error installing Tesseract: {e}")
            print("Please install Tesseract OCR manually")

# Run the check and installation
check_install_tesseract()

In [2]:
import os
import cv2
import torch
import ffmpeg
import numpy as np
import pytesseract
import torchaudio
import moviepy
from transformers import pipeline
from moviepy.video.io import *



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
pip install google-generativeai

Note: you may need to restart the kernel to use updated packages.


In [None]:
import google.generativeai as genai

# Set up Gemini API key
#genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
genai.configure(api_key='')

In [None]:
# Load Hugging Face Speech-to-Text Model
#stt_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
stt_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3-turbo")


pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/Cellar/tesseract/5.5.0/bin/tesseract'

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0


In [None]:
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import subprocess
import os
from tqdm import tqdm
import logging

def run_ffmpeg_command(command):
    """Execute ffmpeg command and handle its output"""
    try:
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True
        )
        stdout, stderr = process.communicate()
        
        if process.returncode != 0:
            raise Exception(f"FFmpeg error: {stderr}")
        
        return True
    except Exception as e:
        print(f"Error running FFmpeg command: {str(e)}")
        return False

def split_video(video_path, output_prefix="chunk", num_chunks=4, output_dir=None):
    """
    Split a video into equal-length chunks.
    
    Args:
        video_path (str): Path to the input video file
        output_prefix (str): Prefix for output chunk files
        num_chunks (int): Number of chunks to split the video into
        output_dir (str): Directory to save chunks (defaults to same as input video)
    """
    try:
        # Input validation
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"Video file not found: {video_path}")
            
        # Setup output directory
        if output_dir is None:
            output_dir = os.path.dirname(video_path)
        os.makedirs(output_dir, exist_ok=True)
        
        # Load the video
        print("Loading video...")
        video = VideoFileClip(video_path)
        total_duration = video.duration
        chunk_duration = math.ceil(total_duration / num_chunks)
        
        # Create chunks
        chunks_info = []
        start = 0
        
        print(f"\nSplitting video into {num_chunks} chunks...")
        for chunk_index in tqdm(range(1, num_chunks + 1)):
            try:
                end = min(start + chunk_duration, total_duration)
                chunk_filename = os.path.join(output_dir, f"{output_prefix}_{chunk_index}.mp4")
                
                # FFmpeg command for extracting subclip
                command = [
                    'ffmpeg',
                    '-i', video_path,
                    '-ss', str(start),
                    '-t', str(end - start),
                    '-c', 'copy',  # Copy codec to speed up processing
                    '-y',  # Overwrite output file if exists
                    chunk_filename
                ]
                
                if run_ffmpeg_command(command):
                    chunks_info.append({
                        'filename': os.path.basename(chunk_filename),
                        'start': start,
                        'end': end,
                        'duration': end - start
                    })
                
                start += chunk_duration
                
            except Exception as e:
                print(f"\nError processing chunk {chunk_index}: {str(e)}")
            continue

    finally:
        # Clean up
        if 'video' in locals():
            video.close()
            
    # Print summary
    print("\nProcessing complete!")
    print(f"Total chunks created: {len(chunks_info)}")
    for info in chunks_info:
        print(f"- {info['filename']}: {info['duration']:.2f} seconds")
    
    return chunks_info

if __name__ == "__main__":
    video_path = "lecture2.mp4"
    output_dir = "video_chunks"
    chunks = split_video(video_path, output_dir=output_dir)
    print(chunks)

In [6]:
from moviepy.video.io.VideoFileClip import VideoFileClip
import soundfile as sf



def extractAudio(videoPath, audio_path='extracted_audio.wav'):
    video = VideoFileClip(videoPath)
    video.audio.write_audiofile(audio_path, codec='pcm_s16le')
    return audio_path

def transcribeAudio(audio_path, model_size='base'):
    audio, sampling_rate = torchaudio.load(audio_path)
    transciption = stt_pipeline({'array': audio[0].numpy(), 'sampling_rate': sampling_rate})
    return transciption['text']

def extract_text_from_frames(videoPath, frame_interval=0):
    cap = cv2.VideoCapture(videoPath)
    frame_count = 0
    extracted_texts = []
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        if frame_interval > 0 and frame_count % frame_interval == 0:  # Process every Nth frame
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            text = pytesseract.image_to_string(gray)
            if text.strip():
                extracted_texts.append(f"Frame {frame_count}: {text.strip()}")

        frame_count += 1

    cap.release()
    return "\n".join(extracted_texts)

def refineWithGemini(text):
    model = genai.GenerativeModel('gemini-2.0-pro-exp-02-05')

    prompt = f"""Here is a raw transcription of a video. Your task is to refine it into a well-structured, human-like summary with explanations while keeping all the original details:
    
    --- RAW TRANSCRIPTION ---
    {text}

    Please rewrite it clearly with explanations where needed, ensuring it's easy to read and understand.
    """

    response = model.generate_content([prompt])
    return response.text if response else "No refined output generated."

In [None]:
pip install pydub

In [None]:

def main(video_path):
    """Runs the full pipeline."""

    print("[1/5] Extracting audio...")
    audio_path = extractAudio(video_path)

    print("[2/5] Transcribing audio with Hugging Face model...")
    transcript = transcribeAudio(audio_path)
    print("\nAudio Transcript:\n", transcript[:500], "...")  # Show only first 500 characters

    print("[3/5] Extracting on-screen text from video frames...")
    visual_text = extract_text_from_frames(video_path)
    print("\nOn-Screen Text:\n", visual_text[:500], "...")  # Show only first 500 characters

    print("[4/4] Combining results...")
    full_content = f"--- AUDIO TRANSCRIPTION ---\n{transcript}\n\n--- ON-SCREEN TEXT ---\n{visual_text}"

    print("[4/5] Refining text with Google Gemini...")
    refined_text = refineWithGemini(transcript + "\n\n" + visual_text)
    #refined_text = refineWithGemini(transcript)
    
    print("[5/5] Saving results...")
    full_content = f"--- REFINED CONTENT ---\n{refined_text}"
    
    with open("video_content.txt", "w", encoding="utf-8") as f:
        f.write(full_content)
    
    print("\nContent saved to 'video_content.txt'")
    return full_content

if __name__ == "__main__":
    video_file = 'video_chunks/chunk_1.mp4'
    content = main(video_file)

[1/5] Extracting audio...
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'encoder': 'Lavf61.7.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1920, 1200], 'bitrate': 133, 'fps': 25.0, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'handler_name': 'H.264/AVC video', 'vendor_id': '[0][0][0][0]', 'encoder': 'AVC Coding'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 32000, 'bitrate': 126, 'metadata': {'Metadata': '', 'handler_name': 'AAC audio', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 2, 'stream_type': 'data', 'language': 'eng', 'default': False, 'metadata': {'Metadata': '', 'handler_name': 'SubtitleHandler'}}], 'input_number': 0, 'chapters': [{'input_number': 0, 'chapter_number': 0, 'start'

                                                                        

MoviePy - Done.
[2/5] Transcribing audio with Hugging Face model...


RuntimeError: MPS backend out of memory (MPS allocated: 13.70 GB, other allocations: 8.41 MB, max allowed: 18.13 GB). Tried to allocate 5.83 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).