<a href="https://colab.research.google.com/github/Sonjoy209164/1907073--KUET-Management-System/blob/main/vedio_segmentation_only_baised_on_vedio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import cv2
import moviepy.editor as mp
import os

# Function to detect a single valid face on screen
def is_single_face_on_screen(frame, face_cascade, min_face_size=100):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

    valid_faces = []
    for (x, y, w, h) in faces:
        if w > min_face_size and h > min_face_size:
            valid_faces.append((x, y, w, h))

    return len(valid_faces) == 1  # Return True if exactly one face is detected

# Function to extract segments when only one person is detected
def extract_segments_based_on_faces(video_path, output_dir, face_cascade, min_face_size=100, frame_skip=5, fps=30):
    # Open the video
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    video_fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Prepare the output directory
    os.makedirs(output_dir, exist_ok=True)
    print(f"Output directory: {output_dir} (created or already exists)")

    frame_number = 0
    current_segment = []
    segments = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print(f"End of video reached at frame {frame_number}.")
            break

        # Process every Nth frame for efficiency
        if frame_number % frame_skip == 0:
            start_time = frame_number / video_fps

            # Check for single face detection
            single_face_detected = is_single_face_on_screen(frame, face_cascade, min_face_size)

            # Debug: Print results of face detection
            print(f"Processing frame {frame_number}, Single face detected: {single_face_detected}")

            # If one face is detected, continue segment
            if single_face_detected:
                current_segment.append(frame_number)
            else:
                # If multiple or no faces are detected, save the segment and start a new one
                if current_segment:
                    print(f"Segment detected from frame {current_segment[0]} to {current_segment[-1]}")
                    segments.append(current_segment)
                    current_segment = []

        frame_number += 1

    cap.release()

    # If a segment is still ongoing, append it
    if current_segment:
        print(f"Final segment detected from frame {current_segment[0]} to {current_segment[-1]}")
        segments.append(current_segment)

    print(f"Total segments detected: {len(segments)}")
    return segments


  if event.key is 'enter':



In [7]:
def save_segments(video_path, segments, output_dir, fps=30):
    video_fps = int(cv2.VideoCapture(video_path).get(cv2.CAP_PROP_FPS))

    for i, segment in enumerate(segments):
        start_frame = segment[0]
        end_frame = segment[-1]
        start_time = start_frame / video_fps
        end_time = end_frame / video_fps
        duration = end_time - start_time

        # Check if the segment has a non-zero duration and is longer than 3 seconds
        if duration < 3:
            print(f"Skipping segment {i+1} as its duration is less than 3 seconds.")
            continue  # Skip segments with duration less than 3 seconds

        # Debug: Print details of the segment being saved
        print(f"Saving segment {i+1} from {start_time:.2f}s to {end_time:.2f}s")

        # Save the segment
        video_clip = mp.VideoFileClip(video_path).subclip(start_time, end_time)
        output_path = os.path.join(output_dir, f"segment_{i+1}.mp4")
        video_clip.write_videofile(output_path, fps=fps)

        print(f"Segment {i+1} saved to {output_path}")

    print("Segmentation process completed.")


In [9]:
# Path to the Haar Cascade file for face detection
face_cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
face_cascade = cv2.CascadeClassifier(face_cascade_path)

# Path to the video
video_path = "/content/NTV_Short_Videos/রমজানের জন্য ছোলা,খেজুর,তেলসহ নিত্য পণ্য আমদানির সিদ্ধান্ত  #ntv #viralvideo #shots #shortvideo.mp4"

# Output directory for saving segments
output_dir = "./segments"

# Extract segments based on single face detection
segments = extract_segments_based_on_faces(video_path, output_dir, face_cascade)

# Save the detected segments into separate video files
save_segments(video_path, segments, output_dir, fps=30)


Output directory: ./segments (created or already exists)
Processing frame 0, Single face detected: True
Processing frame 5, Single face detected: False
Segment detected from frame 0 to 0
Processing frame 10, Single face detected: True
Processing frame 15, Single face detected: True
Processing frame 20, Single face detected: True
Processing frame 25, Single face detected: False
Segment detected from frame 10 to 20
Processing frame 30, Single face detected: False
Processing frame 35, Single face detected: True
Processing frame 40, Single face detected: True
Processing frame 45, Single face detected: True
Processing frame 50, Single face detected: True
Processing frame 55, Single face detected: True
Processing frame 60, Single face detected: False
Segment detected from frame 35 to 55
Processing frame 65, Single face detected: False
Processing frame 70, Single face detected: True
Processing frame 75, Single face detected: True
Processing frame 80, Single face detected: True
Processing fram



MoviePy - Done.
Moviepy - Writing video ./segments/segment_8.mp4





Moviepy - Done !
Moviepy - video ready ./segments/segment_8.mp4
Segment 8 saved to ./segments/segment_8.mp4
Skipping segment 9 as its duration is less than 3 seconds.
Skipping segment 10 as its duration is less than 3 seconds.
Skipping segment 11 as its duration is less than 3 seconds.
Skipping segment 12 as its duration is less than 3 seconds.
Skipping segment 13 as its duration is less than 3 seconds.
Skipping segment 14 as its duration is less than 3 seconds.
Skipping segment 15 as its duration is less than 3 seconds.
Saving segment 16 from 21.20s to 25.60s
Moviepy - Building video ./segments/segment_16.mp4.
MoviePy - Writing audio in segment_16TEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video ./segments/segment_16.mp4





Moviepy - Done !
Moviepy - video ready ./segments/segment_16.mp4
Segment 16 saved to ./segments/segment_16.mp4
Skipping segment 17 as its duration is less than 3 seconds.
Skipping segment 18 as its duration is less than 3 seconds.
Skipping segment 19 as its duration is less than 3 seconds.
Skipping segment 20 as its duration is less than 3 seconds.
Skipping segment 21 as its duration is less than 3 seconds.
Skipping segment 22 as its duration is less than 3 seconds.
Skipping segment 23 as its duration is less than 3 seconds.
Skipping segment 24 as its duration is less than 3 seconds.
Skipping segment 25 as its duration is less than 3 seconds.
Skipping segment 26 as its duration is less than 3 seconds.
Skipping segment 27 as its duration is less than 3 seconds.
Skipping segment 28 as its duration is less than 3 seconds.
Skipping segment 29 as its duration is less than 3 seconds.
Skipping segment 30 as its duration is less than 3 seconds.
Skipping segment 31 as its duration is less than 

Temp

In [None]:
# Install required packages
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev
!pip install pytesseract moviepy opencv-python-headless

# Import necessary libraries
import cv2
import pytesseract
import moviepy.editor as mp
from google.colab import files
import numpy as np
import os

# Download the Bangla language model for Tesseract
!apt-get install tesseract-ocr-beng
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Step 1: Upload video
print("Please upload your video file")
uploaded = files.upload()

# Step 2: Load the video
video_file = list(uploaded.keys())[0]  # Get the uploaded file name
video = mp.VideoFileClip(video_file)

# Step 3: Define the function to extract subtitles from frames
def extract_bangla_subtitles_from_frame(frame, lang='ben'):
    """
    Extracts Bangla text from a given frame using Tesseract OCR.
    :param frame: Video frame (image) to process
    :param lang: Language for OCR (default is Bangla 'ben')
    :return: Extracted text (if any)
    """
    # Convert frame to grayscale (better for OCR)
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Apply thresholding to highlight the text
    _, thresh_frame = cv2.threshold(gray_frame, 150, 255, cv2.THRESH_BINARY_INV)

    # OCR processing on the frame
    text = pytesseract.image_to_string(thresh_frame, lang=lang)

    # Return the extracted text
    return text.strip()

# Step 4: Define function to process the video and extract subtitles
def process_video_for_subtitles(video, interval=1):
    """
    Extracts subtitles from a video at regular intervals using OCR.
    :param video: VideoClip object to process
    :param interval: Interval in seconds between frame extraction
    :return: List of subtitles and their corresponding timestamps
    """
    subtitles = []

    # Iterate through the video by extracting frames at regular intervals
    for t in np.arange(0, video.duration, interval):
        # Get the frame at time `t`
        frame = video.get_frame(t)

        # Extract the subtitle (if any) from the frame
        subtitle_text = extract_bangla_subtitles_from_frame(frame)

        # If text is found, add it to the subtitles list
        if subtitle_text:
            subtitles.append((t, subtitle_text))
            print(f"At {t:.2f} seconds: {subtitle_text}")

    return subtitles

# Step 5: Extract subtitles from the video
subtitles = process_video_for_subtitles(video, interval=1)  # Change interval for more precision

# Step 6: Save the subtitles to a text file
with open("extracted_subtitles.txt", "w", encoding="utf-8") as f:
    for timestamp, subtitle in subtitles:
        f.write(f"{timestamp:.2f}s: {subtitle}\n")

print("Subtitles have been extracted and saved to 'extracted_subtitles.txt'.")
files.download("extracted_subtitles.txt")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,132 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

Saving segment1.mp4 to segment1.mp4


TesseractError: (1, 'Error opening data file /usr/share/tesseract-ocr/4.00/tessdata/ben.traineddata Please make sure the TESSDATA_PREFIX environment variable is set to your "tessdata" directory. Failed loading language \'ben\' Tesseract couldn\'t load any languages! Could not initialize tesseract.')

In [1]:
pip install yt-dlp


Collecting yt-dlp
  Downloading yt_dlp-2024.11.4-py3-none-any.whl.metadata (172 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/172.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.1/172.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2024.11.4-py3-none-any.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m143.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2024.11.4


# All youtube video download


In [3]:
import os
import subprocess

# Define the channel URL and target folder
channel_url = "https://www.youtube.com/@NTVlatestnews/shorts"
output_folder = "NTV_Short_Videos"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Set yt-dlp command to download the first 10 videos in mp4 format
command = [
    "yt-dlp",
    f"{channel_url}",
    "--match-filters", "duration <= 60",
    "--max-downloads", "4",
    "-f", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4",
    "--merge-output-format", "mp4",
    "-o", f"{output_folder}/%(title)s.%(ext)s",
]

# Execute the command
subprocess.run(command)

print(f"Downloaded the first 10 videos to the folder '{output_folder}'.")


Downloaded the first 10 videos to the folder 'NTV_Short_Videos'.


In [4]:
import os
import subprocess

# Define the channel URL and target folder
channel_url = "https://www.youtube.com/@NTVlatestnews/shorts"
output_folder = "NTV_Short_Videos"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Set yt-dlp command to download the first 10 videos with subtitles and merge video/audio
command = [
    "yt-dlp",
    f"{channel_url}",
    "--match-filters", "duration <= 60",
    "--max-downloads", "3",
    "-f", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4",  # Best video and audio quality
    "--merge-output-format", "mp4",  # Merge into a single mp4 file
    "--write-subs",  # Download subtitles
    "--sub-lang", "en",  # Download English subtitles (or other language if needed)
    "--embed-subs",  # Embed subtitles into the video
    "-o", f"{output_folder}/%(title)s.%(ext)s",  # Output format
]

# Execute the command
subprocess.run(command)

print(f"Downloaded the first 10 videos with subtitles to the folder '{output_folder}'.")


Downloaded the first 10 videos with subtitles to the folder 'NTV_Short_Videos'.


# Subtitle generation

In [2]:
pip install openai-whisper transformers ffmpeg-python


Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/800.5 kB[0m [31m25.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.wh

In [3]:
!pip install textgrid # Install the textgrid package

Collecting textgrid
  Downloading TextGrid-1.6.1.tar.gz (9.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: textgrid
  Building wheel for textgrid (setup.py) ... [?25l[?25hdone
  Created wheel for textgrid: filename=TextGrid-1.6.1-py3-none-any.whl size=10147 sha256=6433d568f49f3735ed1c0e5c2a020dc99b61058d856ef6921d60291a50208322
  Stored in directory: /root/.cache/pip/wheels/23/41/f2/e2ef1817bd163de3c21dd078966bdd71bd5c4455841f4ec016
Successfully built textgrid
Installing collected packages: textgrid
Successfully installed textgrid-1.6.1


In [8]:
!apt-get install -y sox
!git clone https://github.com/lowerquality/gentle.git
%cd gentle
!pip install -r requirements.txt


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base libsox3 libwavpack1
Suggested packages:
  libsox-fmt-all
The following NEW packages will be installed:
  libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base libsox3 libwavpack1 sox
0 upgraded, 7 newly installed, 0 to remove and 49 not upgraded.
Need to get 617 kB of archives.
After this operation, 1,764 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libopencore-amrnb0 amd64 0.1.5-1 [94.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libopencore-amrwb0 amd64 0.1.5-1 [49.1 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libsox3 amd64 14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1 [240 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libs

In [10]:
!pip install twisted


Collecting twisted
  Downloading twisted-24.10.0-py3-none-any.whl.metadata (20 kB)
Collecting automat>=24.8.0 (from twisted)
  Downloading Automat-24.8.1-py3-none-any.whl.metadata (8.4 kB)
Collecting constantly>=15.1 (from twisted)
  Downloading constantly-23.10.4-py3-none-any.whl.metadata (1.8 kB)
Collecting hyperlink>=17.1.1 (from twisted)
  Downloading hyperlink-21.0.0-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting incremental>=24.7.0 (from twisted)
  Downloading incremental-24.7.2-py3-none-any.whl.metadata (8.1 kB)
Collecting zope-interface>=5 (from twisted)
  Downloading zope.interface-7.1.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading twisted-24.10.0-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m52.4 MB/s[0m eta [36

In [11]:
!python3 serve.py &


INFO:root:gentle 0.11.0
INFO:root:listening at 0.0.0.0:8765

INFO:root:SERVE 8765, 0.0.0.0, 1
Traceback (most recent call last):
  File "/content/gentle/serve.py", line 274, in <module>
    serve(args.port, args.host, nthreads=args.nthreads, ntranscriptionthreads=args.ntranscriptionthreads, installSignalHandlers=1)
  File "/content/gentle/serve.py", line 235, in serve
    trans = Transcriber(data_dir, nthreads=nthreads, ntranscriptionthreads=ntranscriptionthreads)
  File "/content/gentle/serve.py", line 34, in __init__
    self.resources = gentle.Resources()
  File "/content/gentle/gentle/resources.py", line 19, in __init__
    require_dir(self.proto_langdir)
  File "/content/gentle/gentle/resources.py", line 16, in require_dir
    raise RuntimeError("No resource directory %s.  Check %s environment variable?" % (path, ENV_VAR))
RuntimeError: No resource directory /content/gentle/exp.  Check GENTLE_RESOURCES_ROOT environment variable?


In [17]:
import ffmpeg

def extract_audio(video_path, audio_path="audio.wav"):
    ffmpeg.input(video_path).output(audio_path).run(overwrite_output=True)
    return audio_path

# Replace 'path/to/your/video.mp4' with the actual path of your video
video_path = "/content/segment_12.mp4"
audio_path = extract_audio(video_path)


In [18]:
import whisper

def transcribe_audio_whisper(audio_path):
    model = whisper.load_model("large")  # Use Whisper large for high accuracy
    result = model.transcribe(audio_path, language="bn")  # Set language to Bengali
    return result["text"]

transcription_text = transcribe_audio_whisper(audio_path)
print("Transcription:", transcription_text)


  checkpoint = torch.load(fp, map_location=device)


Transcription:  চিন ও ভারতের প্রকল্প বাস্তো বাযন হলে মঙ্লা বন্দরের চেহারা আগামি দুই বচ্ছরের মধ্যে পরিবর্তিত হাবে বলে জানান না পরিবহন মন্ত্রনলের উপদেশটা অবশ্রপ্ত ব্রিগেডিয়ার জেনোরেল ডক্টর এম শাখাবাত হসেন নিই চাইনাস সাথে চুক্তি হলে একাজ দুই বছ্যরের মধ্যে শেশ হাবে ফলে চট্রগ্রামের উপর চাপ কম্বে জিতু জি চাইনেস জে প্রপল্পো সেই প্রপল্পে এখানে কন্টেনের যাড তোরি হবে আমাং কন্টেনের স্বার্শ করে এখানে আস্থে ব


In [15]:
import requests
import json

def align_audio_gentle(audio_path, transcription_text):
    # Define the Gentle server URL
    gentle_url = "http://localhost:8765/transcriptions?async=false"

    # Prepare files for upload
    files = {
        'audio': open(audio_path, 'rb'),
        'transcript': (None, transcription_text)
    }

    # Send POST request to Gentle server
    response = requests.post(gentle_url, files=files)
    alignment_result = response.json()
    return alignment_result

alignment_data = align_audio_gentle(audio_path, transcription_text)



ConnectionError: HTTPConnectionPool(host='localhost', port=8765): Max retries exceeded with url: /transcriptions?async=false (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fef2dd852d0>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [16]:
def save_to_srt(alignment_data, output_path="subtitles.srt"):
    with open(output_path, "w", encoding="utf-8") as srt_file:
        index = 1
        for word_data in alignment_data['words']:
            if word_data['case'] == 'success':  # Only include successfully aligned words
                start = word_data['start']
                end = word_data['end']
                word = word_data['alignedWord']

                # Format start and end times in SRT format (hh:mm:ss,ms)
                start_time = f"{int(start // 3600):02}:{int((start % 3600) // 60):02}:{int(start % 60):02},{int((start % 1) * 1000):03}"
                end_time = f"{int(end // 3600):02}:{int((end % 3600) // 60):02}:{int(end % 60):02},{int((end % 1) * 1000):03}"

                srt_file.write(f"{index}\n{start_time} --> {end_time}\n{word}\n\n")
                index += 1

save_to_srt(alignment_data)
print("Subtitle file 'subtitles.srt' has been created.")


NameError: name 'alignment_data' is not defined

In [24]:
!apt-get install -y ffmpeg espeak
!pip install aeneas

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
espeak is already the newest version (1.48.15+dfsg-3).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Collecting aeneas
  Using cached aeneas-1.7.3.0.tar.gz (5.5 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: aeneas
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for aeneas (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for aeneas[0m[31m
[0m[?25h  Running setup.py clean for aeneas
Failed to build aeneas
[31mERROR: ERROR: Failed to build installable wheels for some p

In [21]:
import ffmpeg

def extract_audio(video_path, audio_path="audio.wav"):
    ffmpeg.input(video_path).output(audio_path).run(overwrite_output=True)
    return audio_path

# Specify your video file path
video_path = "/content/segment_12.mp4"
audio_path = extract_audio(video_path)


In [22]:
import whisper

def transcribe_audio_whisper(audio_path):
    model = whisper.load_model("large")  # Use large model for high accuracy
    result = model.transcribe(audio_path, language="bn")  # Set language to Bengali
    return result["text"]

transcription_text = transcribe_audio_whisper(audio_path)
print("Transcription:", transcription_text)


  checkpoint = torch.load(fp, map_location=device)


Transcription:  সাকালে বন্দরের VTMIS বন্দর জেটি


In [23]:
from aeneas.executetask import ExecuteTask
from aeneas.task import Task

def align_audio_aeneas(audio_path, transcription_text, output_path="aligned.srt"):
    # Create a temporary text file for the transcription
    with open("transcription.txt", "w", encoding="utf-8") as f:
        f.write(transcription_text)

    # Configure the alignment task
    task = Task(config_string="task_language=ben|is_text_type=plain|os_task_file_format=srt")
    task.audio_file_path_absolute = audio_path
    task.text_file_path_absolute = "transcription.txt"
    task.output_file_path_absolute = output_path

    # Execute the task
    ExecuteTask(task).execute()
    task.output_sync_map_file()
    print(f"Alignment saved to {output_path}")

align_audio_aeneas(audio_path, transcription_text)


ModuleNotFoundError: No module named 'aeneas'

In [25]:
!pip install -q whisper openai-whisper
!sudo apt update && sudo apt install -y ffmpeg

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for whisper (setup.py) ... [?25l[?25hdone
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [59.5 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,107 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get

In [26]:

import whisper

# Step 2: Load the Whisper model
model = whisper.load_model("large")  # 'small' model should handle Bangla well. You can choose 'base' or 'large' for more accuracy but higher computation.

# Step 3: Transcribe the video with timestamps
transcription = model.transcribe("/content/segment_12.mp4", language="bn")

# Step 4: Print the transcription with timestamps
for segment in transcription['segments']:
    print(f"Start: {segment['start']:.2f}s, End: {segment['end']:.2f}s, Text: {segment['text']}")

  checkpoint = torch.load(fp, map_location=device)


Start: 0.00s, End: 3.54s, Text:  সাকালে বন্দরের VTMIS বন্দর জেটি


In [28]:
# Step 1: Install whisper and ffmpeg
!pip install -q whisper openai-whisper
!sudo apt update && sudo apt install -y ffmpeg

import whisper

# Step 2: Load the Whisper model
model = whisper.load_model("large")

# Step 3: Transcribe the video with word-level timestamps
transcription = model.transcribe("/content/segment_12.mp4", language="bn", word_timestamps=True)

# Step 4: Display the transcription with word-level timestamps
for segment in transcription['segments']:
    print(f"Segment Start: {segment['start']:.2f}s, End: {segment['end']:.2f}s")
    for word in segment['words']:
        # Access the word using the 'word' key instead of 'text'
        print(f"  Word: '{word['word']}' - Start: {word['start']:.2f}s, End: {word['end']:.2f}s")

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
59 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 

  checkpoint = torch.load(fp, map_location=device)


Segment Start: 0.00s, End: 3.52s
  Word: ' সাকালে' - Start: 0.00s, End: 0.72s
  Word: ' বন্দরের' - Start: 0.72s, End: 1.32s
  Word: ' VTMIS' - Start: 1.32s, End: 2.34s
  Word: ' বন্দর' - Start: 2.34s, End: 3.16s
  Word: ' জেটি' - Start: 3.16s, End: 3.52s


In [31]:
# Step 1: Install whisper and ffmpeg
!pip install -q whisper openai-whisper
!sudo apt update && sudo apt install -y ffmpeg

import whisper
import subprocess

# Step 2: Load the Whisper model
model = whisper.load_model("small")

# Step 3: Transcribe the video with word-level timestamps
transcription = model.transcribe("/content/segment_12.mp4", language="bn", word_timestamps=True)

# Step 4: Segment and save the video for each word
input_video_path = "/content/segment_12.mp4"
output_folder = "/content/word_segments/"

# Create output folder if it doesn't exist
import os
os.makedirs(output_folder, exist_ok=True)

# Loop through each segment and word to create individual word video clips
for segment in transcription['segments']:
    for idx, word in enumerate(segment['words']):
        word_start = word['start']
        word_end = word['end']
        word_text = word['word']

        # Define the output path for each word video segment
        output_path = os.path.join(output_folder, f"{word_text}_{idx}.mp4")

        # Use ffmpeg to extract the segment for the specific word
        ffmpeg_command = [
            "ffmpeg",
            "-i", input_video_path,           # Input video
            "-ss", str(word_start),           # Start time
            "-to", str(word_end),             # End time
            "-c", "copy",                     # Copy codec (no re-encoding)
            output_path                       # Output file path
        ]

        # Run the ffmpeg command
        subprocess.run(ffmpeg_command)

        print(f"Saved word '{word_text}' segment as {output_path}")


[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
59 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 

OutOfMemoryError: CUDA out of memory. Tried to allocate 76.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 9.06 MiB is free. Process 30386 has 14.74 GiB memory in use. Of the allocated memory 14.08 GiB is allocated by PyTorch, and 535.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [1]:
# Step 1: Install whisper and ffmpeg
!pip install -q whisper openai-whisper
!sudo apt update && sudo apt install -y ffmpeg

import whisper
import subprocess
import os

# Step 2: Load the Whisper model
model = whisper.load_model("large")

# Step 3: Transcribe the video with word-level timestamps
transcription = model.transcribe("/content/segment_12.mp4", language="bn", word_timestamps=True)

# Define input video path
input_video_path = "/content/segment_12.mp4"

# Step 4: Process each word, create folders, and save video segments
output_base_folder = "/content/word_segments/"
os.makedirs(output_base_folder, exist_ok=True)  # Base folder for all segments

for segment in transcription['segments']:
    for idx, word in enumerate(segment['words']):
        word_text = word['word']
        word_start = word['start']
        word_end = word['end']

        # Calculate the duration of each word segment
        duration = word_end - word_start

        # Create a sanitized folder name for each word
        word_folder = os.path.join(output_base_folder, word_text)
        os.makedirs(word_folder, exist_ok=True)  # Create a folder for each word

        # Define output path for each video segment within the word's folder
        output_video_path = os.path.join(word_folder, f"{word_text}_{idx}.mp4")

        # Use ffmpeg to extract the segment for each word with re-encoding for compatibility
        ffmpeg_command = [
            "ffmpeg",
            "-ss", str(word_start),           # Start time
            "-i", input_video_path,           # Input video
            "-t", str(duration),              # Duration of the segment
            "-c:v", "libx264",                # Video codec for re-encoding
            "-preset", "ultrafast",           # Fast preset for quick processing
            "-an",                            # Disable audio (optional)
            output_video_path                 # Output file path
        ]

        # Execute the ffmpeg command to create the cropped video segment
        subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        print(f"Saved '{word_text}' segment to {output_video_path}")

print("All word segments have been saved in their respective folders.")


[33m0% [Working][0m            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
[33m0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connected to cloud.r-pr[0m                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state inform

  checkpoint = torch.load(fp, map_location=device)


Saved ' সাকালে' segment to /content/word_segments/ সাকালে/ সাকালে_0.mp4
Saved ' বন্দরের' segment to /content/word_segments/ বন্দরের/ বন্দরের_1.mp4
Saved ' VTMIS' segment to /content/word_segments/ VTMIS/ VTMIS_2.mp4
Saved ' বন্দর' segment to /content/word_segments/ বন্দর/ বন্দর_3.mp4
Saved ' জেটি' segment to /content/word_segments/ জেটি/ জেটি_4.mp4
All word segments have been saved in their respective folders.
