In [None]:
pip install moviepy SpeechRecognition transformers torch


Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.4-py2.py3-none-any.whl.metadata (28 kB)
Downloading SpeechRecognition-3.10.4-py2.py3-none-any.whl (32.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 32.8/32.8 MB 29.2 MB/s eta 0:00:00
Installing collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.10.4


In [None]:
pip install pydub


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import torch
import os
import numpy as np
import h5py
import speech_recognition as sr  # For speech-to-text
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import moviepy.editor as mp
import torch
import moviepy.editor as mp
import speech_recognition as sr
from pydub import AudioSegment
import os

In [None]:


def convert_audio_to_text(video_file, segment_duration_ms=10000):
    """Extract a video's audio track and transcribe it chunk by chunk.

    The audio is written next to the video as ``<video stem>.wav`` (that file
    is kept), split into fixed-length chunks, and each chunk is sent to
    ``Recognizer.recognize_google``. Chunks that cannot be understood, or for
    which the recognition request fails, are reported and skipped.

    Args:
        video_file: Path to the input video.
        segment_duration_ms: Chunk length in milliseconds (default 10 s).

    Returns:
        ``(timestamps, text_data)``: two parallel lists holding, for every
        successfully transcribed chunk, its start time in seconds and its
        transcript.

    Raises:
        ValueError: If the video has no audio track.
    """
    import tempfile

    print('... extracting audio and converting to text')

    # Build the WAV path from the file stem. A plain ``.replace(".mkv", ...)``
    # returns the path unchanged for any other extension, which would make
    # the audio export overwrite the video itself.
    audio_file = os.path.splitext(video_file)[0] + ".wav"

    # Extract audio from the video; always release the clip afterwards.
    video = mp.VideoFileClip(video_file)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in {video_file!r}")
        video.audio.write_audiofile(audio_file)
    finally:
        video.close()

    # Load audio file for processing
    audio = AudioSegment.from_wav(audio_file)
    recognizer = sr.Recognizer()

    timestamps = []
    text_data = []

    total_duration_ms = len(audio)  # Total duration of the audio in milliseconds

    # A uniquely named scratch file avoids clobbering an unrelated
    # "segment.wav" in the working directory and is removed on every exit path.
    fd, segment_file = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # Process audio in chunks
        for start_time_ms in range(0, total_duration_ms, segment_duration_ms):
            end_time_ms = min(start_time_ms + segment_duration_ms, total_duration_ms)
            segment = audio[start_time_ms:end_time_ms]
            # export() hands back the open output handle; close it so the data
            # is flushed before the recognizer reads the file.
            segment.export(segment_file, format="wav").close()

            with sr.AudioFile(segment_file) as source:
                try:
                    print(f"... processing segment {start_time_ms / 1000} to {end_time_ms / 1000} seconds")
                    audio_chunk = recognizer.record(source)
                    transcript = recognizer.recognize_google(audio_chunk)

                    # Record timestamp and text
                    timestamps.append(start_time_ms / 1000)
                    text_data.append(transcript)

                except sr.UnknownValueError:
                    print(f"... segment {start_time_ms / 1000} to {end_time_ms / 1000} could not be understood")
                except sr.RequestError as e:
                    print(f"Could not request results from Google Speech Recognition service; {e}")
    finally:
        # Clean up temporary segment file
        if os.path.exists(segment_file):
            os.remove(segment_file)

    # Print results
    print(f"Timestamps: {timestamps}")
    print(f"Text Data: {text_data}")

    return timestamps, text_data


# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the BERT
# weights are loaded once per kernel instead of once per function call.
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return the cached ``(tokenizer, model)`` pair for ``model_name``."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def _embed_texts(texts):
    """Embed each string in ``texts``; returns an array of shape (len(texts), dim)."""
    tokenizer, model = _load_bert()

    vectors = []
    for sentence in texts:
        # truncation=True keeps over-long text within the model's length limit
        # instead of failing inside the forward pass.
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # pooler_output has a leading batch axis of size 1; drop only that axis.
        vectors.append(outputs.pooler_output.squeeze(0).numpy())

    if not vectors:
        # Stay 2-D even when there is nothing to embed.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.stack(vectors)


def get_text_embeddings(text_data):
    """Convert transcribed text chunks into BERT sentence embeddings.

    Args:
        text_data: Iterable of strings, one per transcribed chunk.

    Returns:
        2-D ``numpy`` array with one row per input string, built from the
        model's ``pooler_output``. Empty input yields an array with zero rows.
    """
    print('... converting text to embeddings')
    return _embed_texts(list(text_data))  # 2D: (num_texts, embedding_dim)


def convert_text_to_embedding(text):
    """Embed a single string and return it as a ``(1, embedding_dim)`` array."""
    return _embed_texts([text])

# Step 3: Search Keywords and Jump to Video
def search_video(keyword, text_embeddings, timestamps):
    """Return the timestamp whose text embedding is most similar to ``keyword``.

    Args:
        keyword: Query string; embedded with ``convert_text_to_embedding``.
        text_embeddings: Array-like of embeddings, either 2-D with one row per
            chunk or a single 1-D embedding.
        timestamps: Sequence with one entry per embedding row.

    Returns:
        The element of ``timestamps`` at the index of the highest cosine
        similarity between the keyword embedding and the text embeddings.

    Raises:
        ValueError: If the embeddings are empty, have more than two
            dimensions, or do not line up one-to-one with ``timestamps``.
    """
    # Accept plain lists as well as arrays.
    text_embeddings = np.asarray(text_embeddings)

    if text_embeddings.size == 0:
        raise ValueError("The embeddings array is empty. Ensure text was correctly converted to embeddings.")

    # A single 1-D embedding becomes a one-row matrix.
    text_embeddings = np.atleast_2d(text_embeddings)
    if text_embeddings.ndim > 2:
        raise ValueError("text_embeddings has more than 2 dimensions")

    # A mismatch would otherwise surface as an IndexError or, worse, a
    # timestamp that belongs to a different chunk.
    if len(timestamps) != text_embeddings.shape[0]:
        raise ValueError(
            f"Got {len(timestamps)} timestamps for {text_embeddings.shape[0]} embeddings; "
            "they must correspond one-to-one."
        )

    # Embed the keyword only after the cheap checks have passed.
    keyword_embedding = convert_text_to_embedding(keyword)

    similarities = cosine_similarity(keyword_embedding, text_embeddings)

    # argmax works on the flattened matrix; the keyword embedding is expected
    # to be one row (see convert_text_to_embedding), so this is the chunk index.
    best_idx = int(np.argmax(similarities))

    return timestamps[best_idx]

def jump_to_timestamp(video_file, timestamp):
    """Preview ``video_file`` from ``timestamp`` (seconds) to the end of the clip.

    The clip is closed when the preview finishes or raises, so the file is
    not left open.

    Args:
        video_file: Path to the video to play.
        timestamp: Start position in seconds.
    """
    print(f"... jumping to timestamp: {timestamp} seconds")
    video = mp.VideoFileClip(video_file)
    try:
        end_time = video.duration
        video.subclip(timestamp, end_time).preview()
    finally:
        video.close()

# Main Process
if __name__ == '__main__':
    # Placeholder path: point this at the real video before running.
    video_file = 'video_file'
    # Transcribe the audio track; returns parallel lists of chunk start times
    # (seconds) and the recognized text for each chunk.
    timestamps, text_data = convert_audio_to_text(video_file)


    # One embedding per transcribed chunk.
    text_embeddings = get_text_embeddings(text_data)


    # Placeholder query: replace with the word or phrase to look for.
    keyword = "keyword"
    # Start time of the chunk whose embedding has the highest cosine
    # similarity to the keyword. search_video raises ValueError when
    # text_embeddings is empty (i.e. nothing was transcribed).
    timestamp = search_video(keyword, text_embeddings, timestamps)

    print(f"Timestamp to jump to for keyword '{keyword}': {timestamp} seconds")

    # Preview the video from that start time to the end.
    jump_to_timestamp(video_file, timestamp)


... extracting audio and converting to text
MoviePy - Writing audio in /content/drive/MyDrive/Rick and Morty S03E03 {English} 720p WEB-DL ESub [BollyFlix].wav




MoviePy - Done.
... processing segment 0.0 to 10.0 seconds
... processing segment 10.0 to 20.0 seconds
... processing segment 20.0 to 30.0 seconds
... processing segment 30.0 to 40.0 seconds
... processing segment 40.0 to 50.0 seconds
... processing segment 50.0 to 60.0 seconds
... processing segment 60.0 to 70.0 seconds
... segment 60.0 to 70.0 could not be understood
... processing segment 70.0 to 80.0 seconds
... segment 70.0 to 80.0 could not be understood
... processing segment 80.0 to 90.0 seconds
... segment 80.0 to 90.0 could not be understood
... processing segment 90.0 to 100.0 seconds
... processing segment 100.0 to 110.0 seconds
... processing segment 110.0 to 120.0 seconds
... processing segment 120.0 to 130.0 seconds
... processing segment 130.0 to 140.0 seconds
... processing segment 140.0 to 150.0 seconds
... processing segment 150.0 to 160.0 seconds
... processing segment 160.0 to 170.0 seconds
... processing segment 170.0 to 180.0 seconds
... processing segment 180.0 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Timestamp to jump to for keyword 'jaguar': 1010.0 seconds
... jumping to timestamp: 1010.0 seconds
