# STT model Class

In [4]:
import torch
from transformers import AutoTokenizer, AutoModel
from moviepy.editor import VideoFileClip
from vosk import Model, KaldiRecognizer
import json
import os
import wave

class SpeechToTextFeatureExtractor:
    def __init__(self, embedding_model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", vosk_model_path="path/to/vosk/russian/model"):
        # Embedding model (changed to multilingual model)
        self.tokenizer = AutoTokenizer.from_pretrained(embedding_model_name, cache_dir="./cache")
        self.model = AutoModel.from_pretrained(embedding_model_name, cache_dir="./cache")
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(device)
        
        # Vosk setup (use Russian model)
        self.vosk_model = Model(vosk_model_path)

    def extract_audio(self, video_path):
        # Extract audio from video
        video = VideoFileClip(video_path)
        audio = video.audio
        audio_path = "temp_audio.wav"
        audio.write_audiofile(audio_path, codec='pcm_s16le')
        return audio_path

    def audio_to_text(self, audio_path):
        # Convert audio to text using Vosk
        wf = wave.open(audio_path, "rb")
        rec = KaldiRecognizer(self.vosk_model, wf.getframerate())

        text = ""
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                result = json.loads(rec.Result())
                text += result.get("text", "") + " "

        final_result = json.loads(rec.FinalResult())
        text += final_result.get("text", "")
        return text.strip()

    def get_embeddings(self, text):
        # Get embeddings from text
        inputs = self.tokenizer(text, return_tensors="pt",
                                padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings

    def extract_features(self, video_path):
        # Extract features from video
        audio_path = self.extract_audio(video_path)
        text = self.audio_to_text(audio_path)
        embeddings = self.get_embeddings(text)

        # Clean up temporary audio file
        os.remove(audio_path)

        return embeddings

# Test block:
english

In [3]:
import os
from tqdm import tqdm

# Create an instance of the extractor
extractor = SpeechToTextFeatureExtractor(vosk_model_path="/home/glooma/.cache/vosk/vosk-model-small-en-us-0.15")

# Path to the test video
# short video
test_video_path = "/home/glooma/Code/Python/ML/Hakatons/train_dataset_tag_video/videos/1e0a5151efc26a3a8e038e132f6b80f4.mp4"
# long video
# test_video_path = "/home/glooma/Code/Python/ML/Hakatons/train_dataset_tag_video/videos/0a148a3aa95e76ced2d993525badc986.mp4"
print(f"Processing video: {test_video_path}")

# Extract audio
audio_path = extractor.extract_audio(test_video_path)

# Convert audio to text with progress bar
wf = wave.open(audio_path, "rb")
rec = KaldiRecognizer(extractor.vosk_model, wf.getframerate())

text = ""
total_frames = wf.getnframes()
chunk_size = 4000
progress_bar = tqdm(total=total_frames, unit='frames', desc="Processing audio")

while True:
    data = wf.readframes(chunk_size)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        result = json.loads(rec.Result())
        text += result.get("text", "") + " "
    progress_bar.update(len(data))

final_result = json.loads(rec.FinalResult())
text += final_result.get("text", "")
text = text.strip()

progress_bar.close()
print(f"Transcribed text: {text[:100]}...")  # Print first 100 characters

# Get embeddings from the transcribed text
embeddings = extractor.get_embeddings(text)
print(f"Embeddings shape: {embeddings.shape}")
print(f"First few values of embeddings: {embeddings[0][:5]}")

# Clean up temporary files
if os.path.exists(audio_path):
    os.remove(audio_path)
    print(f"Temporary audio file removed: {audio_path}")

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /home/glooma/.cache/vosk/vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from /home/glooma/.cache/vosk/vosk-model-small-en-us-0.15/graph/HCLr.fst /home/glooma/.cache/vosk/vosk-model-small-en-us-0.15/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:308) Loading winfo /home/glooma/.cache/vosk/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int


Processing video: /home/glooma/Code/Python/ML/Hakatons/train_dataset_tag_video/videos/0a148a3aa95e76ced2d993525badc986.mp4
MoviePy - Writing audio in temp_audio.wav


chunk:   9%|▉         | 9778/108839 [00:02<00:28, 3448.22it/s, now=None]

KeyboardInterrupt: 

chunk:   9%|▉         | 9893/108839 [00:19<00:28, 3448.22it/s, now=None]

# Test block
russian

In [7]:
import os
from tqdm import tqdm
import wave
import json
from vosk import KaldiRecognizer  # Add this import


# Путь к русской модели Vosk
vosk_model_path = "/home/glooma/Code/Python/ML/Hakatons/hackathon_video_tagging/model/extractors/speech_to_text/vosk-model-small-ru-0.22"

# Проверка существования модели
if not os.path.exists(vosk_model_path):
    print(f"Модель не найдена по пути: {vosk_model_path}")
    print("Пожалуйста, скачайте модель с https://alphacephei.com/vosk/models")
    print("и распакуйте ее в указанную директорию.")
    raise Exception("Модель Vosk не найдена")

# Create an instance of the extractor
extractor = SpeechToTextFeatureExtractor(vosk_model_path=vosk_model_path)

# Path to the test video
# short video
test_video_path = "/home/glooma/Code/Python/ML/Hakatons/train_dataset_tag_video/videos/1e0a5151efc26a3a8e038e132f6b80f4.mp4"
# long video
# test_video_path = "/home/glooma/Code/Python/ML/Hakatons/train_dataset_tag_video/videos/0a148a3aa95e76ced2d993525badc986.mp4"

print(f"Обработка видео: {test_video_path}")

# Extract audio
audio_path = extractor.extract_audio(test_video_path)

# Convert audio to text with progress bar
wf = wave.open(audio_path, "rb")
rec = KaldiRecognizer(extractor.vosk_model, wf.getframerate())  # Changed this line

text = ""
total_frames = wf.getnframes()
chunk_size = 4000
progress_bar = tqdm(total=total_frames, unit='фреймы', desc="Обработка аудио")

while True:
    data = wf.readframes(chunk_size)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        result = json.loads(rec.Result())
        text += result.get("text", "") + " "
    progress_bar.update(len(data))

final_result = json.loads(rec.FinalResult())
text += final_result.get("text", "")
text = text.strip()

progress_bar.close()
print(f"Транскрибированный текст: {text[:100]}...")  # Вывод первых 100 символов

# Get embeddings from the transcribed text
embeddings = extractor.get_embeddings(text)
print(f"Форма эмбеддингов: {embeddings.shape}")
print(f"Первые несколько значений эмбеддингов: {embeddings[0][:5]}")

# Clean up temporary files
if os.path.exists(audio_path):
    os.remove(audio_path)
    print(f"Временный аудиофайл удален: {audio_path}")

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /home/glooma/Code/Python/ML/Hakatons/hackathon_video_tagging/model/extractors/speech_to_text/vosk-model-small-ru-0.22/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from /home/glooma/Code/Python/ML/Hakatons/hackathon_video_tagging/model/extractors/speech_to_text/vosk-model-small-ru-0.22/graph/HCLr.fst /home/glooma/Code/Python/ML/Hakatons/hackathon_video_tagging/model/extractors

Обработка видео: /home/glooma/Code/Python/ML/Hakatons/train_dataset_tag_video/videos/1e0a5151efc26a3a8e038e132f6b80f4.mp4
MoviePy - Writing audio in temp_audio.wav


                                                                    

MoviePy - Done.


Обработка аудио: 5115600фреймы [00:08, 570691.79фреймы/s]                         

Транскрибированный текст: группу юту кода стоишь не руках это было слышно смешной и стоишь у порах ну  ну да   мы уже дорога ч...
Форма эмбеддингов: torch.Size([1, 384])
Первые несколько значений эмбеддингов: tensor([ 0.0685,  0.0816, -0.1150, -0.2033, -0.0663])
Временный аудиофайл удален: temp_audio.wav



