# Automatic video cutter

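Cuts marked fragments out of a video: the audio track is transcribed with the Vosk offline speech recognition toolkit, and everything between a spoken "start" control word and a spoken "end" control word is removed.

- Vosk website: https://alphacephei.com/vosk/
- Vosk source code: https://github.com/alphacep/vosk-api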
   

## Import libraries

In [1]:
# install nbextension for jupyter notebook
# !pip install jupyter_contrib_nbextensions && jupyter contrib nbextension install

In [2]:
import os
import sys
import time

import wave
import json

import moviepy.editor as mp
from vosk import Model, KaldiRecognizer, SetLogLevel
# !pip install moviepy vosk

## Loading a vosk model

In [3]:
# path to the unpacked vosk model directory, downloaded from
# https://alphacephei.com/vosk/models
#
# models referenced by this notebook:
#   vosk-model-ru-0.10
#   vosk-model-en-us-0.21
model_path = "models/vosk-model-ru-0.10"

# The model has to be an unpacked directory. A plain file with the same name
# (e.g. an archive that was never unpacked) must not pass this check,
# otherwise Model() fails later with a much less helpful error.
if not os.path.isdir(model_path):
    print(f"Please download the model from https://alphacephei.com/vosk/models and unpack as {model_path}")
    sys.exit()

print(f"Reading your vosk model '{model_path}'...")
model = Model(model_path)
print(f"'{model_path}' model was successfully read")

Reading your vosk model 'models/vosk-model-ru-0.10'...
'models/vosk-model-ru-0.10' model was successfully read


## Set video filename

In [4]:
# path to the video file to convert
# (it is opened with moviepy's VideoFileClip further below)
video_path = "videos/2021-10-01_12-56-02.mkv"

# Strip the extension with splitext instead of slicing a fixed number of
# characters, so that extensions of any length (".webm", ".ts", ...) work.
video_stem = os.path.splitext(video_path)[0]

# new video filename to save
result_path = video_stem + "_processed.mp4"

# intermediate WAV file that is extracted from the video and recognized
audio_path = video_stem + ".wav"
# text_path = video_stem + ".txt"

## Read video and convert to mono audio

In [5]:
# read video
# (the same clip object is used below both to extract the audio track
# and as the source for cutting the final video)
clip = mp.VideoFileClip(video_path) 

In [6]:
# convert video to audio
# the ffmpeg_params=["-ac", "1"] parameter converts the audio to mono;
# recognize_audio_vosk() below refuses anything that is not mono PCM WAV
clip.audio.write_audiofile(audio_path, ffmpeg_params=["-ac", "1"])

chunk:  22%|██▏       | 204/911 [00:00<00:00, 1946.75it/s, now=None]

MoviePy - Writing audio in videos/2021-10-01_12-56-02.wav


                                                                    

MoviePy - Done.




## Speech Recognition with vosk model

In [7]:
def recognize_audio_vosk(audio_path, model):
    '''
    Recognize speech in a WAV file with a vosk model.
    Language of the recognition depends on the vosk model.

    The process is terminated via `sys.exit()` if the file is not
    mono, 16-bit, uncompressed PCM.

    Parameters:
        audio_path (str): path to the audio file to recognize. Must be WAV format mono PCM
        model: vosk model. Must be loaded with `model = Model(model_path)` command

    Returns:
        results (list): recognizer outputs parsed from JSON, in the order
            they were produced; the last element is the recognizer's final
            result. Each element is a dict with a 'text' key. Word-level
            output is enabled, and `get_segments_from_audio()` reads the
            per-word entries from the 'result' key ('word', 'start', 'end').
    '''
    # The context manager guarantees the WAV file is closed on every path,
    # including the sys.exit() below and any recognizer error.
    with wave.open(audio_path, "rb") as wf:
        # check that the audio is mono 16-bit uncompressed wav
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            sys.exit()

        rec = KaldiRecognizer(model, wf.getframerate())
        # ask the recognizer to include per-word entries in its results
        rec.SetWords(True)

        print('Starting to convert audio to text. It may take some time...')
        start_time = time.time()

        results = []
        # recognize speech using vosk model, feeding the audio in chunks of 4000 frames
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            # a truthy return value means a result is ready to be collected
            if rec.AcceptWaveform(data):
                results.append(json.loads(rec.Result()))

        # collect whatever the recognizer still holds after the last chunk
        results.append(json.loads(rec.FinalResult()))

    # forming a final string from the recognized parts
    text = ''.join(r.get('text', '') + ' ' for r in results)

    time_elapsed = time.strftime('%H:%M:%S',
                                 time.gmtime(time.time() - start_time))
    print(f'Done! Elapsed time = {time_elapsed}')

    print("\tVosk thinks you said:\n")
    print(text)

    return results

In [8]:
# transcribe the extracted WAV file; `results` is the input of the control-word search below
results = recognize_audio_vosk(audio_path=audio_path, model=model)

Starting to convert audio to text. It may take some time...
Done! Elapsed time = 00:00:20
	Vosk thinks you said:

в баре огонь ток болт вы не выкрашен бла бла бла бла бла бла как микс от центра уэйд сми окон мае че патруль ноутбук тушу льюсом кот туши луисом кор начало весь мой персонал не тратят это тает трубы клайд антенн сей конец хайкен и и и а потом 


In [None]:
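# delete audio
#
# TODO: not implemented yet - remove the intermediate WAV file (audio_path)
#       once recognition has finished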

## Search for control words and timestamps

In [9]:
def get_segments_from_audio(results, start_word='начало', end_word='конец', offset=0.5):
    '''
    Find the control words in the recognition results and build the list of
    video segments to KEEP, i.e. everything outside each
    `start_word` ... `end_word` span.

    Words are processed in the order they appear in `results`:
      * a `start_word` closes the segment currently being kept;
      * the next `end_word` opens a new kept segment;
      * a repeated `start_word` inside a cut, or an `end_word` without a
        preceding `start_word`, is ignored;
      * a `start_word` that is never followed by an `end_word` cuts
        everything up to the end of the video;
      * if no control word is found, the whole video is kept: [(0, None)].

    Parameters:
        results (array): Received from `recognize_audio_vosk()` function
        start_word (str): control word that signals the beginning of the video fragment to be cut
        end_word (str):   control word that signals the ending of the video fragment to be cut
        offset (float): offset in seconds. Number being subtracted from 'start_time' for 'start_word'
                        and added to 'end_time' for 'end_word'

    Returns:
        segments (array): list of tuples of two elements (start_seconds, end_seconds)
                          to keep. end_seconds is None for the last segment,
                          meaning "until the end of the video". The list can be
                          empty if nothing is left to keep.
    '''

    print("Starting the search for control words...")

    # starts = [1, 3], ends = [2, 4] ->
    # segments = (0, 1), (2, 3), (4, None)
    segments = []
    keep_from = 0    # start time of the segment that is currently being kept
    keeping = True   # False while we are between a start_word and its end_word

    for record in results:
        # records without recognized words (e.g. silence) have nothing to scan
        for word_object in record.get('result', []):
            # cycle by words in a sentence
            word = word_object['word']
            if word == start_word and keeping:
                # never go below 0: a negative time would not mean
                # "from the beginning" for the video cutter
                keep_until = max(0, word_object['start'] - offset)
                # skip empty/inverted segments produced by markers that are
                # at the very beginning or too close to each other
                if keep_until > keep_from:
                    segments.append((keep_from, keep_until))
                keeping = False
            # deliberately not `elif`: when start_word == end_word the same
            # spoken word both closes and reopens, cutting out only itself
            if word == end_word and not keeping:
                keep_from = word_object['end'] + offset
                keeping = True

    if keeping:
        # keep everything after the last end_word (or the whole video)
        segments.append((keep_from, None))

    print("The search of control words is completed. Got the following array of segments: \n")
    print(segments)

    return segments

In [10]:
# locate the control words (default start/end words and offset) and get the segments to keep
segments = get_segments_from_audio(results)

Starting the search for control words...
The search of control words is completed. Got the following array of segments: 

[(0, 22.3), (34.34, None)]


## Video Processing

In [11]:
def crop_video_by_segments(video, segments, result_path, verbose=False):
    '''
    Cut the given segments out of a video, join them in the given order and
    write the result to a file.

    Parameters:
        video : moviepy video clip to cut from (e.g. `mp.VideoFileClip(video_path)`)
        segments : iterable of (start_seconds, end_seconds) tuples to keep.
                   Received from `get_segments_from_audio()` function.
                   Both values are passed to `video.subclip()` as is.
        result_path (str): path of the video file to write
        verbose (bool): if True, print every segment that is kept. Default False.

    Returns:
        None

    Raises:
        ValueError: if `segments` is empty, i.e. there is nothing to write
    '''

    # materialize once so that generators are supported and emptiness can be checked
    segments = list(segments)
    if not segments:
        raise ValueError("No segments to keep: the resulting video would be empty.")

    clips = []  # list of all video fragments

    for start_seconds, end_seconds in segments:
        if verbose:
            print(f"Keeping segment: {start_seconds} -> {end_seconds}")
        # crop a video clip and add it to list
        clips.append(video.subclip(start_seconds, end_seconds))

    final_clip = mp.concatenate_videoclips(clips)
    final_clip.write_videofile(result_path)

In [12]:
# cut the kept segments out of the original clip and write them to result_path
crop_video_by_segments(video=clip, segments=segments, result_path=result_path)

chunk:   0%|          | 2/645 [00:00<00:37, 17.25it/s, now=None]

Moviepy - Building video videos/2021-10-01_12-56-02_processed.mp4.
MoviePy - Writing audio in 2021-10-01_12-56-02_processedTEMP_MPY_wvf_snd.mp3


t:   1%|          | 9/1754 [00:00<00:19, 89.96it/s, now=None]       

MoviePy - Done.
Moviepy - Writing video videos/2021-10-01_12-56-02_processed.mp4



                                                                

Moviepy - Done !
Moviepy - video ready videos/2021-10-01_12-56-02_processed.mp4
