# Automatic video cutter

## Import libraries

In [1]:
import os
import sys
import time
import wave
import json

import moviepy.editor as mp
from vosk import Model, KaldiRecognizer, SetLogLevel
# To install the dependencies from Jupyter, uncomment and run the next cell.

In [2]:
# %pip install "moviepy<2" vosk  # this notebook imports moviepy.editor, which moviepy 2.x no longer provides

## Loading a vosk model

In [3]:
# Path to the unpacked vosk model directory.
# Models can be downloaded from https://alphacephei.com/vosk/models
model_path = "models/vosk-model-ru-0.10"

# Without the model directory there is nothing to load, so stop here.
if not os.path.exists(model_path):
    # The trailing space after "from" keeps the URL from being glued to the word.
    print("Please download the model from " +
          f"https://alphacephei.com/vosk/models and unpack as {model_path}")
    sys.exit()

print(f"Reading your vosk model '{model_path}'...")
# `model` is passed to the recognition function later in the notebook.
model = Model(model_path)
print(f"'{model_path}' model was successfully read")

Reading your vosk model 'models/vosk-model-ru-0.10'...
'models/vosk-model-ru-0.10' model was successfully read


## Set video filename

In [4]:
# Path to the video file to convert.
# Any extension supported by ffmpeg works:
# .ogv, .mp4, .mpeg, .avi, .mov, .mkv etc.
video_path = "videos/test.mp4"

# The derived names are built with os.path.splitext rather than by slicing a
# fixed number of characters, so extensions of any length (.ogv, .mpeg, ...)
# are stripped correctly.

# New filename to save the final video
result_path = os.path.splitext(video_path)[0] + "_processed.mp4"

# Temporary filename for the audio file (will be deleted)
audio_path = os.path.splitext(video_path)[0] + ".wav"

## Read video and convert it to mono audio

In [5]:
# Exit early with a message if the video file is missing
if not os.path.exists(video_path):
    print(f"File {video_path} doesn't exist")
    sys.exit()

# Open the video with moviepy; `clip` is used below for the audio export and for cutting
clip = mp.VideoFileClip(video_path)

In [6]:
# Write the audio track of the video to the temporary audio file.
# The ffmpeg_params=["-ac", "1"] parameter converts the audio to mono format
clip.audio.write_audiofile(audio_path, ffmpeg_params=["-ac", "1"])

chunk:  39%|███▉      | 581/1479 [00:00<00:00, 3027.10it/s, now=None]

MoviePy - Writing audio in videos/test.wav


                                                                      

MoviePy - Done.




## Speech Recognition with vosk model

In [7]:
def recognize_audio_vosk(audio_path, model):
    '''
    Recognize speech in a WAV file with a vosk model.

    The language of the recognition depends on the model.
    Returns a list of dictionaries parsed from the JSON strings produced by
    the recognizer. Each of them has the following structure:

    {'result': [{'conf': 0.849133, # confidence
                 'end': 4.5, # end time
                 'start': 4.05, # start time
                 'word': 'test'}, # recognized word

                 # the same for other words in sentence
                 ],
     'text': 'test'}

    The audio file is opened with a context manager, so it is closed before
    the function returns or exits. This lets the caller delete the file
    afterwards.

    Parameters:
        audio_path (str): path to the audio file to recognize. Must be WAV format mono PCM
        model: vosk model. Must be loaded with `model = Model(model_path)` command

    Returns:
        results (array): list of dictionaries from vosk model, in the order
                         they were produced; the last one is the final result

    Raises:
        SystemExit: if the audio file is not mono, not 2 bytes per sample,
                    or is compressed
    '''

    with wave.open(audio_path, "rb") as wf:
        # check if audio is mono wav (1 channel, 2-byte samples, no compression)
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM")
            sys.exit()

        rec = KaldiRecognizer(model, wf.getframerate())
        # request per-word entries (the 'result' list shown above)
        rec.SetWords(True)

        print('Starting to convert audio to text. It may take some time...')
        start_time = time.time()

        results = []
        # feed the audio to the recognizer in chunks of 4000 frames
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            # a truthy return value means a partial result is ready to collect
            if rec.AcceptWaveform(data):
                results.append(json.loads(rec.Result()))

        # collect whatever the recognizer still holds after the last chunk
        results.append(json.loads(rec.FinalResult()))

    # forming a final string from the recognized texts
    # (each text is followed by a space, exactly as before)
    text = ''.join(part['text'] + ' ' for part in results)

    time_elapsed = time.strftime('%H:%M:%S',
                                 time.gmtime(time.time() - start_time))
    print(f'Done! Elapsed time = {time_elapsed}')

    print("\tVosk thinks you said:\n")
    print(text)

    return results

In [8]:
# Transcribe the extracted audio; `results` is the list of vosk result dictionaries
results = recognize_audio_vosk(audio_path=audio_path, 
                               model=model)

Starting to convert audio to text. It may take some time...
Done! Elapsed time = 00:00:21
	Vosk thinks you said:

баре видео и меняю крэша анал и мелтон мэтью петра ноутбук ты шёл и начало на с м оперся на алтарь бишон видео с рулетками юпитера ноутбук    круиз не пайтон скрыт ы не конец сел но вами а вот у ноутбука фронт то что волны не то импорт сам пайтон 


In [9]:
# Delete the temporary audio file; the recognition results are already in `results`
os.remove(audio_path)

## Search for control words and timestamps

In [10]:
def get_segments_from_audio(results, start_word='начало', end_word='конец', offset=0.5):
    '''
    Search the recognition results for the control words and return
    'segments' - the list of video fragments to KEEP.

    Everything between a 'start_word' and the next 'end_word' is cut out
    (together with 'offset' seconds on both sides), so the kept fragments are:

        (0, first start - offset),
        (previous end + offset, next start - offset),
        ...
        (last end + offset, None)      # None means "until the end of the video"

    Example: start words at [1, 3], end words at [2, 4], offset=0 ->
    segments = [(0, 1), (2, 3), (4, None)]

    Unbalanced or missing control words are handled without errors:
        * no control words at all -> [(0, None)] (the whole video is kept);
        * an 'end_word' with no open cut, or a repeated 'start_word' inside a cut,
          is ignored;
        * a 'start_word' that is never followed by an 'end_word' cuts
          everything up to the end of the video;
        * kept fragments never start before 0, and empty fragments are dropped.

    Parameters:
        results (array): list of JSON dictionaries from vosk model.
                         Received from `recognize_audio_vosk()` function.
                         Dictionaries without a 'result' key are skipped.
        start_word (str): control word that signals the beginning of the video fragment to be cut
        end_word (str): control word that signals the ending of the video fragment to be cut
        offset (float): offset in seconds. Number being subtracted from 'start_time' for 'start_word'
                        and added to 'end_time' for 'end_word'

    Returns:
        segments (array): list of tuples (start_time, end_time) to keep
    '''

    print("Starting the search for control words...")

    # collect every control word as (word start, word end, is_start, is_end)
    marks = []
    for record in results:
        for word_object in record.get('result', []):
            word = word_object['word']
            if word == start_word or word == end_word:
                marks.append((word_object['start'], word_object['end'],
                              word == start_word, word == end_word))
    # process the control words in the order they were spoken
    marks.sort(key=lambda mark: mark[0])

    segments = []
    keeping = True      # True while the current part of the video is kept
    segment_start = 0   # beginning of the fragment that is currently kept
    for word_start, word_end, is_start, is_end in marks:
        if keeping and is_start:
            # a cut begins: close the kept fragment (never before time 0)
            segment_end = max(0, word_start - offset)
            if segment_end > segment_start:
                segments.append((segment_start, segment_end))
            keeping = False
        elif not keeping and is_end:
            # the cut is over: the next kept fragment starts after the end word
            segment_start = word_end + offset
            keeping = True

    if keeping:
        # keep the rest of the video after the last completed cut
        segments.append((segment_start, None))

    print("The search of control words is completed. Got the following array of segments: \n")
    print(segments)

    return segments

In [11]:
# Find the control words using the function's default start_word, end_word and offset
segments = get_segments_from_audio(results)

Starting the search for control words...
The search of control words is completed. Got the following array of segments: 

[(0, 11.65), (57.56, None)]


## Video Processing

In [12]:
def crop_video_by_segments(video, segments, result_path):
    '''
    Crop video according to 'segments' list and
    save final video to 'result_path'.

    Every (start_time, end_time) tuple is cut out of 'video' with
    `video.subclip()`, the fragments are joined in the given order and
    the result is written to 'result_path'.

    Parameters:
        video: moviepy.editor.VideoFileClip object
        segments (array): list of tuples (start_time, end_time).
                          Received from `get_segments_from_audio()` function
        result_path (str): new filename to save final video

    Returns:
        None

    Raises:
        ValueError: if 'segments' is empty, because there is nothing to write
    '''

    # materialize once so that any iterable is accepted and emptiness can be checked
    segments = list(segments)
    if not segments:
        # fail with a clear message instead of an obscure error from the concatenation
        raise ValueError("Nothing to write: 'segments' is empty")

    print("Starting the video processing...")

    # crop one video fragment per segment
    clips = [video.subclip(start_seconds, end_seconds)
             for start_seconds, end_seconds in segments]

    final_clip = mp.concatenate_videoclips(clips)
    final_clip.write_videofile(result_path)

    print("The video processing is completed")

In [13]:
# Cut the kept segments out of the original clip and save the result to `result_path`
crop_video_by_segments(video=clip, 
                       segments=segments,
                       result_path=result_path)

Starting the video processing...


chunk:  52%|█████▏    | 243/467 [00:00<00:00, 1343.45it/s, now=None]

Moviepy - Building video videos/test_processed.mp4.
MoviePy - Writing audio in test_processedTEMP_MPY_wvf_snd.mp3


t:   3%|▎         | 16/635 [00:00<00:03, 156.86it/s, now=None]      

MoviePy - Done.
Moviepy - Writing video videos/test_processed.mp4



                                                               

Moviepy - Done !
Moviepy - video ready videos/test_processed.mp4
The video processing is completed
