# Automatically annotating silences in a video or audio file

This notebook automatically annotates every silence lasting longer than a given duration (e.g. 5 seconds) in a video or audio file.


In [None]:
## preliminary definitions.
## you will have to install the audiosegment python library 
# !pip install audiosegment


import csv
import math

import audiosegment
import matplotlib.pyplot as plt
import numpy as np

def format_time(a):
    """Format a duration expressed in seconds as an 'hh:mm:ss' string.

    Each of the three fields is left-padded with a zero to at least two
    characters. The hours field is not wrapped, so it grows beyond two
    digits for durations of 100 hours or more.
    """
    hours = a // 3600
    minutes = (a % 3600) // 60
    seconds = a % 60
    return ':'.join(str(field).zfill(2) for field in (hours, minutes, seconds))

def getStretch( R ):
    """Locate the runs of consecutive truthy entries of a sequence.

    Returns a list of [first, last] index pairs, one per run and in order
    of appearance. Both indices are inclusive, so a run made of a single
    element is reported as [i, i].
    """
    runs = []
    inside_run = False
    for idx, flag in enumerate(R):
        if flag:
            if inside_run:
                # still in the run opened earlier: push its end forward
                runs[-1][1] = idx
            else:
                # first truthy entry after a falsy one: a new run starts
                runs.append([idx, idx])
        inside_run = flag
    return runs

def get_mean_every( data , every ):
    """Average the absolute value of `data` over consecutive chunks.

    Parameters
    ----------
    data : array-like
        Samples to summarise. Chunks are taken along the first axis; when
        `data` has more than one dimension, each mean covers every value of
        the chunk.
    every : int
        Number of items (along the first axis) per chunk. The last chunk is
        shorter when `len(data)` is not a multiple of `every`.

    Returns
    -------
    numpy.ndarray
        One mean per chunk, i.e. `ceil(len(data) / every)` values; empty
        when `data` is empty.

    Raises
    ------
    ValueError
        If `every` is smaller than 1.
    """
    if every < 1:
        raise ValueError(
            "every must be a positive number of samples, got {!r}".format(every))

    data = np.asarray(data)
    # np.abs() keeps the input dtype, so the most negative value of a signed
    # integer type (e.g. -32768 for int16) has no positive counterpart and
    # comes back unchanged. Widening each chunk to float64 first gives the
    # true magnitude; it is done per chunk to avoid copying the whole array.
    widen = data.dtype.kind == 'i'

    chunk_means = []
    for k in range(math.ceil(len(data) / every)):
        chunk = data[k * every:(k + 1) * every]
        if widen:
            chunk = chunk.astype(np.float64)
        chunk_means.append(np.mean(np.abs(chunk)))
    return np.array(chunk_means)

We read the audio and reduce it to the average sound amplitude of each second
(if the video/audio is long, this can take some time, maybe up to a minute, so be patient):

In [None]:

## Path of the recording to analyse.
## NB: the example uses an .m4a audio file, but this works with an .mp4 video too:
input_file = "toy_data/audio2803562588.m4a"

## Path of the CSV file that will receive the cut-out annotations:
output_file = "cutout_annotations.csv"

In [None]:
%%time
## Decode the audio of the input file.
seg = audiosegment.from_file(input_file)
print( seg )
## Convert the audio segment to a numpy array of samples.
A = seg.to_numpy_array()
## Mean absolute amplitude of each chunk of `seg.frame_rate` samples,
## i.e. one value per second of audio.
## get_mean_every() already takes the absolute value of every chunk, so A is
## passed as is: wrapping it in np.abs() here would give the same result while
## allocating a second full-size copy of the recording.
Ampl_second = get_mean_every( A , seg.frame_rate )

A second counts as silent whenever its average amplitude is below 10 (I found that 10 works well; use a higher value if the microphone adds background noise).

We want to annotate, for later removal, every silence that lasts longer than 5 seconds:

In [None]:
## Length threshold, in seconds: a silent stretch is annotated when the
## difference between its last and first second exceeds this value.
SILENCE_DURATION_THRESHOLD = 5

## Mean amplitude below which a one-second window counts as silent.
DETECTION_THRESHOLD = 10

## [first, last] indices (in seconds, both inclusive) of every silent stretch.
W = getStretch( Ampl_second< DETECTION_THRESHOLD )

starts = []
stops = []

print( 'start' ,'\t', 'duration'," \tcut out annotation")
for w in W:
    first, last = w
    ## getStretch reports the index of the last silent second, so this
    ## difference is one less than the number of silent seconds in the stretch.
    span = last - first
    if span <= SILENCE_DURATION_THRESHOLD:
        continue   # too short to be worth cutting out
    annotation = "\t\tcut out {} to {}".format( format_time(first) , format_time(last) )
    print( first ,'\t', span, annotation)


In [None]:
## Write the cut-out instructions to a CSV file: a header row, then one row
## per silent stretch selected by the same filter as the listing cell.
## csv.writer quotes fields when needed, so an input path containing a comma
## or a double quote still yields a well-formed file.
with open( output_file , 'w' ) as OUT:
    ## lineterminator='\n' ends each row with a plain newline (csv.writer
    ## defaults to '\r\n'); the text-mode file then applies the platform's
    ## usual newline translation, exactly as print() would.
    writer = csv.writer( OUT , lineterminator='\n' )
    writer.writerow( [ 'source' , 'start' , 'stop' , 'destination' ] )
    for w in W:
        if w[1]-w[0] > SILENCE_DURATION_THRESHOLD:
            writer.writerow( [ input_file , format_time(w[0]) , format_time(w[1]) , 'OUT' ] )

In [None]:
## Plot a specific stretch of the signal (from second a to second b)
## to check the automatic annotation against the raw audio:
a,b = 3390,3410

fig, ax = plt.subplots()
ax.plot( A[seg.frame_rate * a: seg.frame_rate * b] , label='full signal' )

## Per-second amplitude over the same stretch.
level = Ampl_second[a: b]
## The x positions are sized from the slice actually obtained: when b lies
## past the end of the recording the slice holds fewer than b-a values, and
## plotting it against np.arange(b-a) would fail with a shape mismatch.
## Each value is drawn at the middle of the second it summarises.
centres = seg.frame_rate*0.5 + seg.frame_rate * np.arange(len(level))
## A second whose mean amplitude is exactly 0 gives log10(0) = -inf;
## silence the resulting divide-by-zero warning.
with np.errstate(divide='ignore'):
    ax.plot( centres , 1000 * np.log10( level ) , label='1000 * log10(mean amplitude per second)' )

ax.set_xlabel( 'samples since second {}'.format(a) )
ax.set_title( 'signal from {} to {}'.format( format_time(a) , format_time(b) ) )
ax.legend();