In [1]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
import IPython.display as ipd
import os

In [2]:
# Load the source recording; the same call also works for .mp4 and .wav files.
source_path = 'audio.m4a'
audio_full = AudioSegment.from_file(source_path)

In [3]:
# Cut the recording into chunks wherever a prolonged silence occurs.
# min_silence_len (milliseconds) and silence_thresh (dBFS) may need adjustment:
# good values depend on the background noise, the tempo of the speech, etc.
# keep_silence=True keeps the silent stretches inside the chunks, so the chunk
# durations still add up to the full recording (checked further below).
parts = split_on_silence(
    audio_full,
    min_silence_len=2500,
    silence_thresh=-32,
    keep_silence=True,
)

In [4]:
# Report how many chunks the split produced.
part_count = len(parts)
print('total: ', part_count, ' parts')

total:  5  parts


In [5]:
# Listen to some of the chunks to check that the splitting works as intended.
first_part = parts[0]
first_part

In [6]:
# Save every chunk as its own audio file and add up the chunk durations so the
# split can be sanity-checked against the full recording afterwards.
parts_dir = "./parts"
# Create the output folder if needed instead of requiring a manual step.
# NOTE: files left over from an earlier run are not removed; clear the folder
# first if a previous run produced more chunks than this one.
os.makedirs(parts_dir, exist_ok=True)

# Zero-pad the index so that the lexicographic file order matches the chunk
# order even with 100+ chunks (the transcription step relies on sorted file
# names). At least two digits are used to keep the "part00.mp4" naming.
pad_width = max(2, len(str(len(parts) - 1)))

length_of_parts = 0
total_length = audio_full.duration_seconds

for i, audio_item in enumerate(parts):
    length_of_parts += audio_item.duration_seconds
    file_name = "part" + str(i).zfill(pad_width) + ".mp4"
    audio_item.export(os.path.join(parts_dir, file_name), format="mp4")

In [7]:
# Sanity check: both durations should match (up to floating-point rounding),
# because the silent stretches were kept when splitting.
duration_report = (
    ('Total length: ', total_length),
    ('Sum of chunks length: ', length_of_parts),
)
for label, seconds in duration_report:
    print(label, seconds)

Total length:  59.392
Sum of chunks length:  59.391999999999996


In [8]:
# Play back the first exported file to check that the chunks were saved correctly.
saved_clip = ipd.Audio('./parts/part00.mp4')
saved_clip

In [9]:
import torch
import librosa
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor, # processor combines feature extractor and tokenizer 
)


In [None]:
# Checkpoint used for French speech recognition.
# Alternative that was tried: "facebook/wav2vec2-large-xlsr-53-french"
# The checkpoint below was found to be more accurate.
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-french"

# The processor bundles the feature extractor and the tokenizer; the model
# produces the CTC logits. Both are loaded from the same checkpoint.
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

In [None]:
# Transcribe every exported chunk, in file-name order, into list_of_scripts.
parts_dir = './parts'
sample_rate = 16000  # the audio is resampled to this rate and the processor is told the same value

list_of_scripts = []
for f in sorted(os.listdir(parts_dir)):
    # skip service files and work only with the exported audio pieces
    if not f.endswith('.mp4'):
        continue

    # read the audio as a waveform array resampled to sample_rate
    array_audio, _ = librosa.load(os.path.join(parts_dir, f), sr=sample_rate)

    # feed the array to the processor to prepare the model inputs
    inputs = processor(array_audio, sampling_rate=sample_rate, return_tensors="pt", padding=True)

    # obtain the classification scores (no gradients needed for inference)
    with torch.no_grad():
        logits = model(inputs['input_values'], attention_mask=inputs['attention_mask']).logits

    # pick the most likely token at every time step
    pred_ids = torch.argmax(logits, dim=-1)

    # decode the prediction; a single audio was passed, so take the first result
    transcription = processor.batch_decode(pred_ids)[0]

    # a chunk with no recognised speech decodes to an empty string; skip it
    # instead of failing on the first-character lookup below
    if not transcription.strip():
        continue

    # "prettify" the transcription: lowercase everything except the first
    # character and add "." at the end
    sentence = transcription[0] + transcription[1:].lower() + '.'
    list_of_scripts.append(sentence)

In [17]:
from autocorrect import Speller

# Spell-check each transcribed sentence, print it and save it to transcript.txt.
spell = Speller('fr') # we work in french

# Write the file as UTF-8 explicitly: the text contains accented characters and
# the platform default encoding is not the same everywhere.
with open('transcript.txt', 'w', encoding='utf-8') as transcript_file:
    for item in list_of_scripts:
        spell_corrected = spell(item) # check and correct the spelling of each sentence
        print(spell_corrected) # print the corrected sentence
        transcript_file.write("%s\n" % spell_corrected) # one corrected sentence per line

dictionary for this language not found, downloading...
__________________________________________________
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
done!
Danseurs tout le monde je vais lire un article équipe dia sur les pingouins.
C'est parti le terme pingouin et son équivalent anglais ou encore néerlandais désigne à l'origine l'espèce des oiseaux inaptovols le grand pingouin pinguinius impunis qui vivait dans le nord de l'océan-atlantique et qui appartenant à la famil.
Des alcidéesces termes furent ensuite utilisés par les premiers européens à découvrir des manches notamment le marchand de magellan et le manchot du cap du fait de leur ressemblance avec le grand pingouin et de leur incapacité à voler on a alors commencé à parler en français comme dans les autres langues des pingouins du nord et des pingouins du sudcependant.
Les scientifiquement rapidement compris qu'il s'agissait d'oiseaux n'ayant pas de lyonne parentés avec le grand pingouin et c'est l'ornithologue français 