## Import Libraries

In [1]:
import torch
import librosa
import pandas as pd
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

## Load Pretrained Model

In [2]:
# The tokenizer and the Wav2Vec2ForCTC model are both loaded from the same
# pretrained checkpoint, so the checkpoint id is spelled out only once.
MODEL_NAME = "facebook/wav2vec2-base-960h"

tokenizer = Wav2Vec2Tokenizer.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load Audio (resampled to 16 kHz when transcribed)

In [4]:
# Read the full recording; it is cut into fixed-length snippets further down.
audio = AudioSegment.from_file("Audio/audio.flac", format="flac")

## Length of Each Audio Snippet in ms

In [5]:
# Length of one snippet, in milliseconds.
time_slice = 20000

# Number of full-length snippets that fit into the recording ...
upper_bound = len(audio) // time_slice
# ... plus one more file for whatever is left after the last full snippet.
snip_count = upper_bound + 1

## Split Audio File into Smaller Snippets

In [6]:
def SplitClips(audio, time_slice, upper_bound):
    """Cut ``audio`` into consecutive snippets and export each one as FLAC.

    Snippets ``0 .. upper_bound - 1`` are ``time_slice`` units long each.
    One extra snippet, numbered ``upper_bound``, holds whatever remains after
    the last full-length snippet (it may be empty). Files are written to
    ``Audiosnips/audio<N>.flac``; the ``Audiosnips`` directory must exist.

    Parameters
    ----------
    audio
        Sliceable audio object whose slices provide
        ``export(path, format)``.
    time_slice : int
        Length of each full snippet, in the units used to slice ``audio``.
    upper_bound : int
        Number of full-length snippets to export.
    """
    # Treat a negative count like zero so the remainder slice below starts at
    # the beginning of the recording instead of at a negative offset.
    full_count = max(upper_bound, 0)

    for i in range(full_count):
        start_time = i * time_slice
        audio_snip = audio[start_time:start_time + time_slice]
        audio_snip.export("Audiosnips/audio{0}.flac".format(i), "flac")

    # The remainder starts where the last full snippet ended. Deriving the
    # offset and the file number from full_count (rather than from the loop
    # variables) keeps the tail from being skipped and also works when the
    # loop body never ran.
    audio_snip = audio[full_count * time_slice:]
    audio_snip.export("Audiosnips/audio{0}.flac".format(full_count), "flac")

## Transcription to Dataframe

In [7]:
def transcript_to_dataframe(snip_count):
    """Transcribe the exported snippets and collect the text in a DataFrame.

    Reads ``Audiosnips/audio0.flac`` .. ``Audiosnips/audio<snip_count-1>.flac``,
    runs each through the module-level ``tokenizer`` and ``model``, and
    greedily decodes the most likely token at every position.

    Parameters
    ----------
    snip_count : int
        Number of snippet files to transcribe, starting from index 0.

    Returns
    -------
    pandas.DataFrame
        One row per snippet, indexed by snippet number, with the transcription
        in a single column. Empty when ``snip_count`` is 0.
    """
    trans = {}

    # Inference only: without autograd the forward pass does not keep
    # activations around for a backward pass that never happens.
    with torch.no_grad():
        for i in range(snip_count):
            # sr=16000 makes librosa resample every snippet to 16 kHz.
            speech, _ = librosa.load("Audiosnips/audio%d.flac" % i, sr=16000)

            # An empty clip (e.g. a zero-length trailing remainder) has
            # nothing to transcribe, so record an empty string and skip
            # the model call.
            if len(speech) == 0:
                trans[i] = ""
                continue

            input_values = tokenizer(speech, return_tensors='pt').input_values
            logits = model(input_values).logits
            # Greedy decoding: pick the highest-scoring token per position.
            predicted_ids = torch.argmax(logits, dim=-1)
            trans[i] = tokenizer.decode(predicted_ids[0])

    return pd.DataFrame.from_dict(trans, orient="index")

## Split and Transcribe

In [8]:
# Write the snippet files that the transcription cells below read back in.
SplitClips(audio=audio, time_slice=time_slice, upper_bound=upper_bound)

### Transcribe 15 min of Audio

In [9]:
%%time
transcript_dataframe = transcript_to_dataframe(45)

Wall time: 3min 41s


### Transcribe 24 min of Audio

In [10]:
%%time
transcript_dataframe = transcript_to_dataframe(72)

Wall time: 5min 26s


## Display the Transcription DataFrame

In [11]:
# Bare expression as the last line of the cell, so the notebook displays the
# frame produced by the most recent transcription cell (one row per snippet).
transcript_dataframe

Unnamed: 0,0
0,A YEAR AGO WERE HIT WITH A VIRUS THAT WAS MET ...
1,Y NINETEEN FEELED LIKE THEY WERE TAKEN IN ANOT...
2,ICE A YEAR FILL WITH THE LOSS OF LIFE AND THE ...
3,ERY AMERICAN THING TO DO IN FACT IT MAY BE THE...
4,FOR A VAXSENE AND SO MANY OF YOU AS HEMMINGWAY...
...,...
67,WERE ALSO BOUND TOGETHER BY THE HOPE AND THE P...
68,ONE AMERICA I BELIEVE WE CAN AND WE WILL WE'RE...
69,WE'LL COME OUT STRONGER WITH A RENEWED FAITH I...
70,DO WWHEN WE DO IT TOGETHER SO GOD BLESS YOU AL...
