# Automatic Transcription by Wav2Vec2, HuBERT, & Speech2Text

In [None]:
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, HubertForCTC, Speech2TextProcessor, Speech2TextForConditionalGeneration

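The `Wav2Vec2Processor` wraps both a feature extractor and a tokenizer. By default, calling the processor (`__call__`) runs the feature extractor on the raw audio, while decoding predicted ids (e.g. `batch_decode`) uses the tokenizer. More info [here](https://huggingface.co/transformers/model_doc/wav2vec2.html#transformers.Wav2Vec2Processor.__call__)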

In [None]:
# Load every model together with its processor. Each pair is created from
# the same checkpoint identifier via `from_pretrained`.
wav2vec2_checkpoint = "facebook/wav2vec2-large-robust-ft-libri-960h"
hubert_checkpoint = "facebook/hubert-large-ls960-ft"
s2t_checkpoint = "facebook/s2t-large-librispeech-asr"

# Wav2Vec2 (CTC head)
processor_wav = Wav2Vec2Processor.from_pretrained(wav2vec2_checkpoint)
model_wav = Wav2Vec2ForCTC.from_pretrained(wav2vec2_checkpoint)

# HuBERT (CTC head); its processor is loaded through the Wav2Vec2Processor class
processor_hub = Wav2Vec2Processor.from_pretrained(hubert_checkpoint)
model_hub = HubertForCTC.from_pretrained(hubert_checkpoint)

# Speech2Text (conditional generation)
model_s2t = Speech2TextForConditionalGeneration.from_pretrained(s2t_checkpoint)
processor_s2t = Speech2TextProcessor.from_pretrained(s2t_checkpoint)

In [None]:
# define speech-to-text function for wav2vec2
def wav_asr_transcript(audio_file, chunk_seconds=20):
    """Transcribe an audio file with the Wav2Vec2 CTC model, block by block.

    The file is read with ``librosa.stream`` in blocks of ``chunk_seconds``
    seconds. Each block is transcribed independently with the module-level
    ``processor_wav`` / ``model_wav`` pair, so words that straddle a block
    boundary may be split.

    Args:
        audio_file: Path to an audio file that ``librosa.stream`` can read.
        chunk_seconds: Length of each streamed block in whole seconds
            (positive integer). Defaults to 20.

    Returns:
        The lower-cased transcription of every block, each followed by
        ``". "``, concatenated in order. An empty string is returned when
        the stream yields no blocks.
    """
    target_sr = 16_000  # sampling rate declared to the processor

    # librosa.stream never resamples, so frames are sized from the file's own
    # rate: one frame == one second, `chunk_seconds` frames per block.
    native_sr = int(librosa.get_samplerate(audio_file))
    stream = librosa.stream(
        audio_file,
        block_length=chunk_seconds,
        frame_length=native_sr,
        hop_length=native_sr,
    )

    pieces = []
    for speech in stream:
        # librosa.stream yields mono blocks by default. Should a multi-channel
        # block appear, librosa uses a channels-first layout, so the channels
        # are averaged over axis 0.
        if speech.ndim > 1:
            speech = speech.mean(axis=0)

        # Bring the block to the rate that is declared to the processor below.
        if native_sr != target_sr:
            speech = librosa.resample(speech, orig_sr=native_sr, target_sr=target_sr)

        input_values = processor_wav(
            speech, sampling_rate=target_sr, return_tensors="pt"
        ).input_values
        with torch.no_grad():  # inference only, no gradients needed
            logits = model_wav(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor_wav.batch_decode(predicted_ids)[0]
        pieces.append(transcription.lower() + ". ")

    return "".join(pieces)


In [None]:
# define speech-to-text function for Hubert
def hub_asr_transcript(audio_file, chunk_seconds=20):
    """Transcribe an audio file with the HuBERT CTC model, block by block.

    The file is read with ``librosa.stream`` in blocks of ``chunk_seconds``
    seconds. Each block is transcribed independently with the module-level
    ``processor_hub`` / ``model_hub`` pair, so words that straddle a block
    boundary may be split.

    Args:
        audio_file: Path to an audio file that ``librosa.stream`` can read.
        chunk_seconds: Length of each streamed block in whole seconds
            (positive integer). Defaults to 20.

    Returns:
        The lower-cased transcription of every block, each followed by
        ``". "``, concatenated in order. An empty string is returned when
        the stream yields no blocks.
    """
    target_sr = 16_000  # sampling rate declared to the processor

    # librosa.stream never resamples, so frames are sized from the file's own
    # rate: one frame == one second, `chunk_seconds` frames per block.
    native_sr = int(librosa.get_samplerate(audio_file))
    stream = librosa.stream(
        audio_file,
        block_length=chunk_seconds,
        frame_length=native_sr,
        hop_length=native_sr,
    )

    pieces = []
    for speech in stream:
        # librosa.stream yields mono blocks by default. Should a multi-channel
        # block appear, librosa uses a channels-first layout, so the channels
        # are averaged over axis 0.
        if speech.ndim > 1:
            speech = speech.mean(axis=0)

        # Bring the block to the rate that is declared to the processor below.
        if native_sr != target_sr:
            speech = librosa.resample(speech, orig_sr=native_sr, target_sr=target_sr)

        input_values = processor_hub(
            speech, sampling_rate=target_sr, return_tensors="pt"
        ).input_values
        with torch.no_grad():  # inference only, no gradients needed
            logits = model_hub(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor_hub.batch_decode(predicted_ids)[0]
        pieces.append(transcription.lower() + ". ")

    return "".join(pieces)


In [None]:
# define speech-to-text function for Speech2Text
def s2t_asr_transcript(audio_file, chunk_seconds=20):
    """Transcribe an audio file with the Speech2Text model, block by block.

    The file is read with ``librosa.stream`` in blocks of ``chunk_seconds``
    seconds. Each block is transcribed independently with the module-level
    ``processor_s2t`` / ``model_s2t`` pair, so words that straddle a block
    boundary may be split.

    Args:
        audio_file: Path to an audio file that ``librosa.stream`` can read.
        chunk_seconds: Length of each streamed block in whole seconds
            (positive integer). Defaults to 20.

    Returns:
        The transcription of every block, each followed by ``". "``,
        concatenated in order. An empty string is returned when the stream
        yields no blocks.
    """
    target_sr = 16_000  # sampling rate declared to the processor

    # librosa.stream never resamples, so frames are sized from the file's own
    # rate: one frame == one second, `chunk_seconds` frames per block.
    native_sr = int(librosa.get_samplerate(audio_file))
    stream = librosa.stream(
        audio_file,
        block_length=chunk_seconds,
        frame_length=native_sr,
        hop_length=native_sr,
    )

    pieces = []
    for speech in stream:
        # librosa.stream yields mono blocks by default. Should a multi-channel
        # block appear, librosa uses a channels-first layout, so the channels
        # are averaged over axis 0.
        if speech.ndim > 1:
            speech = speech.mean(axis=0)

        # Bring the block to the rate that is declared to the processor below.
        if native_sr != target_sr:
            speech = librosa.resample(speech, orig_sr=native_sr, target_sr=target_sr)

        inputs = processor_s2t(speech, sampling_rate=target_sr, return_tensors="pt")
        # The features are passed positionally: they are spectrogram features,
        # not token ids, and the keyword for generate()'s first argument is
        # not the same across transformers releases.
        generated_ids = model_s2t.generate(
            inputs["input_features"], attention_mask=inputs["attention_mask"]
        )
        # Drop special tokens so they do not end up in the transcript.
        transcription = processor_s2t.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]
        pieces.append(transcription + ". ")

    return "".join(pieces)

## Transcribing the Entire Earnings Call

In [None]:
# Run all three transcribers, in turn, on the same earnings-call recording.
earnings_call_audio = r"../../../speech2text/apple/Audio/2021-Oct-29-AAPL.OQ-139435924054.flac"

wav2vec2_full_tr = wav_asr_transcript(earnings_call_audio)
hubert_full_tr = hub_asr_transcript(earnings_call_audio)
s2t_full_tr = s2t_asr_transcript(earnings_call_audio)

In [None]:
# Save the Wav2Vec2 transcript as text. The encoding is set explicitly so the
# written file does not depend on the platform's default locale.
with open("../../../speech2text/apple/wav2vec2_2021-Oct-29-AAPL.OQ-139435924054.txt", 'wt', encoding="utf-8") as file:
    file.write(wav2vec2_full_tr)