<a href="https://colab.research.google.com/github/StwayneXG/job-application-tasks/blob/main/%5BHuggingFace%5D_Afiniti_Task_Real_time_Transcription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install Dependencies
# Download these to:
# 1. Load models
# 2. Use microphone (cannot use mic on colab)
# 3. Load evaluation functions

# Only for Linux based systems
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg > /dev/null 

!pip install transformers > /dev/null
!pip install sounddevice > /dev/null
!pip install jiwer > /dev/null
!wget -O speech.wav https://github.com/EN10/DeepSpeech/blob/master/man1_wb.wav?raw=true

In [None]:
#@title Load Model
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# The processor (audio -> input features, token ids -> text) and the model weights
# are loaded from the same checkpoint, so the name is spelled out only once.
whisper_checkpoint = "openai/whisper-tiny"

processor = WhisperProcessor.from_pretrained(whisper_checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint)

# Drop any decoder prompt ids stored in the checkpoint's config.
# NOTE(review): presumably so generation is not pinned to a preset language/task — confirm against the Whisper docs.
model.config.forced_decoder_ids = None

In [None]:
#@title Define Audio Parameters

# Sampling rate in Hz. 16 kHz is the standard rate for speech and the rate
# the model used in this notebook was trained on.
sample_rate = 16_000

# Number of frames handed to the model per step: 32 * 1024 = 32768 frames,
# i.e. roughly two seconds at 16 kHz. A decent chunk is needed so the model
# gets some context and words are less likely to be cut in half.
chunk_size = 1 << 15

In [None]:
#@title Set up Input Stream
import wave

import numpy as np
import sounddevice as sd

use_mic = False                           # Set to True to test on PC and use mic
predicted_transcript = ""                 # Will contain complete transcription. Stored to be later used as context by question-answering model

INT16_FULL_SCALE = np.iinfo(np.int16).max  # Divisor that maps int16 PCM onto roughly [-1, 1]


def transcribe_chunk(pcm_int16, sampling_rate):
    """Transcribe one chunk of mono 16-bit PCM audio with the Whisper model.

    Shared by the microphone and the wave-file paths so both normalise,
    decode and post-process a chunk in exactly the same way.

    Args:
        pcm_int16: 1-D array of int16 samples (a single channel).
        sampling_rate: Sampling rate of ``pcm_int16`` in Hz.

    Returns:
        The lower-cased transcription of the chunk as a single string.
    """
    # Scale to floating point before handing the audio to the processor.
    waveform = np.asarray(pcm_int16, dtype=np.float64) / INT16_FULL_SCALE

    # Extract features, generate token ids, decode to text.
    input_features = processor(waveform,
                               sampling_rate=sampling_rate,
                               return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    # batch_decode returns one string per batch item; there is exactly one item here.
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].lower()


if use_mic:
    has_audio_input = True

    def transcript_audio(indata, frames, time, status):
        """sounddevice callback: transcribe the chunk that was just recorded.

        Args:
            indata: Recorded block; sounddevice delivers it with shape (frames, channels).
            frames: Number of frames in ``indata`` (unused).
            time: Timing information supplied by sounddevice (unused).
            status: Flags reported by sounddevice for this block; printed when set.
        """
        global has_audio_input
        global predicted_transcript

        if status:
            print('Error:', status)

        # Keep only the single recorded channel. Passing the 2-D (frames, 1) block on
        # unchanged would make the processor interpret it as a batch of one-sample clips.
        mono = indata[:, 0]

        # To stop if no audio is heard: a chunk counts as silent only when every sample
        # is zero (checking the maximum alone would misread a negative-only chunk).
        has_audio_input = bool(np.any(mono))

        # NOTE(review): inference runs inside the audio callback; if it takes longer than
        # one chunk lasts, sounddevice may report input overflows through `status`.
        transcription = transcribe_chunk(mono, sample_rate)

        # Store transcription
        predicted_transcript = f"{predicted_transcript} {transcription}"
        print(transcription)

    # Stop with Ctrl + C after speaking the sentence given below. (if it doesn't automatically stop)
    ground_truth = r""" In a bustling city, a young artist found solace in her attic studio, 
                        expressing her imagination through vibrant art. Her creations became 
                        a sanctuary, touching hearts and inspiring others. Amidst the chaos, 
                        she found peace and purpose in her creative haven.""".lower()

    # Input audio using the microphone stream
    with sd.InputStream(callback=transcript_audio, channels=1, samplerate=sample_rate,
                        blocksize=chunk_size, dtype='int16'):
        try:
            # Sleep between checks instead of spinning, so the wait does not burn a CPU core.
            while has_audio_input:
                sd.sleep(100)
        except KeyboardInterrupt:
            # Ctrl + C is the documented manual stop; fall through so the
            # complete transcription below is still printed.
            pass
        print("stream ended")

else:
    # To test on colab, I used a wave file
    ground_truth = "in the course of a december tour in yorkshire i rode for a long distance in one of the public coaches on the day preceding christmas"

    # Open file (closed again automatically), validate its format and read it chunk by chunk
    with wave.open('speech.wav', 'rb') as wav_file:
        wav_rate = wav_file.getframerate()

        # The raw frames are decoded as int16 mono below; anything else would be misread silently.
        if wav_file.getsampwidth() != 2 or wav_file.getnchannels() != 1:
            raise ValueError("speech.wav must be 16-bit mono PCM")
        # Checked explicitly instead of overwriting the shared `sample_rate` setting.
        if wav_rate != sample_rate:
            raise ValueError(f"speech.wav is sampled at {wav_rate} Hz, expected {sample_rate} Hz")

        # Read chunk size (simulating our requirement of infinite stream);
        # readframes() returns b"" once the file is exhausted, which ends the loop.
        for frames in iter(lambda: wav_file.readframes(chunk_size), b""):
            transcription = transcribe_chunk(np.frombuffer(frames, dtype=np.int16), wav_rate)

            predicted_transcript += transcription
            print(transcription)
print("Complete transcription: \n" + predicted_transcript)

 in the course of a december tour in york
 i rode for a long distance.
 in one of the public coaches on the day.
 day preceding christmas.
 you
Complete transcription: 
 in the course of a december tour in york i rode for a long distance. in one of the public coaches on the day. day preceding christmas. you


In [None]:
#@title Evaluation

import jiwer

# Score the hypothesis (predicted_transcript) against the reference (ground_truth).
# With H = hits, S = substitutions, D = deletions, I = insertions:
#   WER = (S + D + I) / number of reference words
#   MER = (S + D + I) / (H + S + D + I)
#   WIL = 1 - H^2 / (reference words * hypothesis words)
# No text normalisation is applied here, so punctuation attached to a word
# (e.g. "distance." vs "distance") counts as a mismatch.
wer_error, mer_error, wil_error = (
    measure(ground_truth, predicted_transcript)
    for measure in (jiwer.wer, jiwer.mer, jiwer.wil)
)

# One line per metric, lower is better for all three.
print(
    f"Word Error Rate (WER): {wer_error}",
    f"Match Error Rate (MER): {mer_error}",
    f"Word Information Lost (WIL): {wil_error}",
    sep="\n",
)

Word Error Rate (WER): 0.19230769230769232
Match Error Rate (MER): 0.17857142857142858
Word Information Lost (WIL): 0.2733516483516484


In [None]:
#@title Question Answering

from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch

# Load question-answering model from HuggingFace.
# It is bound to `qa_model` rather than `model` so the Whisper `model` loaded earlier is not
# overwritten and the transcription cell can still be re-run after this one.
qa_checkpoint = 'distilbert-base-uncased-distilled-squad'
tokenizer = DistilBertTokenizer.from_pretrained(qa_checkpoint)
qa_model = DistilBertForQuestionAnswering.from_pretrained(qa_checkpoint)

# Question
question = "when is the story taking place?"

# Pass in question along with context (the complete transcription).
# truncation="only_second" shortens only the context, never the question, when the pair
# would exceed the model's maximum input length; short transcripts are unaffected.
inputs = tokenizer(question, predicted_transcript, return_tensors="pt", truncation="only_second")
with torch.no_grad():
    outputs = qa_model(**inputs)

# Identify start and end index (for answer): the most likely token position for each
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits)

# Slice the answer span (end index inclusive) out of the input ids and decode it back to text.
# An end index before the start index yields an empty span, i.e. an empty answer.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'day preceding christmas'