# Evaluate the Whisper model on a custom dataset: English transcription, non-English language detection and transcription, and any-to-English translation

https://github.com/openai/whisper

The dataset contains both clips with speech and clips without any speech.

# Install and import packages

In [1]:
!pip install pyloudnorm
!pip install pydub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyloudnorm
  Downloading pyloudnorm-0.1.1-py3-none-any.whl (9.6 kB)
Installing collected packages: pyloudnorm
Successfully installed pyloudnorm-0.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
%matplotlib inline
import librosa.display
from IPython.display import Audio
import pandas as pd
import os
import glob

# import splitfolders
import skimage.io
import pydub

import librosa as lr


# Install Whisper

In [3]:
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-d60xso_o
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-d60xso_o
  Resolved https://github.com/openai/whisper.git to commit 5c1a8c10e762bf9c29fcf6b3e40f17bc8ab09864
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers>=4.19.0
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python==0.2.0
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import whisper

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Define parameters and Load Whisper models

In [6]:
audioFilePath_NoFall = '/content/drive/My Drive/Whisper_Test/NoFall'

In [7]:
model_tiny = whisper.load_model("tiny") #Use Tiny whisper model

100%|██████████████████████████████████████| 72.1M/72.1M [00:00<00:00, 106MiB/s]


In [8]:
model_medium = whisper.load_model("medium") #Use Medium whisper model

100%|██████████████████████████████████████| 1.42G/1.42G [00:10<00:00, 149MiB/s]


In [9]:
model_large = whisper.load_model("large") #Use Large whisper model

# Convert audio files from .wav to .mp3
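The evaluation below feeds Whisper `.mp3` files. The original dataset was entirely in `.wav` format, so it is converted first. (Whisper decodes audio through ffmpeg, so `.wav` input should generally work as well.)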

In [None]:
def Convert_wav_2_mp3(rootdir):
  """Convert every ``.wav`` file one level below ``rootdir`` to ``.mp3``.

  Each immediate sub-directory of ``rootdir`` is treated as a category
  folder: its path is printed, and every file in it whose name ends with
  ``.wav`` (case-sensitive) is re-encoded with pydub. The ``.mp3`` is written
  next to the source file under the same base name. Files directly inside
  ``rootdir`` and files in deeper sub-directories are not visited.

  Args:
    rootdir: Path of the directory that holds the category folders.

  Returns:
    None. The converted files on disk are the only result.
  """
  for entry in os.scandir(rootdir):
    if not entry.is_dir():
      continue

    category_dir = entry.path
    print(category_dir)

    # os.listdir already yields str names, so no encode/decode round trip is needed
    for filename in os.listdir(category_dir):
      if not filename.endswith(".wav"):
        continue

      audio_path = os.path.join(category_dir, filename)
      mp3_file = os.path.splitext(filename)[0] + '.mp3'  # same base name, .mp3 extension
      path_to_save = os.path.join(category_dir, mp3_file)

      sound = pydub.AudioSegment.from_wav(audio_path)
      # export() returns the output file object still open; close it so a
      # large dataset does not leak one file handle per converted clip.
      exported = sound.export(path_to_save, format="mp3")
      exported.close()

In [None]:
#Convert_wav_2_mp3(audioFilePath_Fall)

In [None]:
#Convert_wav_2_mp3(audioFilePath_NoFall)

# **1. Evaluate the whisper model for English transcription**

Key function for transcription

In [None]:
def transcribe_english(audio_path, whisper_model):
  """Transcribe one audio input as English and return only the text.

  Args:
    audio_path: The audio input handed straight to ``whisper_model.transcribe``.
    whisper_model: Object exposing a ``transcribe(audio, **options)`` method
      whose result can be indexed with ``"text"``.

  Returns:
    The value stored under ``"text"`` in the transcription result.
  """
  # Force English transcription and disable half precision
  transcription = whisper_model.transcribe(
      audio_path,
      fp16=False,
      language="en",
      task="transcribe",
  )
  return transcription["text"]

*   Read the mp3 files
*   Transcribe using the Whisper model
*   Read the actual text labels for the .mp3 files




In [None]:
def ReadAudios_andTranscribe(rootdir, whisper_model):
  """Transcribe every ``.mp3`` one level below ``rootdir`` and read its label.

  Each immediate sub-directory of ``rootdir`` is a category folder. For every
  ``<name>.mp3`` in it, the clip is transcribed with ``transcribe_english`` and
  the lines of the sibling ``<name>.txt`` file are read as the reference text.

  Folders and files are visited in sorted order, so two calls over the same
  directory tree always return their results in the same order. This matters
  when predictions from different models are compared against one label list.

  Args:
    rootdir: Path of the directory that holds the category folders.
    whisper_model: Model object forwarded to ``transcribe_english``.

  Returns:
    A tuple ``(transcript_all, ActualText_all)`` of equal-length lists:
    the predicted text per clip, and the list of lines (as returned by
    ``readlines()``) of the matching label file. An empty label file yields
    an empty list.

  Raises:
    FileNotFoundError: If an ``.mp3`` has no sibling ``.txt`` label file.
  """
  transcript_all = []
  ActualText_all = []

  # Materialise and sort the category folders; the context manager closes the
  # scandir iterator deterministically.
  with os.scandir(rootdir) as entries:
    category_dirs = sorted(entry.path for entry in entries if entry.is_dir())

  for category_dir in category_dirs:
    for filename in sorted(os.listdir(category_dir)):
      if not filename.endswith(".mp3"):
        continue

      audio_path = os.path.join(category_dir, filename)
      text_file = os.path.join(category_dir, os.path.splitext(filename)[0] + '.txt')

      # Prediction from the Whisper model
      transcript_all.append(transcribe_english(audio_path, whisper_model))

      # Ground-truth text that accompanies this clip
      with open(text_file) as f:
        ActualText_all.append(f.readlines())

  return transcript_all, ActualText_all


# 1a. Evaluate the '**tiny**' Whisper model for English transcription

In [None]:
# Run the tiny model over every .mp3 under the NoFall directory; returns the
# predicted texts and, per clip, the raw lines of its label file.
transcripts_all_NoFall, Actual_text_all_NoFall = ReadAudios_andTranscribe(audioFilePath_NoFall, model_tiny)

In [None]:
print(len(transcripts_all_NoFall))

6


In [None]:
# Flatten the per-clip label lists into one list of strings: keep the first
# line of each label file, or an empty string when the file had no lines.
Actual_text_NoFall = [
    label_lines[0] if label_lines else ''
    for label_lines in Actual_text_all_NoFall
]

## Calculate the Word Error Rate (WER)

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer
normalizer = EnglishTextNormalizer()

In [None]:
# One row per clip: the model's prediction next to the reference label
dataNoFall = pd.DataFrame({
    "whisper_prediction": transcripts_all_NoFall,
    "Actual_Text": Actual_text_NoFall,
})
dataNoFall

Unnamed: 0,whisper_prediction,Actual_Text
0,,
1,,
2,,
3,"Hi, good morning. Hello. Good morning. Good m...",hi how are you hello good morning
4,"Hi, very good bed. Hey, very good bed.",hi today is a good day hey today is a good day
5,What are you doing?,bafou what are you doing here


In [None]:
# Pass predictions and labels through the same text normalizer so both sides
# of the comparison are cleaned identically
dataNoFall["whisper_prediction_clean"] = dataNoFall["whisper_prediction"].map(normalizer)
dataNoFall["Actual_Text_clean"] = dataNoFall["Actual_Text"].map(normalizer)
dataNoFall

Unnamed: 0,whisper_prediction,Actual_Text,whisper_prediction_clean,Actual_Text_clean
0,,,,
1,,,,
2,,,,
3,"Hi, good morning. Hello. Good morning. Good m...",hi how are you hello good morning,hi good morning hello good morning good morning,hi how are you hello good morning
4,"Hi, very good bed. Hey, very good bed.",hi today is a good day hey today is a good day,hi very good bed hey very good bed,hi today is a good day hey today is a good day
5,What are you doing?,bafou what are you doing here,what are you doing,bafou what are you doing here


In [None]:
# Find the rows whose reference text is empty (audio with no speech content
# in it); their index labels are removed in the next step
idx_2_remove = dataNoFall.loc[dataNoFall['Actual_Text_clean'].eq('')].index.values
print(idx_2_remove)

[0 1 2]


In [None]:
# Drop the no-speech rows (by index label) from the frame in place
dataNoFall.drop(index=idx_2_remove, inplace=True)

In [None]:
# Word error rate of the cleaned predictions against the cleaned labels
reference_texts = dataNoFall["Actual_Text_clean"].tolist()
predicted_texts = dataNoFall["whisper_prediction_clean"].tolist()

wer = jiwer.wer(reference_texts, predicted_texts)
print(f"WER: {wer * 100:.2f} %")

WER: 60.00 %


# 1b. Evaluate the '**medium**' Whisper model for English transcription

In [None]:
# Run the medium model over the same NoFall directory; returns the predicted
# texts and, per clip, the raw lines of its label file.
transcripts_all_medium, Actual_text_all_medium = ReadAudios_andTranscribe(audioFilePath_NoFall, model_medium)

## Calculate the Word Error Rate (WER)

In [None]:
# Build the reference labels from the lists returned by *this* (medium) run,
# so they are guaranteed to line up with transcripts_all_medium. Keep the first
# line of each label file, or '' when the file had no lines.
Actual_text_medium = [
    label_lines[0] if label_lines else ''
    for label_lines in Actual_text_all_medium
]

# One row per clip: the medium model's prediction next to its reference label
dataNoFall = pd.DataFrame(dict(whisper_prediction=transcripts_all_medium, Actual_Text=Actual_text_medium))
dataNoFall

Unnamed: 0,whisper_prediction,Actual_Text
0,,
1,,
2,,
3,"Hi, good morning. Hello, good morning.",hi how are you hello good morning
4,"Hi, today is a good day. Hi, today is a good ...",hi today is a good day hey today is a good day
5,What are you doing?,bafou what are you doing here


In [None]:
# Clean predictions and labels with the same normalizer before scoring
dataNoFall["whisper_prediction_clean"] = list(map(normalizer, dataNoFall["whisper_prediction"]))
dataNoFall["Actual_Text_clean"] = list(map(normalizer, dataNoFall["Actual_Text"]))
dataNoFall

Unnamed: 0,whisper_prediction,Actual_Text,whisper_prediction_clean,Actual_Text_clean
0,,,,
1,,,,
2,,,,
3,"Hi, good morning. Hello, good morning.",hi how are you hello good morning,hi good morning hello good morning,hi how are you hello good morning
4,"Hi, today is a good day. Hi, today is a good ...",hi today is a good day hey today is a good day,hi today is a good day hi today is a good day,hi today is a good day hey today is a good day
5,What are you doing?,bafou what are you doing here,what are you doing,bafou what are you doing here


In [None]:
# Locate the rows whose reference text is empty (audio with no speech content
# in it); their index labels are removed in the next step
idx_2_remove = dataNoFall.loc[dataNoFall['Actual_Text_clean'].eq('')].index.values
print(idx_2_remove)

[0 1 2]


In [None]:
# Remove the no-speech rows (by index label) from the frame in place
dataNoFall.drop(index=idx_2_remove, inplace=True)

In [None]:
# Word error rate of the medium model: cleaned labels vs. cleaned predictions
reference_texts = dataNoFall["Actual_Text_clean"].tolist()
predicted_texts = dataNoFall["whisper_prediction_clean"].tolist()

wer = jiwer.wer(reference_texts, predicted_texts)
print(f"WER: {wer * 100:.2f} %")

WER: 24.00 %


# **2 and 3. Evaluate the Whisper model for Non-English language detection and transcription**


*   Punjabi
*   Hindi


In [None]:
# Convert the Punjabi sample from .wav to .mp3
audioPathPunjabi = '/content/drive/My Drive/Whisper_Test/punjabi.wav'
sound = pydub.AudioSegment.from_wav(audioPathPunjabi)
# export() returns the output file object still open; close it right away so
# the handle is not leaked (and the cell no longer echoes the file repr).
sound.export("/content/drive/My Drive/Whisper_Test/punjabi.mp3", format="mp3").close()

<_io.BufferedRandom name='/content/drive/My Drive/Whisper_Test/punjabi.mp3'>

In [9]:
# Load the three Punjabi clips with whisper.load_audio.
# NOTE(review): only punjabi.mp3 is produced in this notebook; punjabiSlow.mp3
# and punjabiNormal.mp3 are assumed to already exist on Drive.
audioPunjabi = whisper.load_audio("/content/drive/My Drive/Whisper_Test/punjabi.mp3")
audioPunjabiSlow = whisper.load_audio("/content/drive/My Drive/Whisper_Test/punjabiSlow.mp3")
audioPunjabiNormal = whisper.load_audio("/content/drive/My Drive/Whisper_Test/punjabiNormal.mp3")

# Load the Hindi clip (also assumed to already exist on Drive)
audioHindi = whisper.load_audio("/content/drive/My Drive/Whisper_Test/hindi.mp3")

In [10]:
# Pass every clip through whisper.pad_or_trim before computing spectrograms.
# NOTE(review): presumably this forces the fixed clip length the model
# expects — confirm against the Whisper documentation.
audioPunjabi = whisper.pad_or_trim(audioPunjabi)  # Required: without it, decoding fails with an 'incorrect audio shape' error
audioPunjabiSlow = whisper.pad_or_trim(audioPunjabiSlow)
audioPunjabiNormal = whisper.pad_or_trim(audioPunjabiNormal)

audioHindi = whisper.pad_or_trim(audioHindi) 


In [None]:
# Log-Mel spectrogram of the Punjabi clip, placed on the large model's device
mel = whisper.log_mel_spectrogram(audioPunjabi)
mel = mel.to(model_large.device)

# Language identification: the second returned item holds the per-language
# probabilities; report the most probable one
probs = model_large.detect_language(mel)[1]
print(f"Detected language: {max(probs, key=probs.get)}")

# Decode with default options and show the recognized text
options = whisper.DecodingOptions()
result = whisper.decode(model_large, mel, options)
print(result.text)

Detected language: pa
ਸਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ ਸਸ


In [None]:
# Log-Mel spectrogram of the Hindi clip, placed on the large model's device
mel = whisper.log_mel_spectrogram(audioHindi)
mel = mel.to(model_large.device)

# Language identification: the second returned item holds the per-language
# probabilities; report the most probable one
probs = model_large.detect_language(mel)[1]
print(f"Detected language: {max(probs, key=probs.get)}")

# Decode with default options and show the recognized text
options = whisper.DecodingOptions()
result = whisper.decode(model_large, mel, options)
print(result.text)

Detected language: hi
उमीद है कि आपको ये विडियो पसंद आई।


In [None]:
# Log-Mel spectrogram of the slow Punjabi clip, placed on the large model's device
mel = whisper.log_mel_spectrogram(audioPunjabiSlow)
mel = mel.to(model_large.device)

# Language identification: the second returned item holds the per-language
# probabilities; report the most probable one
probs = model_large.detect_language(mel)[1]
print(f"Detected language: {max(probs, key=probs.get)}")

# Decode with default options and show the recognized text
options = whisper.DecodingOptions()
result = whisper.decode(model_large, mel, options)
print(result.text)

Detected language: hi
तुसी की में हो।


In [None]:
# Log-Mel spectrogram of the normal-speed Punjabi clip, placed on the large model's device
mel = whisper.log_mel_spectrogram(audioPunjabiNormal)
mel = mel.to(model_large.device)

# Language identification: the second returned item holds the per-language
# probabilities; report the most probable one
probs = model_large.detect_language(mel)[1]
print(f"Detected language: {max(probs, key=probs.get)}")

# Decode with default options and show the recognized text
options = whisper.DecodingOptions()
result = whisper.decode(model_large, mel, options)
print(result.text)

Detected language: hi
तुसी की मे हो?


# **4. Evaluate the Whisper model for Any to English translation**



*   Punjabi to English
*   Hindi to English




In [None]:
# Translate the Punjabi clip to English with the large model (language forced to "pa")
options = dict(fp16=False, language="pa", task="translate")
result = model_large.transcribe(audioPunjabi, **options)
print(result)

{'text': ' I hope you liked this video.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 4.0, 'text': ' I hope you liked this video.', 'tokens': [50364, 286, 1454, 291, 4501, 341, 960, 13, 50564], 'temperature': 0.0, 'avg_logprob': -0.5375502586364747, 'compression_ratio': 0.7777777777777778, 'no_speech_prob': 0.005461953114718199}], 'language': 'pa'}


In [None]:
# Translate the slow Punjabi clip to English with the large model (language forced to "pa")
# NOTE(review): the saved output for this cell reports 'language': 'hi', which
# does not match language="pa" here — it looks like a stale output from an
# earlier version of the cell; re-run to refresh it.
options = dict(fp16=False, language="pa", task="translate")
result = model_large.transcribe(audioPunjabiSlow, **options)
print(result)

{'text': ' Tusi kime ho?', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.54, 'text': ' Tusi kime ho?', 'tokens': [50364, 314, 33016, 350, 1312, 1106, 30, 50491], 'temperature': 0.0, 'avg_logprob': -0.5609087944030762, 'compression_ratio': 0.6190476190476191, 'no_speech_prob': 0.08426279574632645}], 'language': 'hi'}


In [None]:
# Translate the normal-speed Punjabi clip to English with the large model (language forced to "pa")
options = dict(fp16=False, language="pa", task="translate")
result = model_large.transcribe(audioPunjabiNormal, **options)
print(result)

{'text': ' Tusi, kime ho?', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.0, 'text': ' Tusi, kime ho?', 'tokens': [50364, 314, 33016, 11, 350, 1312, 1106, 30, 50464], 'temperature': 0.0, 'avg_logprob': -0.5168148040771484, 'compression_ratio': 0.6363636363636364, 'no_speech_prob': 0.09985967725515366}], 'language': 'pa'}


In [None]:
# Translate the Hindi clip to English with the large model (language forced to "hi")
options = dict(fp16=False, language="hi", task="translate")
result = model_large.transcribe(audioHindi, **options)
print(result)

{'text': ' I hope you liked this video.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 4.0, 'text': ' I hope you liked this video.', 'tokens': [50364, 286, 1454, 291, 4501, 341, 960, 13, 50564], 'temperature': 0.0, 'avg_logprob': -0.6076899528503418, 'compression_ratio': 0.7777777777777778, 'no_speech_prob': 0.06783635914325714}], 'language': 'hi'}
