# Automatic Speech Recognition using Whisper

### Importing necessary libraries

In [1]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset
import pandas as pd
import whisper
import torchaudio
from tqdm.notebook import tqdm,trange
import wavfile
import io
import jiwer

In [2]:
# Pick the compute device: CUDA when PyTorch reports it available, CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
device

device(type='cpu')

### Data Preprocessing

In [None]:
# Root of the Kathbath Hindi data. Defaults to the original local location; set the
# KATHBATH_HINDI_DIR environment variable to run the notebook on another machine.
kathbath_hindi_dir = os.environ.get(
    "KATHBATH_HINDI_DIR", "/Users/vannshjani/Downloads/kathbath/hindi"
)
# The trailing "" keeps the trailing path separator the original literals had.
test_path = os.path.join(kathbath_hindi_dir, "test", "")
test_known_path = os.path.join(kathbath_hindi_dir, "test_known", "")
folder_test = os.listdir(test_path)
folder_test_known = os.listdir(test_known_path)
# Two separate statements: `print(a), print(b)` builds the tuple (None, None),
# which the notebook would echo as the cell's output.
print(folder_test)
print(folder_test_known)

In [None]:
# Pick the audio directory name and (presumably) the transcript entry out of the
# test-folder listing by position.
# NOTE(review): os.listdir() returns entries in arbitrary order, so these fixed
# positions may not refer to the same entries on another machine — confirm.
audio_files, transcript_files = folder_test[2], folder_test[3]


In [None]:
# Build a lookup from audio-file stem (the part of the name before the first ".")
# to its reference text.
# Each line of transcription.txt is expected to look like "<name>.<ext>\t<text>"
# (inferred from the dot-then-tab splitting this cell originally used — TODO confirm).
transcript_dict = {}
# The transcripts are Hindi text: read explicitly as UTF-8 instead of relying on
# the platform's default encoding.
with open(os.path.join(test_path, 'transcription.txt'), 'r', encoding='utf-8') as file:
    for line in file:
        line = line.rstrip("\n")
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing empty line
        fields = line.split("\t")
        if len(fields) < 2:
            raise ValueError(f"Malformed transcription line: {line!r}")
        # Split on the tab first so a "." inside the sentence cannot break parsing.
        # The key uses the same `split(".")[0]` stem as the later file-name lookup.
        audio_file_name = fields[0].split(".")[0]
        transcript_dict[audio_file_name] = fields[1]


In [None]:
# Sanity check: number of entries parsed into the transcript lookup.
len(transcript_dict)

In [None]:
# List the entries of the audio directory chosen from the test-folder listing.
audio_folder_path = os.path.join(test_path,audio_files)
audio_folder = os.listdir(audio_folder_path)
print(audio_folder)

In [None]:
# Collect the file names found one level below the audio directory.
all_auido_files = []
# sorted() makes the resulting order reproducible; os.listdir() order is arbitrary.
for sub_fold in sorted(audio_folder):
    audio_file_path = os.path.join(audio_folder_path, sub_fold)
    # Skip anything that is not a sub-directory (e.g. macOS ".DS_Store"), since
    # os.listdir() below cannot descend into a plain file.
    if not os.path.isdir(audio_file_path):
        continue
    sub_folder_files = sorted(os.listdir(audio_file_path))
    # Hidden entries such as ".DS_Store" are not audio and have no transcript.
    all_auido_files.extend(name for name in sub_folder_files if not name.startswith("."))

print(len(all_auido_files))

In [None]:
# Look up the reference text of every audio file, keyed by the part of the file
# name before the first ".".
transcript = [transcript_dict[file_name.split(".")[0]] for file_name in all_auido_files]

len(transcript)

In [None]:
# Pair each audio file name with its reference text, one row per file.
rows = list(zip(all_auido_files, transcript))
df = pd.DataFrame(rows, columns=['audio', 'transcript'])
df.head()

In [None]:
# Save the audio-name / transcript table to disk without the index column.
df.to_csv("test_asr.csv", index=False)

In [None]:
class KathbathDataset(Dataset):
    """Dataset wrapper around a DataFrame of (audio file name, transcript) rows.

    Items are returned as plain values taken from the frame; no audio is loaded here.
    """

    def __init__(self, df, audio_folder_path, sr=16000):
        """Store the frame, the audio directory and the sample rate.

        Args:
            df: DataFrame whose first column holds audio file names and whose
                second column holds the matching transcripts.
            audio_folder_path: Directory of the audio files (stored only).
            sr: Sample rate (stored only); defaults to 16000.
        """
        self.sr = sr
        self.audio_folder_path = audio_folder_path
        self.df = df

    def __len__(self):
        """Return the number of rows in the frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(audio_file, transcript)`` for row ``idx``.

        Both values are read by position: column 0, then column 1.
        """
        audio_file, transcript = (self.df.iloc[idx, column] for column in (0, 1))
        return audio_file, transcript

In [None]:
# Directory whose files are addressed directly by name when transcribing.
# Derived from test_path so the data location is configured in one place; with the
# original test_path this is "/Users/vannshjani/Downloads/kathbath/hindi/test/audio_files/".
# The trailing "" keeps the trailing path separator the original literal had.
audio_file_path_new = os.path.join(test_path, "audio_files", "")
print(audio_file_path_new)

In [None]:
# Wrap the table in the dataset; the loader is configured for batches of 16 items.
# NOTE(review): `dataloader` is not referenced by the later cells shown here.
dataset = KathbathDataset(df, audio_file_path_new)
dataloader = DataLoader(dataset, batch_size=16)

In [None]:
# Peek at the first (audio file name, transcript) pair.
dataset[0]

In [None]:
# Load the "medium" Whisper checkpoint and report its `is_multilingual` flag.
model = whisper.load_model("medium")
print(model.is_multilingual)

In [None]:
# Transcribe every audio file and keep the result next to its reference text.
# `translations` is created but left empty: no translation pass is run here.
references, transcriptions, translations = [], [], []

for audio, text in tqdm(dataset):
    # The dataset yields bare file names; resolve them against the audio directory.
    result = model.transcribe(os.path.join(audio_file_path_new, audio), fp16=False)
    transcriptions.append(result["text"])
    references.append(text)

In [None]:
# Side-by-side table of reference text and Whisper output, one row per audio file.
data = pd.DataFrame({"reference": references, "transcription": transcriptions})
data

In [None]:
# Persist the reference / transcription table without the index column.
data.to_csv("test_asr_results.csv", index=False)

In [4]:
# Reload the saved results from disk and preview the first rows.
results = pd.read_csv("test_asr_results.csv")
results.head()

Unnamed: 0,reference,transcription
0,हालांकि प्यार मोहब्बत के मामले में आपको जल्दबा...,हाला के प्यार महवबत के मामले में आपको जल्दवाज...
1,जिसके चलते बाजार भी गिफ्ट सेंटरों से सजे हैं।,जिसके चलते पाजार भी गिफ्ट सेंटरों से सजे हैं।
2,हालांकि कारोबारियों के पास ऑनलाइन एडवांस टैक्स...,हाला कि कारोबारीयों के पास आउनलाइन एडवान्स टै...
3,उनका पालन करते हुए नाकों पर ट्रैफिक पुलिस अब म...,उनका पालन करते हुए नाकों पर ट्रैफिक पॉलेस अप ...
4,आपकी आर्थिक स्थिति मजबूत बनी रहेगी और आप निवेश...,आपकी आर्थिक इस्तिति मजबूत बनी रहे की और आप नि...


### Calculating Word Error Rate (WER)

In [5]:
# Plain Python lists of reference and hypothesis strings for scoring.
ref_list = results['reference'].tolist()
# An empty transcription is written to CSV as an empty field, which read_csv
# loads back as NaN (a float); restore it to "" so every hypothesis is a string.
hyp_list = results['transcription'].fillna("").astype(str).tolist()
len(ref_list), len(hyp_list)

(1929, 1929)

In [6]:
# Word error rate over all pairs: reference texts first, hypotheses second.
# NOTE(review): the strings are scored as stored — no punctuation or spelling
# normalization is applied in the cells shown, which may inflate the figure; confirm intent.
error = jiwer.wer(ref_list, hyp_list)
error

0.5181074508069865