# Install Libraries

In [1]:
pip install -q gdown datasets transformers accelerate soundfile librosa evaluate jiwer tensorboard gradio chardet

Note: you may need to restart the kernel to use updated packages.


## Download files from Google Drive and unzip

In [None]:
!gdown --id 1FxcRZIYCYIQUrfZbTYBo3y0-V3up7_z0
!gdown --id 1PGplQ-PZ0C-J3B8sZI41jN2Lfewqt2Xt

In [None]:
# Extract the two zip archives into the current working directory.
# NOTE(review): the first path is absolute (/kaggle/working) while the second is
# relative — they only point at the same folder when the notebook runs from
# /kaggle/working; confirm before running elsewhere.
!unzip /kaggle/working/Training-20231114T063007Z-001.zip
!unzip Transcripts-20231215T113721Z-001.zip

### Check for GPU availability

In [None]:
# Run `nvidia-smi` in a shell and capture its output lines into `gpu_info`.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The word 'failed' in the tool's output is used as the "no GPU" signal;
# otherwise the full nvidia-smi report is printed.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# Import Libraries

In [2]:
import os
import numpy as np
import pandas as pd
import chardet
import datasets
import librosa
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor,pipeline
import gradio as gr

2024-02-26 08:17:37.637954: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-26 08:17:37.638052: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-26 08:17:37.767012: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Hugging Face Login

In [34]:
from huggingface_hub import notebook_login
# Show the interactive Hugging Face login prompt. The training setup further
# down uses `push_to_hub=True`, so run this cell before creating the trainer.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Import processor, tokenizer and feature extractor

In [12]:
# The processor, tokenizer and feature extractor are all loaded from the same
# base checkpoint, each with `task="transcribe"`.
WHISPER_CHECKPOINT = "openai/whisper-medium"

processor, tokenizer, feature_extractor = (
    loader.from_pretrained(WHISPER_CHECKPOINT, task="transcribe")
    for loader in (WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor)
)

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

### Testing the tokenizer

In [5]:
input_str = "வயது 60 சக்கரைக்கு செக்கப் பண்ண வந்திருக்கேன்பா டாக்டர் எப்பவருவாரு அது என்னா செய்யனும் முதல்ல டெஸ்ட் எடுக்கனுமா? எடுத்துச்சாட்டும் எவ்வளவு நேரம் கழிச்சு வரனும்"

# Encode the sample sentence to token ids, then decode it twice: once keeping
# the special tokens and once dropping them, so the two strings can be compared.
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

## Prepare the training data

In [None]:
# Build the in-memory training set: one dict per utterance holding the raw
# transcript, the 16 kHz waveform, the Whisper input features and the label ids.
path_1 = "/content/gdrive/My Drive/VIT/Tamil ASR/Train/Audio"
path_2 = "/content/gdrive/My Drive/VIT/Tamil ASR/Train/Transcripts"

list1 = sorted(os.listdir(path_1))
list2 = sorted(os.listdir(path_2))

# Audio and transcripts are paired purely by sorted position. If one side is
# missing a file, zip() would silently truncate and every later pair would be
# misaligned, so fail loudly instead.
if len(list1) != len(list2):
  raise ValueError(
      f"Audio/transcript count mismatch: {len(list1)} audio files vs {len(list2)} transcripts"
  )

target_sr = 16000  # sampling rate passed to the feature extractor below

audio_dataset = []

for file1, file2 in zip(list1, list2):
  audio_path = os.path.join(path_1, file1)
  transcript_path = os.path.join(path_2, file2)

  with open(transcript_path, 'rb') as file:
    file_content = file.read()

  # chardet may report None (e.g. empty file) or differently-cased names, so
  # normalise before comparing. ASCII is a subset of UTF-8, and BOM-prefixed
  # UTF-8 needs the "utf-8-sig" codec; anything else is treated as UTF-16,
  # as before.
  detected = (chardet.detect(file_content)['encoding'] or "").lower()
  if detected in ("utf-8", "ascii"):
    transcript = file_content.decode("utf-8")
  elif detected == "utf-8-sig":
    transcript = file_content.decode("utf-8-sig")
  else:
    transcript = file_content.decode("UTF-16")

  # Ask librosa for 16 kHz directly. With the default (sr=22050) the audio was
  # resampled to 22.05 kHz on load and then a second time to 16 kHz.
  audio_data, sampling_rate = librosa.load(audio_path, sr=target_sr)

  input_features = feature_extractor(audio_data, sampling_rate=target_sr).input_features[0]

  audio_dataset.append({
      "transcript": transcript,
      "audio_data": audio_data,
      "input_features": input_features,
      "labels": tokenizer(transcript).input_ids,
  })

In [None]:
print(audio_dataset[0])

### Save the training data

In [None]:
#Save the data as a pickle file
import pickle
file_path = "/content/gdrive/My Drive/VIT/Tamil ASR/Train/data2.pkl"

# Serialise the whole list of utterance dicts into a single pickle file.
with open(file_path, mode='wb') as pickle_out:
    pickle.dump(audio_dataset, pickle_out)

## Load the training data

In [13]:
import pickle
# Pre-processed male / female pickles from the two Kaggle input datasets.
file_path1 = "/kaggle/input/openslr/male_data.pkl"
file_path2 = "/kaggle/input/openslr/female_data.pkl"
file_path3 = "/kaggle/input/openslr-malayalam/male_data.pkl"
file_path4 = "/kaggle/input/openslr-malayalam/female_data.pkl"


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


audio_dataset1, audio_dataset2, audio_dataset3, audio_dataset4 = map(
    _read_pickle, (file_path1, file_path2, file_path3, file_path4)
)

In [16]:
print(len(audio_dataset1),len(audio_dataset2),len(audio_dataset3),len(audio_dataset4))

# Each source list is cut into three contiguous slices: train | eval | test.
# Datasets 1/2 come from the "openslr" input and 3/4 from "openslr-malayalam"
# (presumably Tamil and Malayalam respectively — confirm against the data).
train_1, eval_1, test_1 = audio_dataset1[:1200], audio_dataset1[1200:1600], audio_dataset1[1600:]
train_2, eval_2, test_2 = audio_dataset2[:1400], audio_dataset2[1400:2000], audio_dataset2[2000:]

train_3, eval_3, test_3 = audio_dataset3[:1200], audio_dataset3[1200:1700], audio_dataset3[1700:]
train_4, eval_4, test_4 = audio_dataset4[:1200], audio_dataset4[1200:1700], audio_dataset4[1700:]

print(len(test_1),len(test_2),len(train_1),len(train_2),len(eval_1),len(eval_2))
print(len(test_3),len(test_4),len(train_3),len(train_4),len(eval_3),len(eval_4))

# Layout of the combined list: all four training slices first, then all four
# evaluation slices. The test slices are kept out of it.
training_part = train_1 + train_3 + train_2 + train_4
evaluation_part = eval_1 + eval_3 + eval_2 + eval_4
audio_dataset = training_part + evaluation_part

print(len(audio_dataset))

1956 2335 2023 2103
356 335 1200 1400 400 600
323 403 1200 1200 500 500
7000


In [18]:
# Remove utterances whose transcript is empty.
# A new list is built instead of calling `audio_dataset.pop(i)` inside a
# `range(len(audio_dataset))` loop: popping while iterating shifts the remaining
# indices, so items get skipped and the loop eventually raises IndexError.
empty_indices = [i for i, sample in enumerate(audio_dataset) if len(sample['transcript']) == 0]
print(empty_indices)

# NOTE(review): removing items shifts every later position, so any fixed index
# used to separate training from evaluation data moves by len(empty_indices).
audio_dataset = [sample for sample in audio_dataset if len(sample['transcript']) > 0]

print(len(audio_dataset))

6994


## Data collator

In [28]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech-to-text samples into one padded batch.

    Each sample is a mapping with an "input_features" entry and a "labels"
    entry. The two are padded independently, using the processor's feature
    extractor and tokenizer respectively.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each with a `pad` method.
    processor: Any

    def __call__(self, features) -> Dict[str, torch.Tensor]:
        """Return the padded batch with a "labels" entry added.

        Padding positions in the labels are set to -100. If every row starts
        with the tokenizer's BOS id, that first column is removed.
        """
        extractor = self.processor.feature_extractor
        tok = self.processor.tokenizer

        audio_inputs = []
        token_inputs = []
        for sample in features:
            audio_inputs.append({"input_features": sample["input_features"]})
            token_inputs.append({"input_ids": sample["labels"]})

        batch = extractor.pad(audio_inputs, return_tensors="pt")
        padded_labels = tok.pad(token_inputs, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding.
        is_padding = padded_labels.attention_mask != 1
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # NOTE(review): the comparison is against `bos_token_id`; confirm that
        # the label sequences really begin with that id, otherwise this branch
        # never fires and the start token stays in the labels.
        every_row_starts_with_bos = labels[:, 0] == tok.bos_token_id
        if bool(every_row_starts_with_bos.all()):
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [29]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## Compute Metrics (WER)

In [30]:
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    """Return the word error rate (in percent) for one evaluation pass.

    `pred` must expose `.predictions` and `.label_ids`. Note that
    `pred.label_ids` is modified in place: every -100 becomes the pad token id.
    """
    tok = processor.tokenizer
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # Swap the -100 entries for the pad token id before decoding
    reference_ids[reference_ids == -100] = tok.pad_token_id

    # we do not want to group tokens when computing the metrics
    hypotheses = tok.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tok.batch_decode(reference_ids, skip_special_tokens=True)

    score = metric.compute(predictions=hypotheses, references=references)
    return {"wer": 100 * score}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

# Model

In [31]:
from transformers import WhisperForConditionalGeneration
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

In [32]:
#model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="tamil", task="transcribe")
# Request decoder prompt ids for the "transcribe" task without naming a language
# (the Tamil-only variant is kept commented out above).
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe")
# Reset the model's suppress-token list to empty.
model.config.suppress_tokens = []

## Training Setup

In [37]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. With a per-device batch of 1 and 16 accumulation
# steps, each optimizer step covers 16 samples per device.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-medium-ta-ml",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=10,
    max_steps=100,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=1,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    # NOTE(review): evaluating every 2 steps over 100 steps means 50 full
    # generation-based evaluation passes — looks like a debugging value; confirm.
    eval_steps=2,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    # "wer" is the key returned by compute_metrics; lower is better.
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

from transformers import Seq2SeqTrainer

# `audio_dataset` is laid out as [training slices | evaluation slices]: the first
# 5000 items are train_1 + train_3 + train_2 + train_4 (1200 + 1200 + 1400 + 1200)
# and everything after that comes from the eval_* slices.
#
# The evaluation set is therefore taken from index 5000 onwards. The previous
# slice, [1197:2003], lay entirely inside the training range, so the WER used by
# `load_best_model_at_end` was measured on samples the model had trained on.
# NOTE(review): this evaluates on all held-out eval samples, which is slower per
# evaluation pass than the old 806-sample slice.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=audio_dataset[:5000],
    eval_dataset=audio_dataset[5000:],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

## Training

In [None]:
trainer.train()

processor.save_pretrained(training_args.output_dir)

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


In [20]:
# Metadata passed to `trainer.push_to_hub` as keyword arguments.
# NOTE(review): "model_name" is set to the relative output path rather than a
# plain display name — confirm this is what should appear on the model card.
kwargs = {
    "language": ["ta","ml"],
    "model_name": "./whisper-medium-ta-ml",
    "finetuned_from": "openai/whisper-medium",
    "tasks": "automatic-speech-recognition",
}

trainer.push_to_hub(**kwargs)

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


CommitInfo(commit_url='https://huggingface.co/sujith013/whisper-medium-ta-ml/commit/6c244eae7e6bf8e879fb34fbec67645998b143f5', commit_message='End of training', commit_description='', oid='6c244eae7e6bf8e879fb34fbec67645998b143f5', pr_url=None, pr_revision=None, pr_num=None)

# **Testing**

In [5]:
pipe = pipeline(model="sujith013/whisper-medium-ta-ml")

def transcribe(audio):
    """Run the ASR pipeline `pipe` on `audio` and return only its "text" field."""
    result = pipe(audio)
    return result["text"]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

In [10]:
import string
import numpy as np
import pandas as pd
import os
import chardet

def remove_punctuation(input_string):
    """Return `input_string` without ASCII punctuation, flattened to one line.

    Periods are first replaced by spaces, so words separated only by "." stay
    separate. Every remaining character in `string.punctuation` is deleted, and
    line breaks are replaced by single spaces.
    """
    spaced = input_string.replace(".", " ")
    stripped = spaced.translate(str.maketrans("", "", string.punctuation))
    return ' '.join(stripped.splitlines())

# Transcribe the held-out test slices and collect (prediction, reference) pairs
# with punctuation removed.
test_audio = test_1 + test_3 + test_2 + test_4

final_wer = 0
prs = []
trs = []

#os.makedirs("/kaggle/working/predictions")

# NOTE(review): the start offset 461 is hard-coded — presumably to resume an
# interrupted run; confirm before reusing this cell.
#
# Changes from the earlier version of this loop:
#  * `enumerate(..., start=1)` replaces a hand-maintained counter that was
#    initialised twice;
#  * the per-character special-symbol scan is gone: its `continue` was commented
#    out, so it had no effect, and it reused the outer loop variable `x`;
#  * the lists are no longer copied via `list(tuple(...))` on every iteration.
for count, sample in enumerate(test_audio[461:], start=1):
    print(count)

    pr = remove_punctuation(transcribe(sample['audio_data'])).strip()
    tr = remove_punctuation(sample['transcript']).strip()

    prs.append(pr)
    trs.append(tr)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [None]:
import string
import numpy as np
import pandas as pd
import os
import chardet

def remove_punctuation(input_string):
    """Return `input_string` without ASCII punctuation, flattened to one line.

    Periods are first replaced by spaces, so words separated only by "." stay
    separate. Every remaining character in `string.punctuation` is deleted, and
    line breaks are replaced by single spaces.
    """
    spaced = input_string.replace(".", " ")
    stripped = spaced.translate(str.maketrans("", "", string.punctuation))
    return ' '.join(stripped.splitlines())

# Transcribe every audio file in the test folder and pair it with its reference
# transcript (same file stem, ".txt" extension).
audio_dir = "/kaggle/input/test-dataset/Audio/Audio"
transcript_dir = "/kaggle/input/test-dataset/Transcripts/Transcripts"

test_audio = os.listdir(audio_dir)
test_transcript = os.listdir(transcript_dir)

# Code points that disqualify a pair when found in either string:
# 8230 '…', 65533 U+FFFD replacement char, 8204 zero-width non-joiner,
# 160 no-break space, 9 tab.
rejected_codepoints = {8230, 65533, 8204, 160, 9}


def _has_rejected_char(text):
    """Return True if `text` contains any of the rejected code points."""
    return any(ord(ch) in rejected_codepoints for ch in text)


final_wer = 0
prs = []
trs = []

#os.makedirs("/kaggle/working/predictions")

for count, audio_name in enumerate(test_audio, start=1):
    print(count)

    # splitext copes with extensions of any length (slicing off three
    # characters only worked for ".wav"-style names).
    transcript_name = os.path.splitext(audio_name)[0] + ".txt"

    # NOTE(review): hard-coded exclusion; the reason is not recorded here.
    if transcript_name == "Audio - 48_01.txt":
        continue

    audio_path = os.path.join(audio_dir, audio_name)
    transcript_path = os.path.join(transcript_dir, transcript_name)

    # Read the reference first, so that pairs which end up skipped never reach
    # the expensive ASR call.
    with open(transcript_path, "rb") as transcript_file:
        file_content = transcript_file.read()

    # Only references detected as UTF-8 are evaluated; everything else is
    # skipped. (The old UTF-16 decode sat after the `continue` and never ran.)
    if chardet.detect(file_content)['encoding'] != "utf-8":
        continue

    tr = remove_punctuation(file_content.decode("utf-8")).strip()
    if _has_rejected_char(tr):
        continue

    pr = remove_punctuation(transcribe(audio_path)).strip()
    if _has_rejected_char(pr):
        continue

    prs.append(pr)
    trs.append(tr)

## Create the dataframe and export as excel

In [11]:
# One row per utterance: the model output next to its reference transcript.
paired_rows = list(zip(prs, trs))
test_df = pd.DataFrame(paired_rows, columns=['predictions', 'transcripts'])
test_df.head()

#train_df = pd.DataFrame(list(zip(prs, trs)),columns =['train_predictions', 'train_transcripts'])
#train_df.head()

# Persist the pairs so the metrics section can reload them from disk.
test_df.to_excel("/kaggle/working/test_data_asr_new.xlsx")
#train_df.to_excel("/kaggle/working/train_data_asr.xlsx")

# **Compute Metrics (WER and CER)**

In [19]:
import evaluate
import pandas as pd

metric1 = evaluate.load("wer")
metric2 = evaluate.load("cer")

# Reload the exported sheet, normalise the column names (older exports used a
# "test_" prefix) and drop the index column that Excel round-tripping adds.
test_df = (
    pd.read_excel("/kaggle/working/test_data_asr_new.xlsx")
    .rename(columns={
        "Unnamed: 0": "a",
        "test_predictions": "predictions",
        "test_transcripts": "transcripts",
    })
    .drop(columns=["a"])
)

#print(test_df.columns)

# NOTE(review): row 470 is excluded by position; the reason is not recorded
# here (possibly an empty cell after the Excel round trip) — confirm.
skipped_row = 470

final_wer = 0
final_cer = 0

WER_list = []
CER_list = []

for i in range(test_df.shape[0]):
    if i == skipped_row:
        continue

    # Positional access, so the loop does not depend on the index labels.
    tr = test_df['transcripts'].iloc[i]
    pr = test_df['predictions'].iloc[i]

    wer = metric1.compute(references=[tr], predictions=[pr])
    final_wer += wer
    WER_list.append(wer)

    cer = metric2.compute(references=[tr], predictions=[pr])
    final_cer += cer
    CER_list.append(cer)

    print(f'{i+1} : {100*wer}')
    print(f'{i+1} : {100*cer}')
    print("")

# Average over the rows that were actually scored. Dividing by
# test_df.shape[0] counted the skipped row too and understated both means.
# Note these are means of per-utterance scores, not corpus-level WER/CER.
scored_rows = len(WER_list)
if scored_rows:
    print("WER : ",100*(final_wer/scored_rows))
    print("CER : ",100*(final_cer/scored_rows))
else:
    print("No rows were scored.")

1 : 33.33333333333333
1 : 4.081632653061225

2 : 25.0
2 : 4.2105263157894735

3 : 66.66666666666666
3 : 22.727272727272727

4 : 33.33333333333333
4 : 5.88235294117647

5 : 20.0
5 : 4.166666666666666

6 : 57.14285714285714
6 : 15.254237288135593

7 : 60.0
7 : 19.230769230769234

8 : 20.0
8 : 2.1739130434782608

9 : 0.0
9 : 0.0

10 : 20.0
10 : 7.6923076923076925

11 : 85.71428571428571
11 : 11.688311688311687

12 : 50.0
12 : 18.867924528301888

13 : 33.33333333333333
13 : 6.779661016949152

14 : 16.666666666666664
14 : 2.631578947368421

15 : 16.666666666666664
15 : 8.333333333333332

16 : 14.285714285714285
16 : 3.389830508474576

17 : 33.33333333333333
17 : 4.838709677419355

18 : 77.77777777777779
18 : 16.455696202531644

19 : 0.0
19 : 0.0

20 : 33.33333333333333
20 : 7.4074074074074066

21 : 80.0
21 : 10.416666666666668

22 : 25.0
22 : 5.405405405405405

23 : 12.5
23 : 1.3513513513513513

24 : 25.0
24 : 2.2222222222222223

25 : 42.857142857142854
25 : 11.864406779661017

26 : 41.6666

In [22]:
# Row 470 was not scored, so remove it before attaching the per-row scores;
# the score lists then line up one-to-one with the remaining rows.
# NOTE(review): this overwrites the same file the metrics cell reads, so
# re-running that cell afterwards starts from the already-trimmed sheet.
test_df = test_df.drop(index=470).assign(WER=WER_list, CER=CER_list)

test_df.head()
test_df.to_excel("/kaggle/working/test_data_asr_new.xlsx")

## Few sample Outputs

In [38]:
metric = evaluate.load("wer")
count = 0

# Print every pair whose utterance-level WER is at most 50%.
#
# The column values are iterated directly instead of `test_df[col][i]` for
# i in range(n). That lookup is by index label, and an earlier cell removes
# label 470, so it would raise KeyError at i == 470 and never reach the last row.
for tr, pr in zip(test_df['transcripts'], test_df['predictions']):
    wer = metric.compute(references=[tr], predictions=[pr])

    if (100*wer)<=50:
        print(f'Transcript : {tr}')
        print("")
        print(f'Prediction : {pr}')
        print("-------------------------")
        count+=1

#print(count)

Transcript : எது ஜோடிச்ச கேஸ்ங்கிரிய ஏன் நெலத்த வெல பேசுனிய குடுக்கல

Prediction : எது ஜோடிச்ச கியாச்சுன்ரியை ஏன் நல்லத்த விலை பேசுன்னு குடுக்கல
-------------------------
Transcript : ஏண்டா தனி தனியா போறீங்க வாங்க ஒன்னா போவோம்

Prediction : ஏன்டா தனி தனியா போறீங்க வாங்க ஒன்னா போவும்
-------------------------
Transcript : சித்தப்பா போஸ் குடு இரு சித்தப்பா வரேன்

Prediction : சித்தப்பா போஸ் கொடு இரு சித்தப்பா வரேன்
-------------------------
Transcript : சொல்ல எனக்கு சங்கட்டம்மா இருக்கு

Prediction : சொல்ல எனக்கு சங்க்கட்டமா இருக்கும்
-------------------------
Transcript : டாக்டர் எங்க இருக்காங்க எப்போ வருவாங்க இல்ல நீங்களே ஊசி போட்டு விடுவீங்களா இல்ல மெடிக்கல் ஏதும் பக்கத்துல இருக்கா எனக்கு சுகர் பிரஷர் எல்லாமே டெஸ்ட் பண்ணனும் எனக்கு ஒரு மாதிரியா இருக்கு

Prediction : டாக்டர் எங்க இருக்காங்க எப்ப வருவாங்க இல்ல நீங்களே ஊசி போட்டுருவிங்களா இல்ல மெடிக்கல் எங்க பக்கத்துல இருக்கா இன்னும் சுகர் இப்ப சரியல்  டிரஸ்ட் பண்ணனும் எனக்கு ஒரு மாதிரியா இருக்கு
-------------------------
Transcript : பயல

# **Realtime Testing**

In [None]:
# Simple web UI: record or upload audio, get the transcription back as text.
# The title and description now match the model actually served by
# `transcribe` — a Whisper *medium* checkpoint fine-tuned on Tamil and
# Malayalam (they previously said "small" and Tamil only).
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper Medium Tamil & Malayalam",
    description="Realtime testing of speech recognition in Tamil and Malayalam using a fine-tuned Whisper medium model.",
)

iface.launch()