# Install Libraries

In [1]:
pip install -q gdown datasets transformers accelerate soundfile librosa evaluate jiwer tensorboard gradio chardet

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


# Import Libraries

In [2]:
import os
import numpy as np
import pandas as pd
import chardet
import datasets
import librosa
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor,pipeline
import gradio as gr

### Import processor, tokenizer and feature extractor

In [3]:
# Load the Whisper processor, and separately its tokenizer and feature extractor, from the
# "openai/whisper-medium" checkpoint, passing language="Kannada" and task="transcribe".
# NOTE(review): the model cell further down loads "openai/whisper-large-v2" and sets the decoder
# prompt language to "tamil" — confirm that the checkpoint and language are meant to differ.
processor = WhisperProcessor.from_pretrained("openai/whisper-medium", language="Kannada", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="Kannada", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium", language="Kannada", task="transcribe")

## Prepare the training data

In [7]:
# Build the training set: pair every audio file in `path_1` with its transcript from the TSV
# index ("<utterance id>\t<transcript>") and precompute the Whisper input features and label ids.

# Read the index as raw bytes so the encoding can be checked before decoding.
with open("C:/D/VIT CHN/Internships/Tamil ASR/Data/OpenSLR - Kannada/line_index_male.tsv", 'rb') as file:
    file_content = file.read()

encoding = chardet.detect(file_content)['encoding']

# chardet can report "UTF-8-SIG" or "ascii" for files that decode fine as UTF-8. Comparing against
# the exact string "utf-8" left the transcript empty in those cases and silently produced an empty
# dataset, so accept the compatible encodings and fail loudly on anything else.
if (encoding or "").lower() not in ("utf-8", "utf-8-sig", "ascii"):
    raise ValueError(f"Unexpected encoding for the transcript index: {encoding!r}")

# "utf-8-sig" decodes plain UTF-8 identically and additionally strips a leading BOM if present.
transcript = file_content.decode("utf-8-sig")

z = transcript.split("\n")

print(len(z))

path_1 = "C:/D/VIT CHN/Internships/Tamil ASR/Data/OpenSLR - Kannada/Male"

list1 = sorted(os.listdir(path_1))
# Drop blank lines explicitly instead of assuming exactly one empty string sorts to the front.
z = sorted(line for line in z if line.strip())

print(list1)
print(z)

# Index transcripts by utterance id. Walking the sorted files and sorted lines in lock-step (one
# shared counter) loses alignment as soon as a file or a line has no partner, and can run past
# the end of the transcript list; a lookup by id has neither problem.
transcript_by_id = {}
for line in z:
  fields = line.rstrip("\r").split("\t")
  if len(fields) >= 2:
    transcript_by_id[fields[0]] = fields[1]

audio_dataset = []

for i, file1 in enumerate(list1):
  utterance_id = os.path.splitext(file1)[0]

  # Skip files that have no transcript line (this also skips stray non-audio files).
  if utterance_id not in transcript_by_id:
    continue

  print(i)
  transcript = transcript_by_id[utterance_id]
  audio_path = os.path.join(path_1, file1)

  # Load directly at 16 kHz. librosa.load defaults to sr=22050, so the previous code resampled
  # every clip twice (native -> 22.05 kHz -> 16 kHz).
  audio_data, sampling_rate = librosa.load(audio_path, sr=16000)

  input_features = feature_extractor(audio_data, sampling_rate=16000).input_features[0]

  audio_dataset.append({
    "transcript": transcript,
    "audio_data": audio_data,
    "input_features": input_features,
    "labels": tokenizer(transcript).input_ids,
  })

2215
['knm_00180_00011382081.wav', 'knm_00180_00012816557.wav', 'knm_00180_00028931935.wav', 'knm_00180_00031068402.wav', 'knm_00180_00042410675.wav', 'knm_00180_00079643205.wav', 'knm_00180_00130644026.wav', 'knm_00180_00148823892.wav', 'knm_00180_00220532003.wav', 'knm_00180_00221372875.wav', 'knm_00180_00222916560.wav', 'knm_00180_00250075112.wav', 'knm_00180_00266045925.wav', 'knm_00180_00291521475.wav', 'knm_00180_00297370869.wav', 'knm_00180_00304524174.wav', 'knm_00180_00315697229.wav', 'knm_00180_00378960913.wav', 'knm_00180_00393491840.wav', 'knm_00180_00398565905.wav', 'knm_00180_00458223984.wav', 'knm_00180_00461290774.wav', 'knm_00180_00606527548.wav', 'knm_00180_00606909868.wav', 'knm_00180_00636639047.wav', 'knm_00180_00638194660.wav', 'knm_00180_00709659375.wav', 'knm_00180_00726312040.wav', 'knm_00180_00742976223.wav', 'knm_00180_00781063792.wav', 'knm_00180_00783813999.wav', 'knm_00180_00800270463.wav', 'knm_00180_00811702365.wav', 'knm_00180_00820221766.wav', 'knm_001

In [8]:
# Inspect the first prepared example to sanity-check its contents.
print(audio_dataset[0])

{'transcript': 'ಅವನು ಬರೆದಿರುವುದು ಈಗಿನ ತಿಳಿವಿಗೆ ಹೆಚ್ಚು ಕಡಿಮೆ ಸರಿಹೋಗುತ್ತದೆ.', 'audio_data': array([-9.8019635e-05, -1.6307735e-04, -1.4788107e-04, ...,
        1.7609140e-05,  1.1843144e-05,  0.0000000e+00], dtype=float32), 'input_features': array([[-0.25118935, -0.7671138 , -0.5223918 , ..., -0.95388377,
        -0.95388377, -0.95388377],
       [-0.40205812, -0.40219152, -0.35259092, ..., -0.95388377,
        -0.95388377, -0.95388377],
       [-0.6339196 , -0.4353714 , -0.3769648 , ..., -0.95388377,
        -0.95388377, -0.95388377],
       ...,
       [-0.95388377, -0.95388377, -0.95388377, ..., -0.95388377,
        -0.95388377, -0.95388377],
       [-0.95388377, -0.95388377, -0.95388377, ..., -0.95388377,
        -0.95388377, -0.95388377],
       [-0.95388377, -0.95388377, -0.95388377, ..., -0.95388377,
        -0.95388377, -0.95388377]], dtype=float32), 'labels': [50258, 50306, 50359, 50363, 11891, 227, 11891, 113, 11891, 101, 24998, 223, 34725, 105, 11891, 108, 24998, 228, 11891, 9

### Save the training data

In [9]:
# Persist the prepared examples so the expensive preprocessing does not have to be repeated.
import pickle
file_path = "C:/D/VIT CHN/Internships/Tamil ASR/Data/OpenSLR - Kannada/male_data.pkl"

# Serialize the whole audio_dataset list into the file with pickle
with open(file_path, 'wb') as file:
    pickle.dump(audio_dataset, file)

## Load the training data

In [9]:
# Load previously pickled training data; this replaces any audio_dataset built earlier in the kernel.
# NOTE(review): this path differs from the one the save cell writes to
# (".../OpenSLR - Kannada/male_data.pkl") — confirm which dataset is meant to be trained on.
import pickle
file_path = "D:/SUJITH/data2.pkl"

# Read the list back from the pickle file.
# pickle.load can execute arbitrary code, so only load files from a trusted source.
with open(file_path, 'rb') as file:
    audio_dataset = pickle.load(file)

## Data collator

In [10]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech-to-text examples into one padded batch.

    Each example is a mapping with an "input_features" entry and a "labels" entry. The two are
    padded separately, through ``processor.feature_extractor.pad`` and ``processor.tokenizer.pad``,
    because they have different lengths and need different padding.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`; both must provide `pad(...)`, and the
    # tokenizer must also provide `bos_token_id`.
    processor: Any

    def __call__(self, features) -> Dict[str, torch.Tensor]:
        """Return the padded batch for `features`, with the label ids stored under "labels".

        Padded label positions are set to -100. If every label row starts with the tokenizer's
        BOS id, that first column is removed.
        """
        # Audio side: hand the features to the feature extractor and get torch tensors back.
        audio_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Text side: pad the tokenized label sequences to a common length.
        token_items = []
        for example in features:
            token_items.append({"input_ids": example["labels"]})
        padded_labels = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Positions the attention mask marks as padding get -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop a leading BOS column when every row has one (the original note says it is
        # appended again later).
        bos_id = self.processor.tokenizer.bos_token_id
        every_row_starts_with_bos = (labels[:, 0] == bos_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [11]:
# Collator instance built around the processor loaded earlier; it is passed to the trainer below.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## Compute Metrics (WER)

In [12]:
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate, in percent, for one evaluation pass.

    Args:
        pred: object with `predictions` (generated token ids) and `label_ids` (reference token
            ids in which ignored positions are marked with -100).

    Returns:
        dict with a single key "wer" holding 100 * the value returned by the module-level `metric`.
    """
    pred_ids = pred.predictions

    # Replace -100 with the pad_token_id so the labels can be decoded. Build a new array instead
    # of assigning into pred.label_ids: the in-place write altered the caller's object.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

Downloading builder script: 100%|██████████| 4.49k/4.49k [00:00<?, ?B/s]


# Model

In [13]:
# Load the pretrained Whisper checkpoint that will be fine-tuned.
# NOTE(review): this is "openai/whisper-large-v2", while the processor/tokenizer/feature extractor
# above come from "openai/whisper-medium" — confirm the checkpoints are meant to differ.
from transformers import WhisperForConditionalGeneration
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

config.json: 100%|██████████| 1.99k/1.99k [00:00<00:00, 14.5MB/s]
model.safetensors: 100%|██████████| 6.17G/6.17G [01:48<00:00, 56.6MB/s]
generation_config.json: 100%|██████████| 4.29k/4.29k [00:00<00:00, 4.11MB/s]


In [14]:
# Store the decoder prompt ids for language="tamil" / task="transcribe" on the model config and
# clear the list of suppressed tokens.
# NOTE(review): the processor and tokenizer above were created with language="Kannada" and the
# data-preparation cell reads the OpenSLR Kannada set — confirm that "tamil" is intended here.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="tamil", task="transcribe")
model.config.suppress_tokens = []

## Training Setup

In [17]:
# Training configuration and trainer construction.
from transformers import Seq2SeqTrainingArguments

# NOTE(review): max_steps is 100 here, but the recorded training output further down reports
# global_step=1000 — the saved output appears to come from a different configuration.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper_large_vulnerable_data",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=10,
    max_steps=100,
    gradient_checkpointing=True,
    fp16=False,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=50,
    logging_steps=50,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # a lower "wer" counts as better when picking the best checkpoint
)

from transformers import Seq2SeqTrainer

# The split is purely positional: the first 700 examples train, everything after them evaluates.
# NOTE(review): the feature extractor, not the tokenizer, is passed as `tokenizer=` — confirm
# this is intentional.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=audio_dataset[0:700],
    eval_dataset=audio_dataset[700:],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [18]:
# Start fine-tuning.
# NOTE(review): the "Training" cell right after this one calls trainer.train() again — on a
# top-to-bottom run the model would be trained twice; keep only one of the two calls.
trainer.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


## Training

In [15]:
# Fine-tune, then save the processor and upload the result to the Hugging Face Hub.
# NOTE(review): trainer.train() is also called in the preceding cell — confirm only one should run.
trainer.train()

# Write the processor files into the training output directory.
processor.save_pretrained(training_args.output_dir)

# Metadata forwarded to push_to_hub.
# NOTE(review): these values name Tamil and "openai/whisper-small", whereas the model cell loads
# "openai/whisper-large-v2" and the data-preparation cell reads Kannada data — confirm the values.
kwargs = {
    "language": "ta",
    "model_name": "Whisper-Small-Tamil1",
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
    "tags": "Tamil-ASR",
}

trainer.push_to_hub(**kwargs)

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
1000,0.0078,0.000712,8.925991


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=1000, training_loss=0.00776361083984375, metrics={'train_runtime': 7127.7448, 'train_samples_per_second': 2.245, 'train_steps_per_second': 0.14, 'total_flos': 4.59774259789824e+18, 'train_loss': 0.00776361083984375, 'epoch': 17.54})

# **Testing**

In [3]:
# Build an inference pipeline from the published "sujith013/whisper-medium-indic" checkpoint;
# transcribe() below reads this module-level object.
pipe = pipeline(model="sujith013/whisper-medium-indic")

def transcribe(audio):
    """Run the module-level pipeline `pipe` on `audio` and return the "text" entry of its result."""
    result = pipe(audio)
    return result["text"]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

In [None]:
import string
import numpy as np
import pandas as pd
import os
import chardet

def remove_punctuation(input_string):
    """Return `input_string` with ASCII punctuation removed and line breaks flattened.

    Every "." becomes a space, every other character in ``string.punctuation`` is deleted, and
    the remaining lines are joined with single spaces.
    """
    # One translation table covers both steps: "." maps to a space, the rest map to nothing.
    table = {ord(symbol): None for symbol in string.punctuation}
    table[ord(".")] = " "
    cleaned = str.translate(input_string, table)
    return " ".join(cleaned.splitlines())

# Transcribe the audio files from position 1800 onwards and collect (prediction, transcript)
# pairs with punctuation removed, ready for scoring.

# Read the index ("<utterance id>\t<transcript>") as raw bytes so the encoding can be checked.
with open("C:/D/VIT CHN/Internships/Tamil ASR/Data/OpenSLR - Kannada/line_index_male.tsv", 'rb') as file:
    file_content = file.read()

encoding = chardet.detect(file_content)['encoding']

# chardet can report "UTF-8-SIG" or "ascii" for files that decode fine as UTF-8. Comparing against
# the exact string "utf-8" left the transcript empty in those cases, so accept the compatible
# encodings and fail loudly on anything else.
if (encoding or "").lower() not in ("utf-8", "utf-8-sig", "ascii"):
    raise ValueError(f"Unexpected encoding for the transcript index: {encoding!r}")

# "utf-8-sig" decodes plain UTF-8 identically and additionally strips a leading BOM if present.
transcript = file_content.decode("utf-8-sig")

z = transcript.split("\n")

path_1 = "C:/D/VIT CHN/Internships/Tamil ASR/Data/OpenSLR - Kannada/Male"
list1 = sorted(os.listdir(path_1))
# Drop blank lines explicitly instead of assuming exactly one empty string sorts to the front.
z = sorted(line for line in z if line.strip())

prs = []
trs = []

print(list1)
print(z)

# Index transcripts by utterance id. The previous loop incremented its counter before reading
# z[i], so every file was compared with the *next* file's transcript line: when files and lines
# were aligned, every file was skipped, and the last iteration could index past the end of z.
transcript_by_id = {}
for line in z:
  fields = line.rstrip("\r").split("\t")
  if len(fields) >= 2:
    transcript_by_id[fields[0]] = fields[1]

# First file position used for testing.
# NOTE(review): presumably the files before it were used for training — confirm.
test_start = 1800

for i, file1 in enumerate(list1[test_start:], start=test_start):
  print(i)

  utterance_id = os.path.splitext(file1)[0]

  # Skip files that have no transcript line.
  if utterance_id not in transcript_by_id:
    continue

  transcript = transcript_by_id[utterance_id]
  audio_path = os.path.join(path_1, file1)

  # Normalise both sides the same way so punctuation differences do not count as errors.
  pr = transcribe(audio_path)
  pr = remove_punctuation(pr).strip()
  tr = remove_punctuation(transcript).strip()

  prs.append(pr)
  trs.append(tr)

## Create the dataframe and export as excel

In [9]:
# Pair each prediction with its reference transcript, one row per utterance.
test_df = pd.DataFrame(list(zip(prs, trs)),columns =['predictions', 'transcripts'])
# NOTE(review): this head() is not the last expression of the cell, so nothing is displayed.
test_df.head()

# Write the table to an Excel file for later scoring.
test_df.to_excel("C:/D/VIT CHN/Internships/Tamil ASR/Data/test_data_kannada_male.xlsx")

# **Compute Metrics (CER)**

In [24]:
# Score the exported predictions with the character error rate (CER).
# NOTE: the variables below are named `wer` / `final_wer`, but the loaded metric is "cer".
import evaluate
import pandas as pd
metric = evaluate.load("cer")

# Load the prediction/transcript table and normalise its column names; the "Unnamed: 0" column
# is renamed to "a" and then dropped.
test_df = pd.read_excel("/kaggle/input/test-dataset/test_data_asr_new.xlsx")
test_df.rename({"Unnamed: 0":"a"}, axis="columns", inplace=True)
test_df.rename({"test_predictions":"predictions"}, axis="columns", inplace=True)
test_df.rename({"test_transcripts":"transcripts"}, axis="columns", inplace=True)
test_df.drop(["a"], axis=1, inplace=True)

print(test_df.columns)

# Running sum of the per-utterance scores.
final_wer = 0

for i in range(test_df.shape[0]):
    tr = test_df['transcripts'][i]
    pr = test_df['predictions'][i]

    # Score one utterance at a time.
    wer = metric.compute(references=[tr], predictions=[pr])
    final_wer += wer
    
    print(f'{i+1} : {100*wer}')

# The reported figure is the unweighted mean of the per-utterance scores, in percent.
print("CER : ",100*(final_wer/test_df.shape[0]))

Index(['predictions', 'transcripts'], dtype='object')
1 : 58.139534883720934
2 : 40.0
3 : 29.09090909090909
4 : 64.04494382022472
5 : 65.74074074074075
6 : 16.0
7 : 4.761904761904762
8 : 122.58064516129032
9 : 44.44444444444444
10 : 58.515283842794766
11 : 91.54929577464789
12 : 46.54545454545455
13 : 2.564102564102564
14 : 30.32258064516129
15 : 80.97345132743364
16 : 86.1878453038674
17 : 20.245398773006134
18 : 37.5
19 : 18.75
20 : 9.375
21 : 53.813559322033896
22 : 25.71428571428571
23 : 38.46153846153847
24 : 16.666666666666664
25 : 29.585798816568047
26 : 16.216216216216218
27 : 54.285714285714285
28 : 72.53521126760563
29 : 26.937269372693727
30 : 54.400000000000006
31 : 41.29032258064516
32 : 38.81856540084388
33 : 35.714285714285715
34 : 15.254237288135593
35 : 21.385542168674696
36 : 15.476190476190476
37 : 30.76923076923077
38 : 33.33333333333333
39 : 55.64516129032258
40 : 27.86885245901639
41 : 59.8705501618123
42 : 45.608108108108105
43 : 28.187919463087248
44 : 69.548872

## Few sample Outputs

In [38]:
# Print the transcript/prediction pairs whose per-utterance word error rate is at most 50%.
# This rebinds `metric` from the "cer" metric loaded in the previous cell to "wer".
metric = evaluate.load("wer")
# Number of pairs printed.
count = 0

for i in range(test_df.shape[0]):
    tr = test_df['transcripts'][i]
    pr = test_df['predictions'][i]

    wer = metric.compute(references=[tr], predictions=[pr])
    
    # Show only the reasonably accurate predictions.
    if (100*wer)<=50:
        print(f'Transcript : {tr}')
        print("")
        print(f'Prediction : {pr}')
        print("-------------------------")
        count+=1

# `count` is tallied above but never displayed.

Transcript : எது ஜோடிச்ச கேஸ்ங்கிரிய ஏன் நெலத்த வெல பேசுனிய குடுக்கல

Prediction : எது ஜோடிச்ச கியாச்சுன்ரியை ஏன் நல்லத்த விலை பேசுன்னு குடுக்கல
-------------------------
Transcript : ஏண்டா தனி தனியா போறீங்க வாங்க ஒன்னா போவோம்

Prediction : ஏன்டா தனி தனியா போறீங்க வாங்க ஒன்னா போவும்
-------------------------
Transcript : சித்தப்பா போஸ் குடு இரு சித்தப்பா வரேன்

Prediction : சித்தப்பா போஸ் கொடு இரு சித்தப்பா வரேன்
-------------------------
Transcript : சொல்ல எனக்கு சங்கட்டம்மா இருக்கு

Prediction : சொல்ல எனக்கு சங்க்கட்டமா இருக்கும்
-------------------------
Transcript : டாக்டர் எங்க இருக்காங்க எப்போ வருவாங்க இல்ல நீங்களே ஊசி போட்டு விடுவீங்களா இல்ல மெடிக்கல் ஏதும் பக்கத்துல இருக்கா எனக்கு சுகர் பிரஷர் எல்லாமே டெஸ்ட் பண்ணனும் எனக்கு ஒரு மாதிரியா இருக்கு

Prediction : டாக்டர் எங்க இருக்காங்க எப்ப வருவாங்க இல்ல நீங்களே ஊசி போட்டுருவிங்களா இல்ல மெடிக்கல் எங்க பக்கத்துல இருக்கா இன்னும் சுகர் இப்ப சரியல்  டிரஸ்ட் பண்ணனும் எனக்கு ஒரு மாதிரியா இருக்கு
-------------------------
Transcript : பயல

# **Realtime Testing**

In [2]:
# Wrap transcribe() in a Gradio demo: the audio input is handed over as a file path and the
# result is shown as text.
# NOTE(review): the recorded run of this cell failed with "NameError: name 'gr' is not defined";
# the cell was executed before the imports cell (`import gradio as gr`). Run the imports first.
# NOTE(review): the title and description say "Whisper small", while the pipeline cell loads
# "sujith013/whisper-medium-indic" — confirm the wording.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper Small Tamil",
    description="Realtime testing of speech recognition in Tamil using a fine-tuned Whisper small model.",
)

iface.launch()

NameError: name 'gr' is not defined