# Install Libraries

In [1]:
pip install -q gdown datasets transformers accelerate soundfile librosa evaluate jiwer tensorboard gradio chardet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 11.0.0 which is incompatible.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.0.3 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is incompatible.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2023.11.0 which is incompatible.
cuml 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.11.0 which is incompatible.
dask-cudf 23.8.0 requires dask==2023.7

## Download files from Gdrive and Unzip

In [None]:
!gdown --id 1FxcRZIYCYIQUrfZbTYBo3y0-V3up7_z0
!gdown --id 1PGplQ-PZ0C-J3B8sZI41jN2Lfewqt2Xt

In [None]:
!unzip /kaggle/working/Training-20231114T063007Z-001.zip
!unzip Transcripts-20231215T113721Z-001.zip

### Check for GPU availability

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# Import Libraries

In [2]:
import os
import numpy as np
import pandas as pd
import chardet
import datasets
import librosa
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor,pipeline
import gradio as gr



## Hugging_face Login

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Import processor, tokenizer and feature extractor

In [4]:
processor = WhisperProcessor.from_pretrained("sujith013/whisper-small-tamil", language="Tamil", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained("sujith013/whisper-small-tamil", language="Tamil", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("sujith013/whisper-small-tamil", language="Tamil", task="transcribe")

Downloading (…)rocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Testing the tokenizer

In [5]:
input_str = "வயது 60 சக்கரைக்கு செக்கப் பண்ண வந்திருக்கேன்பா டாக்டர் எப்பவருவாரு அது என்னா செய்யனும் முதல்ல டெஸ்ட் எடுக்கனுமா? எடுத்துச்சாட்டும் எவ்வளவு நேரம் கழிச்சு வரனும்"

labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

# Dataset preparation

## Prepare the training data

In [None]:
path_1 = "/content/gdrive/My Drive/VIT/Tamil ASR/Train/Audio"
path_2 = "/content/gdrive/My Drive/VIT/Tamil ASR/Train/Transcripts"

list1 = sorted(os.listdir(path_1))
list2 = sorted(os.listdir(path_2))

audio_dataset = []

for file1,file2 in zip(list1,list2):
  audio_dict = {}

  audio_path = os.path.join(path_1,file1)
  transcript_path = os.path.join(path_2,file2)

  with open(transcript_path, 'rb') as file:
    file_content = file.read()
    encoding = chardet.detect(file_content)['encoding']

    if encoding == "utf-8":
      transcript = file_content.decode("utf-8")
    else:
      transcript = file_content.decode("UTF-16")

  audio_data, sampling_rate = librosa.load(audio_path)

  # Resample the audio data to 16kHz
  if sampling_rate != 16000:
    audio_data = librosa.resample(audio_data, orig_sr = sampling_rate, target_sr = 16000)

  input_features = feature_extractor(audio_data, sampling_rate=16000).input_features[0]

  audio_dict["transcript"] = transcript
  audio_dict["audio_data"] = audio_data
  audio_dict["input_features"] = input_features
  audio_dict["labels"] = tokenizer(transcript).input_ids

  audio_dataset.append(audio_dict)

In [None]:
print(audio_dataset[0])

### Save the training data

In [None]:
#Save the data as a pickle file
import pickle
file_path = "/content/gdrive/My Drive/VIT/Tamil ASR/Train/data2.pkl"

# Save the list to a file using pickle
with open(file_path, 'wb') as file:
    pickle.dump(audio_dataset, file)

## Load the training data

In [7]:
#Load the file for training data
import pickle
file_path = "/kaggle/working/data2.pkl"

# Save the list to a file using pickle
with open(file_path, 'rb') as file:
    audio_dataset = pickle.load(file)

## Data collator

In [8]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors

        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [9]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## Computer Metrics (WER)

In [10]:
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

# Model

In [11]:
from transformers import WhisperForConditionalGeneration
model = WhisperForConditionalGeneration.from_pretrained("sujith013/whisper-small-tamil")

Downloading config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [12]:
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="tamil", task="transcribe")
model.config.suppress_tokens = []

## Training Setup

In [13]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-tamil1",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=10,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=1000,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=audio_dataset,
    eval_dataset=audio_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

## Training

In [15]:
trainer.train()

processor.save_pretrained(training_args.output_dir)

kwargs = {
    "language": "ta",
    "model_name": "Whisper-Small-Tamil1",
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
    "tags": "Tamil-ASR",
}

trainer.push_to_hub(**kwargs)

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
1000,0.0078,0.000712,8.925991


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=1000, training_loss=0.00776361083984375, metrics={'train_runtime': 7127.7448, 'train_samples_per_second': 2.245, 'train_steps_per_second': 0.14, 'total_flos': 4.59774259789824e+18, 'train_loss': 0.00776361083984375, 'epoch': 17.54})

# **Testing**

In [4]:
pipe = pipeline(model="sujith013/whisper-small-tamil1")

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

Downloading config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)rocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

In [None]:
import string
import numpy as np
import pandas as pd
import os
import chardet

def remove_punctuation(input_string):
    translator = str.maketrans("", "", string.punctuation)

    input_string = str.replace(input_string,"."," ")
    result = input_string.translate(translator)
    result = ' '.join(result.splitlines())

    return result 

test_audio = os.listdir("/kaggle/input/test-dataset/Audio/Audio")
test_transcript = os.listdir("/kaggle/input/test-dataset/Transcripts/Transcripts")

count=0
final_wer = 0
prs = []
trs = []

#os.makedirs("/kaggle/working/predictions")
count=1

for x in test_audio:
    print(count)
    count+=1
    y = x[0:-3] + "txt"
    
    if y=="Audio - 48_01.txt":
        continue
    
    path1 = os.path.join("/kaggle/input/test-dataset/Audio/Audio",x)
    path2 = os.path.join("/kaggle/input/test-dataset/Transcripts/Transcripts",y)
    path3 = os.path.join("/kaggle/working/predictions/",y)
    
    pr=""
    tr=""
    
    pr = remove_punctuation(transcribe(path1)).strip()

    with open(path2,"rb") as file2:
        file_content = file2.read()
        encoding = chardet.detect(file_content)['encoding']

        if encoding == "utf-8":
            transcript = file_content.decode("utf-8")
        else:
            continue
            transcript = file_content.decode("UTF-16")
        
        tr = remove_punctuation(transcript).strip()

    flag=0
    
    for x in pr:
        if ord(x)==8230 or ord(x)==65533 or ord(x)==8204 or ord(x)==160 or ord(x)==9:
            flag=1
    
    for x in tr:
        if ord(x)==8230 or ord(x)==65533 or ord(x)==8204 or ord(x)==160 or ord(x)==9:
            flag=1
            
    if flag==1:
        continue
    
    prs.append(pr)
    trs.append(tr)
    
    prs = list(tuple(prs))
    trs = list(tuple(trs))

## Create the dataframe and export as excel

In [9]:
test_df = pd.DataFrame(list(zip(prs, trs)),columns =['predictions', 'transcripts'])
test_df.head()

#train_df = pd.DataFrame(list(zip(prs, trs)),columns =['train_predictions', 'train_transcripts'])
#train_df.head()

test_df.to_excel("/kaggle/working/test_data_asr_new.xlsx")
#train_df.to_excel("/kaggle/working/train_data_asr.xlsx")

# **Compute Metrics WER**

In [24]:
import evaluate
import pandas as pd
metric = evaluate.load("cer")

test_df = pd.read_excel("/kaggle/input/test-dataset/test_data_asr_new.xlsx")
test_df.rename({"Unnamed: 0":"a"}, axis="columns", inplace=True)
test_df.rename({"test_predictions":"predictions"}, axis="columns", inplace=True)
test_df.rename({"test_transcripts":"transcripts"}, axis="columns", inplace=True)
test_df.drop(["a"], axis=1, inplace=True)

print(test_df.columns)

final_wer = 0

for i in range(test_df.shape[0]):
    tr = test_df['transcripts'][i]
    pr = test_df['predictions'][i]

    wer = metric.compute(references=[tr], predictions=[pr])
    final_wer += wer
    
    print(f'{i+1} : {100*wer}')

print("CER : ",100*(final_wer/test_df.shape[0]))

Index(['predictions', 'transcripts'], dtype='object')
1 : 58.139534883720934
2 : 40.0
3 : 29.09090909090909
4 : 64.04494382022472
5 : 65.74074074074075
6 : 16.0
7 : 4.761904761904762
8 : 122.58064516129032
9 : 44.44444444444444
10 : 58.515283842794766
11 : 91.54929577464789
12 : 46.54545454545455
13 : 2.564102564102564
14 : 30.32258064516129
15 : 80.97345132743364
16 : 86.1878453038674
17 : 20.245398773006134
18 : 37.5
19 : 18.75
20 : 9.375
21 : 53.813559322033896
22 : 25.71428571428571
23 : 38.46153846153847
24 : 16.666666666666664
25 : 29.585798816568047
26 : 16.216216216216218
27 : 54.285714285714285
28 : 72.53521126760563
29 : 26.937269372693727
30 : 54.400000000000006
31 : 41.29032258064516
32 : 38.81856540084388
33 : 35.714285714285715
34 : 15.254237288135593
35 : 21.385542168674696
36 : 15.476190476190476
37 : 30.76923076923077
38 : 33.33333333333333
39 : 55.64516129032258
40 : 27.86885245901639
41 : 59.8705501618123
42 : 45.608108108108105
43 : 28.187919463087248
44 : 69.548872

## Few sample Outputs

In [38]:
metric = evaluate.load("wer")
count = 0

for i in range(test_df.shape[0]):
    tr = test_df['transcripts'][i]
    pr = test_df['predictions'][i]

    wer = metric.compute(references=[tr], predictions=[pr])
    
    if (100*wer)<=50:
        print(f'Transcript : {tr}')
        print("")
        print(f'Prediction : {pr}')
        print("-------------------------")
        count+=1

#print(count)

Transcript : எது ஜோடிச்ச கேஸ்ங்கிரிய ஏன் நெலத்த வெல பேசுனிய குடுக்கல

Prediction : எது ஜோடிச்ச கியாச்சுன்ரியை ஏன் நல்லத்த விலை பேசுன்னு குடுக்கல
-------------------------
Transcript : ஏண்டா தனி தனியா போறீங்க வாங்க ஒன்னா போவோம்

Prediction : ஏன்டா தனி தனியா போறீங்க வாங்க ஒன்னா போவும்
-------------------------
Transcript : சித்தப்பா போஸ் குடு இரு சித்தப்பா வரேன்

Prediction : சித்தப்பா போஸ் கொடு இரு சித்தப்பா வரேன்
-------------------------
Transcript : சொல்ல எனக்கு சங்கட்டம்மா இருக்கு

Prediction : சொல்ல எனக்கு சங்க்கட்டமா இருக்கும்
-------------------------
Transcript : டாக்டர் எங்க இருக்காங்க எப்போ வருவாங்க இல்ல நீங்களே ஊசி போட்டு விடுவீங்களா இல்ல மெடிக்கல் ஏதும் பக்கத்துல இருக்கா எனக்கு சுகர் பிரஷர் எல்லாமே டெஸ்ட் பண்ணனும் எனக்கு ஒரு மாதிரியா இருக்கு

Prediction : டாக்டர் எங்க இருக்காங்க எப்ப வருவாங்க இல்ல நீங்களே ஊசி போட்டுருவிங்களா இல்ல மெடிக்கல் எங்க பக்கத்துல இருக்கா இன்னும் சுகர் இப்ப சரியல்  டிரஸ்ட் பண்ணனும் எனக்கு ஒரு மாதிரியா இருக்கு
-------------------------
Transcript : பயல

# **Realtime Testing**

In [2]:
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper Small Tamil",
    description="Realtime testing of speech recognition in Tamil using a fine-tuned Whisper small model.",
)

iface.launch()

NameError: name 'gr' is not defined