In [1]:
%%capture
!pip install datasets==1.18.3
!pip install transformers==4.11.3
!pip install torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip install jiwer

In [2]:
%%capture
!apt install git-lfs

In [15]:
!pip install --upgrade transformers
# Import all modules
import os
import numpy as np
import pandas as pd
from IPython.display import display, HTML
import json
import torchaudio
from datasets import Dataset, Audio
from datasets import ClassLabel
import random
import torch
import re


# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


from datasets import Dataset, load_metric, ClassLabel
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import Trainer, TrainingArguments

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from IPython.display import display, HTML

# Path to your dataset folder
dataset_path = "/content/drive/My Drive/karya_braj_data/translation/281474976710694/"


def create_dataset_from_folder(folder_path):
    data = {"audio": [], "transcription": []}
    for filename in os.listdir(folder_path):
        if filename.endswith(".wav"):
            audio_path = os.path.join(folder_path, filename)
            json_filename = filename.replace(".wav", ".json")
            json_path = os.path.join(folder_path, json_filename)

            # Check if the JSON file exists before attempting to open
            if os.path.exists(json_path):
                with open(json_path, 'r') as f:
                    transcription_data = json.load(f)
                # Extract transcription from the correct key
                transcription = transcription_data.get('data', '')
                print(f"Loaded transcription for {filename}: {transcription}")
                data["audio"].append(audio_path)
                data["transcription"].append(transcription)
            else:
                print(f"Warning: JSON file not found for {filename}. Skipping this audio file.")

    return Dataset.from_dict(data)

dataset = create_dataset_from_folder(dataset_path)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))


# Split the dataset into  test and test sets
train_test_split=dataset.train_test_split(test_size=0.2)
train_data=train_test_split["train"]
test_data=train_test_split["test"]

print(" train dataset contents:")
print( train_data)

print("Test dataset contents:")
print(test_data)

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = random.sample(range(len(dataset)), num_examples)
    selected_data = {"transcription": []}
    for pick in picks:
        selected_data["transcription"].append(dataset[pick]["transcription"])
    df = pd.DataFrame(selected_data)
    display(HTML(df.to_html()))

# Example usage
show_random_elements( test_data, num_examples=10)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded transcription for 281474976717938.wav: वह रोज बाजार जाती है
Loaded transcription for 281474976717942.wav: जि बांस और रस्सी एै ।
Loaded transcription for 281474976717950.wav: सब लोग जोरि नाइ ।
Loaded transcription for 281474976717958.wav: तुम कल्लि बएै किताब दै दैंगे ।
Loaded transcription for 281474976717951.wav: राजीब टोटो सबन्ति लम्बौ एै ।
Loaded transcription for 281474976717947.wav: सुर्य पूरब मे उगता है।
Loaded transcription for 281474976717944.wav: बु उछलि -उछलि कि अपए घर पौंचैगौ ।
Loaded transcription for 281474976717934.wav: बे सोइ रए ।
Loaded transcription for 281474976717948.wav: रूबी सबन्ति तेज धाबक एै ।
Loaded transcription for 281474976717940.wav: बच्चा स्कूल जाइ रएैं ।
Loaded transcription for 281474976717941.wav: मजदूर नि एै घरु बनाइबे काजि 50 पत्तथरन की जरूरति एै ।
Loaded transcription for 281474976717924.wav: राधा की बजै ति बु मरि गऔ ।

Unnamed: 0,transcription
0,मुझे आम अच्छा लगता है।
1,हम रोज स्कूल जाते हैं |
2,कुतन्नै बच्चएै रुबाइ दऔ ।
3,चलौ जा ।
4,का तुम अबैई आइ सकत औ ।
5,राधा नि अपन्ने के लएैं एक साड़ी खरीदी ।
6,पेड़ पर बंदर है।
7,तुम कल्लि बएै किताब दै दैंगे ।
8,का तुम कल्लि जां आइ सकातौ ।
9,तुम अबई चामलन एै खाइ रए औ ।


In [None]:
pd.__version__

'2.0.3'

In [16]:
# Convert to a Pandas DataFrame
train_df = pd.DataFrame(train_data)

# Filter out rows where ' transcription' contains any English alphabet characters
train_df =  train_df[ train_df['transcription'].str.contains("[A-Za-z]", regex=True) == False]

# Display the filtered DataFrame
print( train_df)


                                                 audio  \
0    {'path': '/content/drive/My Drive/karya_braj_d...   
1    {'path': '/content/drive/My Drive/karya_braj_d...   
2    {'path': '/content/drive/My Drive/karya_braj_d...   
3    {'path': '/content/drive/My Drive/karya_braj_d...   
4    {'path': '/content/drive/My Drive/karya_braj_d...   
..                                                 ...   
592  {'path': '/content/drive/My Drive/karya_braj_d...   
593  {'path': '/content/drive/My Drive/karya_braj_d...   
594  {'path': '/content/drive/My Drive/karya_braj_d...   
595  {'path': '/content/drive/My Drive/karya_braj_d...   
596  {'path': '/content/drive/My Drive/karya_braj_d...   

                                   transcription  
0                              बु कल्ल नगर गयो ।  
1                          बे स्कूल जाइ रए एैं ।  
2                                         जाइये!  
3                                  बु घर मेे औ ।  
4                  मैं जां तुमैं देखिबे आयौ ऊं ।

In [17]:
# Convert the Hugging Face Dataset to a Pandas DataFrame
test_df = pd.DataFrame( test_data)

# Filter out rows where ' transcription' contains any English alphabet characters
test_df =  test_df[ test_df['transcription'].str.contains("[A-Za-z]", regex=True) == False]

# Display the filtered DataFrame
print( test_df)

                                                 audio  \
0    {'path': '/content/drive/My Drive/karya_braj_d...   
1    {'path': '/content/drive/My Drive/karya_braj_d...   
2    {'path': '/content/drive/My Drive/karya_braj_d...   
3    {'path': '/content/drive/My Drive/karya_braj_d...   
4    {'path': '/content/drive/My Drive/karya_braj_d...   
..                                                 ...   
145  {'path': '/content/drive/My Drive/karya_braj_d...   
146  {'path': '/content/drive/My Drive/karya_braj_d...   
147  {'path': '/content/drive/My Drive/karya_braj_d...   
148  {'path': '/content/drive/My Drive/karya_braj_d...   
149  {'path': '/content/drive/My Drive/karya_braj_d...   

                                         transcription  
0                                          बु घर मिं ।  
1                               हमनै सबेरै चामल बनाए ।  
2                पिताजी नि बच्चन ए लएैं खिलौना खरीदे ।  
3                            मेई भैनि विद्यालै मि एै ।  
4                 

In [18]:
def add_file_path(audio_object):
    # Extract the path from the Audio object
    file_path = audio_object["path"]
    # Prepend the base directory
    full_path = "/content/drive/My Drive/karya_braj_data/translation/281474976710694/" + file_path.split('/')[-1]
    return full_path

# Create a new 'path' column by applying the function to the 'audio' column
def apply_add_file_path(example):
    return {'path': add_file_path(example['audio'])}

train_data = train_data.map(apply_add_file_path)
test_data = test_data.map(apply_add_file_path)


0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [19]:
train_df = pd.DataFrame(train_data)
train_data = Dataset.from_pandas(train_df)
test_df = pd.DataFrame(test_data)
test_data = Dataset.from_pandas(test_df)

chars_to_ignore_regex = '[\,\\\?\.\!\-\;\:\"\“\%\�\।\'‘’|]'

def remove_special_characters(batch):
    batch["transcription"] = re.sub(chars_to_ignore_regex, '', batch["transcription"]).lower() + " "
    return batch

train_data = train_data.map(remove_special_characters)
test_data = test_data.map(remove_special_characters)


def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(train_data.remove_columns(["path"]), num_examples=6)


0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

Unnamed: 0,audio,transcription
0,"{'array': [1.5133991837501526e-09, -1.0477378964424133e-09, -6.984919309616089e-10, 1.862645149230957e-09, 1.5133991837501526e-09, 1.6298145055770874e-09, 4.656612873077393e-10, 1.0477378964424133e-09, 3.725290298461914e-09, -3.3760443329811096e-09, 5.238689482212067e-10, -9.313225746154785e-10, -1.0477378964424133e-09, 2.7939677238464355e-09, 1.4551915228366852e-09, -1.6298145055770874e-09, -5.238689482212067e-10, 2.3865140974521637e-09, 1.280568540096283e-09, -9.313225746154785e-10, -2.3283064365386963e-10, 0.0, 0.0, -1.7462298274040222e-09, -2.3283064365386963e-10, -5.471520125865936e-09, 0.0, -1.280568540096283e-09, -4.190951585769653e-09, -1.862645149230957e-09, -1.1641532182693481e-10, -2.2118911147117615e-09, -2.0954757928848267e-09, 0.0, 6.984919309616089e-10, 2.0954757928848267e-09, 8.149072527885437e-10, -6.984919309616089e-10, -2.7939677238464355e-09, -2.3283064365386963e-09, -8.731149137020111e-10, -4.190951585769653e-09, -6.402842700481415e-10, 0.0, 1.6589183360338211e-09, 1.0477378964424133e-09, -3.2014213502407074e-09, -8.149072527885437e-10, -2.444721758365631e-09, -3.259629011154175e-09, -1.1641532182693481e-10, -6.984919309616089e-10, -4.656612873077393e-10, 2.2118911147117615e-09, -8.149072527885437e-10, -9.313225746154785e-10, -2.561137080192566e-09, -6.984919309616089e-10, 1.7462298274040222e-10, 3.6088749766349792e-09, -9.89530235528946e-10, -1.3096723705530167e-09, -1.2514647096395493e-09, 1.1932570487260818e-09, -6.111804395914078e-10, -4.3655745685100555e-10, -2.153683453798294e-09, -1.57160684466362e-09, 7.566995918750763e-10, -2.3283064365386963e-10, -1.4551915228366852e-09, 2.5029294192790985e-09, 5.326000973582268e-09, 3.7834979593753815e-09, 2.444721758365631e-09, -4.656612873077393e-10, 2.3283064365386963e-10, 4.656612873077393e-10, 1.979060471057892e-09, -6.984919309616089e-10, -6.984919309616089e-10, -6.984919309616089e-10, -6.984919309616089e-10, -1.1641532182693481e-09, 2.3283064365386963e-10, 3.026798367500305e-09, -1.0477378964424133e-09, -4.6566128730773926e-09, -1.862645149230957e-09, -3.026798367500305e-09, 0.0, -2.3283064365386963e-10, -1.4842953532934189e-09, -1.3387762010097504e-09, 1.6007106751203537e-10, 6.402842700481415e-10, -5.413312464952469e-09, -4.016328603029251e-09, -3.6088749766349792e-09, 2.270098775625229e-09, ...], 'path': '/content/drive/My Drive/karya_braj_data/translation/281474976710694/281474976729903.wav', 'sampling_rate': 16000}",मैं मां नहीं जाइ सकतो हतो
1,"{'array': [2.473825588822365e-09, -3.2014213502407074e-09, -1.6007106751203537e-10, 4.103640094399452e-09, 1.9063008949160576e-09, -3.5797711461782455e-09, -4.016328603029251e-09, -3.958120942115784e-09, -6.51925802230835e-09, -3.026798367500305e-09, 4.0745362639427185e-10, -2.3865140974521637e-09, -1.9717845134437084e-09, 6.752088665962219e-09, 9.19681042432785e-09, -1.5133991837501526e-09, 3.470631781965494e-09, 6.170012056827545e-09, 3.7834979593753815e-09, -2.8230715543031693e-09, -1.6007106751203537e-09, 1.367880031466484e-09, 8.149072527885437e-10, -2.852175384759903e-09, 4.598405212163925e-09, -5.587935447692871e-09, 5.820766091346741e-10, 3.4924596548080444e-10, 2.9103830456733704e-10, 2.9103830456733704e-10, -1.8044374883174896e-09, -7.334165275096893e-09, -1.3969838619232178e-09, -1.8044374883174896e-09, -6.402842700481415e-10, -1.1641532182693481e-10, -3.259629011154175e-09, -7.159542292356491e-09, -4.336470738053322e-09, -1.2398231774568558e-08, -7.712515071034431e-10, -5.791662260890007e-09, -1.4988472685217857e-09, -2.9685907065868378e-09, 1.1641532182693481e-10, -9.89530235528946e-10, 3.943569026887417e-09, 2.9685907065868378e-09, 1.9354047253727913e-09, -1.127773430198431e-09, 2.306478563696146e-09, -8.585629984736443e-10, 1.367880031466484e-09, 2.317392500117421e-09, -2.240994945168495e-09, 9.022187441587448e-10, -2.066371962428093e-09, 2.9103830456733704e-09, 3.128661774098873e-09, 5.5442797020077705e-09, -2.8594513423740864e-09, -2.6047928258776665e-09, -1.7462298274040222e-10, 5.005858838558197e-09, 6.082700565457344e-09, 1.4115357771515846e-09, -3.899913281202316e-09, -3.103195922449231e-09, -8.731149137020111e-11, 9.74978320300579e-10, -2.9685907065868378e-09, 1.1641532182693481e-09, 2.444721758365631e-09, 3.3178366720676422e-09, 2.5029294192790985e-09, -1.6880221664905548e-09, -1.979060471057892e-09, -1.7462298274040222e-10, 1.1117663234472275e-08, 3.026798367500305e-09, 2.444721758365631e-09, 7.101334631443024e-09, 2.7939677238464355e-09, 9.89530235528946e-10, 1.57160684466362e-09, 3.3178366720676422e-09, 5.238689482212067e-10, -2.0954757928848267e-09, -4.045432433485985e-09, -1.1932570487260818e-09, 4.656612873077393e-10, 4.656612873077393e-10, -5.529727786779404e-10, -1.1641532182693481e-09, -1.6007106751203537e-09, 6.984919309616089e-10, -4.700268618762493e-09, -4.889443516731262e-09, -1.5133991837501526e-09, 6.984919309616089e-10, ...], 'path': '/content/drive/My Drive/karya_braj_data/translation/281474976710694/281474976730037.wav', 'sampling_rate': 16000}",दरबज्जे की सिकडी टूटी एै
2,"{'array': [2.0954757928848267e-09, -2.764863893389702e-09, -1.0040821507573128e-09, 1.6880221664905548e-09, 1.469743438065052e-09, -4.045432433485985e-09, -1.4842953532934189e-09, 2.066371962428093e-09, 0.0, 1.7462298274040222e-09, -1.1641532182693481e-10, -1.1641532182693481e-09, -3.346940502524376e-10, 2.444721758365631e-09, -1.6298145055770874e-09, -6.402842700481415e-10, -1.4551915228366852e-10, 3.899913281202316e-09, 7.8580342233181e-10, -2.5320332497358322e-09, -1.3387762010097504e-09, 5.820766091346741e-11, 1.673470251262188e-09, 1.0622898116707802e-09, 1.0913936421275139e-09, -5.515175871551037e-09, 1.0695657692849636e-09, -1.7171259969472885e-09, -4.001776687800884e-10, -2.939486876130104e-09, 1.0186340659856796e-10, -5.966285243630409e-09, -4.292814992368221e-09, -2.7430360205471516e-09, -8.985807653516531e-10, 5.748006515204906e-10, -5.238689482212067e-10, -4.94765117764473e-09, -2.975866664201021e-09, -4.234607331454754e-09, -6.184563972055912e-10, -5.0640664994716644e-09, -3.5797711461782455e-09, -2.5756889954209328e-09, 1.5425030142068863e-09, 3.346940502524376e-09, -3.8708094507455826e-09, -4.110916052013636e-09, -4.220055416226387e-09, -3.725290298461914e-09, -9.74978320300579e-10, -1.1314114090055227e-09, 3.637978807091713e-10, 3.0631781555712223e-09, 1.964508555829525e-09, 3.790773916989565e-09, 5.602487362921238e-10, 1.7462298274040222e-10, -4.0745362639427185e-10, 3.1868694350123405e-09, -1.811713445931673e-09, -3.4124241210520267e-09, -2.1100277081131935e-09, 3.245077095925808e-09, 2.168235369026661e-09, 5.384208634495735e-10, -3.041350282728672e-09, -1.6880221664905548e-09, 1.8917489796876907e-10, 2.051820047199726e-09, -1.5279510989785194e-09, -2.9103830456733704e-10, 2.9976945370435715e-09, 8.891220204532146e-09, 3.6161509342491627e-09, 2.4811015464365482e-09, 2.0954757928848267e-09, 3.987224772572517e-09, 2.2118911147117615e-09, 9.604264050722122e-10, -9.458744898438454e-10, 2.5320332497358322e-09, 5.675246939063072e-10, 1.4551915228366852e-10, 1.5133991837501526e-09, 4.685716703534126e-09, 2.2846506908535957e-09, -1.1059455573558807e-09, 7.566995918750763e-10, -1.0477378964424133e-09, 1.4551915228366852e-10, -1.5133991837501526e-09, -9.74978320300579e-10, -2.3283064365386963e-09, 2.1827872842550278e-09, 1.3387762010097504e-09, -2.35741026699543e-09, -3.725290298461914e-09, -2.6193447411060333e-09, 1.5133991837501526e-09, ...], 'path': '/content/drive/My Drive/karya_braj_data/translation/281474976710694/281474976730193.wav', 'sampling_rate': 16000}",मोइ काऊ नि ना देखौ
3,"{'array': [1.57160684466362e-09, -1.1641532182693481e-09, -2.9103830456733704e-10, 5.238689482212067e-10, 6.402842700481415e-10, -2.9103830456733704e-10, -2.852175384759903e-09, 1.1641532182693481e-10, 1.5133991837501526e-09, 4.656612873077393e-10, -9.313225746154785e-10, -1.280568540096283e-09, -1.3387762010097504e-09, -2.735760062932968e-09, -2.6775524020195007e-09, -1.2514647096395493e-09, 1.4260876923799515e-09, 3.2014213502407074e-10, 6.693881005048752e-10, -6.984919309616089e-10, -2.1827872842550278e-09, -3.4924596548080444e-10, 5.820766091346741e-11, -4.307366907596588e-09, -4.642060957849026e-09, -6.475602276623249e-09, 1.8335413187742233e-09, 9.604264050722122e-10, -7.8580342233181e-10, -2.3865140974521637e-09, -1.4260876923799515e-09, -2.0081643015146255e-09, -9.89530235528946e-10, -9.022187441587448e-10, -2.2992026060819626e-09, 1.1641532182693481e-10, -3.4924596548080444e-10, -1.3969838619232178e-09, -3.841705620288849e-09, -3.2887328416109085e-09, -4.8603396862745285e-09, 1.4551915228366852e-10, -3.3178366720676422e-09, -1.8335413187742233e-09, -1.3242242857813835e-09, -1.4260876923799515e-09, 3.4924596548080444e-10, -8.731149137020111e-11, -1.280568540096283e-09, -4.540197551250458e-09, -4.0745362639427185e-10, -5.820766091346741e-10, 8.731149137020111e-10, 1.0477378964424133e-09, -1.7462298274040222e-09, 4.656612873077393e-10, -1.0477378964424133e-09, 1.7462298274040222e-09, -4.656612873077393e-10, 1.57160684466362e-09, -1.57160684466362e-09, 8.149072527885437e-10, 1.1641532182693481e-10, 1.949956640601158e-09, 8.149072527885437e-10, 7.566995918750763e-10, 8.149072527885437e-10, 2.3283064365386963e-10, 9.313225746154785e-10, 7.566995918750763e-10, -7.566995918750763e-10, 1.0477378964424133e-09, 4.307366907596588e-09, 3.3760443329811096e-09, 1.1641532182693481e-10, -4.656612873077393e-10, -1.6298145055770874e-09, 1.280568540096283e-09, 5.122274160385132e-09, -4.656612873077393e-10, -4.0745362639427185e-09, 1.1059455573558807e-09, -2.3283064365386963e-10, -1.3387762010097504e-09, -5.820766091346741e-11, 4.889443516731262e-09, -1.3533281162381172e-09, -2.6775524020195007e-09, -1.1059455573558807e-09, -5.820766091346741e-11, 4.0745362639427185e-10, 2.9103830456733704e-10, -3.14321368932724e-09, -3.026798367500305e-09, -1.4551915228366852e-09, 6.984919309616089e-10, -2.153683453798294e-09, -3.958120942115784e-09, -2.5029294192790985e-09, 1.9208528101444244e-09, ...], 'path': '/content/drive/My Drive/karya_braj_data/translation/281474976710694/281474976730137.wav', 'sampling_rate': 16000}",बु घर में औ
4,"{'array': [6.984919309616089e-10, -3.026798367500305e-09, -2.0954757928848267e-09, -1.5133991837501526e-09, -1.5133991837501526e-09, -4.0745362639427185e-09, -2.2118911147117615e-09, 1.5133991837501526e-09, 4.6566128730773926e-09, 1.6298145055770874e-09, -3.259629011154175e-09, -2.0954757928848267e-09, 1.6298145055770874e-09, 5.005858838558197e-09, 5.238689482212067e-10, -1.7462298274040222e-09, -5.238689482212067e-10, 2.3283064365386963e-10, 1.2223608791828156e-09, -6.111804395914078e-10, 1.5133991837501526e-09, 8.149072527885437e-10, 2.2118911147117615e-09, -4.540197551250458e-09, -9.313225746154785e-10, -4.540197551250458e-09, 2.0954757928848267e-09, 9.313225746154785e-10, -1.979060471057892e-09, -2.6775524020195007e-09, 6.984919309616089e-10, -1.1641532182693481e-09, 0.0, -1.7462298274040222e-09, -1.0477378964424133e-09, -5.238689482212067e-10, -1.4551915228366852e-10, -2.3283064365386963e-09, -8.731149137020111e-10, -2.0227162167429924e-09, -2.561137080192566e-09, -1.280568540096283e-09, -1.0477378964424133e-09, -3.259629011154175e-09, -1.2223608791828156e-09, 8.731149137020111e-10, -3.3760443329811096e-09, -4.656612873077393e-10, -3.725290298461914e-09, -1.979060471057892e-09, -1.2660166248679161e-09, -1.7462298274040222e-10, 1.3387762010097504e-09, 1.3387762010097504e-09, -1.8335413187742233e-09, -3.958120942115784e-09, -1.673470251262188e-09, 6.83940015733242e-10, -8.149072527885437e-10, 3.026798367500305e-09, -2.270098775625229e-09, -9.313225746154785e-10, 1.862645149230957e-09, 4.831235855817795e-09, 3.92901711165905e-09, 3.812601789832115e-09, 8.149072527885437e-10, 2.3283064365386963e-10, 2.240994945168495e-09, 3.958120942115784e-09, 1.280568540096283e-09, 1.979060471057892e-09, 2.3283064365386963e-09, 4.016328603029251e-09, 3.92901711165905e-09, 4.656612873077393e-10, -1.7462298274040222e-09, 4.656612873077393e-10, 1.0477378964424133e-09, 1.4842953532934189e-09, -2.0954757928848267e-09, 1.5133991837501526e-09, -4.0745362639427185e-10, -4.94765117764473e-10, -2.9103830456733704e-10, 2.5029294192790985e-09, -2.270098775625229e-09, -5.471520125865936e-09, -4.0745362639427185e-09, -4.0745362639427185e-09, -2.3283064365386963e-10, -1.3387762010097504e-09, -2.0227162167429924e-09, -2.852175384759903e-09, 2.270098775625229e-09, -1.1641532182693481e-10, -2.2118911147117615e-09, -5.820766091346741e-09, -3.7834979593753815e-09, 4.656612873077393e-10, ...], 'path': '/content/drive/My Drive/karya_braj_data/translation/281474976710694/281474976717848.wav', 'sampling_rate': 16000}",बु बएै एक छोरा देतु एै
5,"{'array': [6.082700565457344e-09, -1.2223608791828156e-09, 9.313225746154785e-10, -1.5133991837501526e-09, -6.984919309616089e-10, -2.9685907065868378e-09, -9.604264050722122e-10, 4.016328603029251e-09, 8.149072527885437e-10, 3.026798367500305e-09, 7.566995918750763e-10, -5.820766091346741e-10, -1.6880221664905548e-09, 5.587935447692871e-09, 1.0011717677116394e-08, 1.7462298274040222e-09, -1.7462298274040222e-10, 1.9208528101444244e-09, 3.725290298461914e-09, 1.2223608791828156e-09, 1.862645149230957e-09, -1.6298145055770874e-09, -1.1641532182693481e-09, 3.4924596548080444e-10, 1.3969838619232178e-09, -4.423782229423523e-09, 1.1641532182693481e-10, -2.2118911147117615e-09, -2.2118911147117615e-09, -2.561137080192566e-09, -4.452886059880257e-09, -6.257323548197746e-09, -2.939486876130104e-09, -2.0372681319713593e-09, -2.1827872842550278e-10, 1.5861587598919868e-09, 3.2014213502407074e-10, -3.841705620288849e-09, -4.6566128730773926e-09, -9.080395102500916e-09, -6.9267116487026215e-09, 1.3969838619232178e-09, -1.6298145055770874e-09, 6.984919309616089e-10, 1.280568540096283e-09, -1.1641532182693481e-10, -1.076841726899147e-08, -5.0640664994716644e-09, -6.693881005048752e-09, -5.3551048040390015e-09, -8.149072527885437e-10, -2.5320332497358322e-09, -3.3760443329811096e-09, -9.022187441587448e-10, -4.2782630771398544e-09, -1.622538547962904e-09, -2.1827872842550278e-09, 1.6589183360338211e-09, -2.3865140974521637e-09, 5.296897143125534e-09, -3.4924596548080444e-10, -3.2014213502407074e-09, 2.3283064365386963e-10, 2.0954757928848267e-09, 3.550667315721512e-09, 4.190951585769653e-09, 3.3760443329811096e-09, -5.238689482212067e-10, -2.9103830456733704e-10, 1.2223608791828156e-09, -5.820766091346741e-10, 3.6961864680051804e-09, 1.8917489796876907e-09, 2.713932190090418e-09, 6.548361852765083e-10, 2.4883775040507317e-09, -1.1350493878126144e-09, -1.6007106751203537e-09, 1.6007106751203537e-09, 4.656612873077393e-10, 7.566995918750763e-10, 2.852175384759903e-09, 7.566995918750763e-10, -7.566995918750763e-10, 2.9103830456733704e-10, 2.735760062932968e-09, 2.270098775625229e-09, -4.540197551250458e-09, -7.566995918750763e-10, -6.402842700481415e-10, 2.561137080192566e-09, 4.0745362639427185e-10, -3.434251993894577e-09, -3.14321368932724e-09, 8.149072527885437e-10, 2.735760062932968e-09, -2.3865140974521637e-09, -4.48198989033699e-09, -3.6670826375484467e-09, 8.149072527885437e-10, ...], 'path': '/content/drive/My Drive/karya_braj_data/translation/281474976710694/281474976729866.wav', 'sampling_rate': 16000}",बु सिर्फ बई के परिबार के लएै काम कर्रई एै


In [20]:
def extract_all_chars(batch):
  all_text = " ".join(batch["transcription"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

# Apply the function to the dataset
vocab_test =  test_data.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns= test_data.column_names)
vocab_test = test_data.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=test_data.column_names)

# Combine the vocabularies from  test and test datasets
vocab_list = list(set(vocab_test["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}


vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

# Verify the vocabulary
print("Vocabulary:", vocab_list)


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Vocabulary: ['ि', 'ं', 'औ', 'अ', 'फ', 'ब', 'आ', 'छ', 'ह', 'क', 'म', 'र', 'ी', 'ु', 'ऊ', 'ए', 'द', 'ड', 'ड़', 'ल', 'ौ', 'व', 'न', 'ष', 'ॉ', 'थ', 'ृ', 'ओ', 'ख', 'ा', 'च', 'भ', 'ध', 'ज', 'ू', '0', 'स', 'प', 'ई', 'इ', 'ै', 'त', '्', 'य', 'घ', 'ग', 'ढ', '1', 'झ', 'े', 'ठ', ' ', 'उ', '़', 'ण', 'ज़', 'ो', 'ट']


In [21]:
tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained("/content/drive/My Drive/wav2vec2-large-xlsr-hindi/")

[]

In [None]:
train_data

Dataset({
    features: ['audio', 'transcription', 'path'],
    num_rows: 295
})

In [22]:
#!pip install torchaudio
#import torchaudio
#import numpy as np
np.object = object

def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["transcription"]
    return batch

train_data = train_data.map(speech_file_to_array_fn, remove_columns=train_data.column_names)
test_data = test_data.map(speech_file_to_array_fn, remove_columns=test_data.column_names)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [23]:
import librosa
import numpy as np

def resample(batch):
    batch["speech"] = librosa.resample(np.asarray(batch["speech"]), orig_sr=8000, target_sr=16_000)
    batch["sampling_rate"] = 16_000
    return batch

train_data = train_data.map(resample, num_proc=4)
test_data = test_data.map(resample, num_proc=4)

  self.pid = os.fork()
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [26]:
#import random
import IPython.display as ipd
rand_int = random.randint(0, len(train_data))

print(train_data[rand_int]["target_text"])

ipd.Audio(data=(train_data[rand_int]["speech"]), autoplay=True, rate=16000)

बु रोज बजार जात एैं  


In [27]:
print("Target text:", train_data[rand_int]["target_text"])
print("Input array shape:", np.asarray(train_data[rand_int]["speech"]).shape)
print("Sampling rate:", train_data[rand_int]["sampling_rate"])

Target text: बु रोज बजार जात एैं  
Input array shape: (29722,)
Sampling rate: 16000


In [28]:
rand_int = random.randint(0, len(test_data))
print(test_data[rand_int]["target_text"])
ipd.Audio(data=np.asarray(test_data[rand_int]["speech"]), autoplay=True, rate=16000)

मुझे यह काम करने दो 


In [29]:
print("Target text:", test_data[rand_int]["target_text"])
print("Input array shape:", np.asarray(train_data[rand_int]["speech"]).shape)
print("Sampling rate:", test_data[rand_int]["sampling_rate"])

Target text: मुझे यह काम करने दो 
Input array shape: (32322,)
Sampling rate: 16000


In [30]:
def prepare_dataset(batch):

    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values

    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch



train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, batch_size=8, num_proc=4, batched=True)
test_data = test_data.map(prepare_dataset, remove_columns=test_data.column_names, batch_size=8, num_proc=4, batched=True)

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 #0:   0%|          | 0/19 [00:00<?, ?ba/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 #1:   0%|          | 0/19 [00:00<?, ?ba/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 #2:   0%|          | 0/19 [00:00<?, ?ba/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 #3:   0%|          | 0/19 [00:00<?, ?ba/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 #0:   0%|          | 0/5 [00:00<?, ?ba/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 #1:   0%|          | 0/5 [00:00<?, ?ba/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 #2:   0%|          | 0/5 [00:00<?, ?ba/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 #3:   0%|          | 0/5 [00:00<?, ?ba/s]



In [31]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:

        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch

In [32]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
wer_metric = load_metric("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)

    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

Downloading:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

In [33]:
!pip install accelerate==0.30.1

Collecting accelerate==0.30.1
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/302.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.32.1
    Uninstalling accelerate-0.32.1:
      Successfully uninstalled accelerate-0.32.1
Successfully installed accelerate-0.30.1


In [None]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

model.freeze_feature_extractor()


training_args = TrainingArguments(
  output_dir="/content/drive/My Drive/wav2vec2-large-xlsr-hindi/",
  group_by_length=True,
  per_device_train_batch_size=8,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=35,
  fp16=True,
  remove_unused_columns=False,
  gradient_checkpointing=True,
  save_steps=50,
  eval_steps=50,
  logging_steps=5,
  learning_rate=1e-4,
  warmup_steps=100,
  save_total_limit=2,
)
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=processor.feature_extractor,
)


config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()



Step,Training Loss,Validation Loss,Wer
50,10.1054,8.988542,1.0
100,3.7642,3.591779,1.0
150,3.3825,3.354243,1.0
200,3.3524,3.328186,1.0
250,3.3006,3.281578,1.0
300,3.2746,3.205744,1.0
350,3.1942,3.135901,1.0
400,3.1249,3.086755,1.001244
450,3.0511,3.026576,1.001244
500,3.0841,2.981641,0.992537




TrainOutput(global_step=1295, training_loss=3.0187599959060494, metrics={'train_runtime': 2702.6773, 'train_samples_per_second': 7.731, 'train_steps_per_second': 0.479, 'total_flos': 1.3581920464839462e+18, 'train_loss': 3.0187599959060494, 'epoch': 34.53333333333333})

In [None]:
 output_dir="/content/drive/MyDrive/wav2vec2-large-xlsr-hindi/"
 model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
 model_to_save.save_pretrained(output_dir)
 trainer.save_model(output_dir)

In [None]:
!pip install datasets
!pip install soundfile
!pip install transformers



In [None]:
import os
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import soundfile as sf
import torchaudio
import numpy as np
# Load the trained model and processor
model = Wav2Vec2ForCTC.from_pretrained("/content/drive/My Drive/wav2vec2-large-xlsr-hindi/")
processor = Wav2Vec2Processor.from_pretrained("/content/drive/My Drive/wav2vec2-large-xlsr-hindi/")

# Function to transcribe audio
def transcribe_audio(file_path):
    # Load audio file
    speech, sample_rate = sf.read(file_path)

    # Resample the audio to 16000 Hz if necessary
    if sample_rate != 16000:
        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)


    # Preprocess the audio
    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values

    # Perform inference
    with torch.no_grad():
        logits = model(input_values).logits

    # Decode the predicted ids to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    return transcription[0]

# Function to transcribe all .wav files in a directory
def transcribe_directory(dataset_path):
    transcriptions = {}
    for filename in os.listdir(dataset_path):
        if filename.endswith(".wav"):
            file_path = os.path.join(dataset_path, filename)
            transcription = transcribe_audio(file_path)
            transcriptions[filename] = transcription
    return transcriptions

# Example usage
dataset_path = "/content/drive/MyDrive/karya_braj_data/translation/281474976710698"

# Transcribe all .wav files in the directory
transcriptions = transcribe_directory(dataset_path)
for filename, transcription in transcriptions.items():
    print(f"Transcription for {filename}: {transcription}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Transcription for 281474976727598.wav: मै कसारा  नैं साकौ
Transcription for 281474976727586.wav: तुम रोचामर खात
Transcription for 281474976727588.wav: बन ुोरितर पैरे चा बरखा एै
Transcription for 281474976727595.wav: बु कल बजार गा तौ
Transcription for 281474976727599.wav: मतै पुरै सान न गरैगै
Transcription for 281474976727590.wav: का तुम मोा जानत
Transcription for 281474976727587.wav: मै बाएै गत्ति बाएै लै जाब चातु सुम मर ग
Transcription for 281474976727596.wav: तम बएक मेला म जा रा
Transcription for 281474976727597.wav: बौोन तुमा बलौ कर एै
Transcription for 281474976727592.wav: बु मो नमपेरै र एै
Transcription for 281474976727594.wav: मै बाएै कोच्चा दै रौ
Transcription for 281474976727589.wav: मै पास पसा ना ै
Transcription for 281474976727593.wav: बु कलि कलकत्तौ जाऔ
Transcription for 281474976727566.wav: तुर पुरम मै गतु एै
Transcription for 281474976727585.wav: बु गर मे ग
Transcription for 281474976727583.wav: तुम माै र बचचागो
Transcription for 281474976727570.wav: मै सलि चाबर पात क
Tran