<a href="https://colab.research.google.com/github/WhozKunal/speehtotext_assamese/blob/main/Untitled14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import os

# Load the pre-trained model and processor
processor = Wav2Vec2Processor.from_pretrained("infinitejoy/Wav2Vec2-Large-XLSR-53-Assamese")
model = Wav2Vec2ForCTC.from_pretrained("infinitejoy/Wav2Vec2-Large-XLSR-53-Assamese")

# Move model to GPU if available
model.to("cuda")

# Directory where the audio files are stored
audio_directory = "/content/clips"

# Get list of audio files in the directory
audio_files = [f for f in os.listdir(audio_directory) if f.endswith(".mp3")]

# Resampler to convert audio to 16kHz
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Output file to save transcriptions
output_file = "/content/transcriptions.txt"

# Open the output file in write mode
with open(output_file, 'w', encoding='utf-8') as f:
    # Iterate over each audio file and process it
    for audio_file in audio_files:
        audio_path = os.path.join(audio_directory, audio_file)

        # Load the audio file
        waveform, sample_rate = torchaudio.load(audio_path)

        # Resample the audio to 16kHz (if needed)
        waveform = resampler(waveform)

        # Process the audio using the processor
        inputs = processor(waveform.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt")

        # Run the audio through the model
        with torch.no_grad():
            logits = model(inputs.input_values.to("cuda")).logits

        # Get the predicted ids (the word tokens)
        predicted_ids = torch.argmax(logits, dim=-1)

        # Decode the predicted ids to text
        transcription = processor.batch_decode(predicted_ids)

        # Save the transcription for each audio file in the output file
        f.write(f"Predicted Transcription for {audio_file}: {transcription[0]}\n")

print(f"Transcriptions saved to {output_file}")

pytorch_model.bin:  66%|######6   | 839M/1.26G [00:00<?, ?B/s]

Transcriptions saved to /content/transcriptions.txt


In [1]:
!pip install -q datasets evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import os
from datasets import Dataset
import torchaudio
from transformers import Wav2Vec2Processor

# Define paths
audio_directory = '/content/clips'  # Directory containing the audio files
transcription_file = '/content/transcriptions.txt'  # Path to the transcription file

# Define a function to load audio and preprocess it
def load_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    # Optionally, resample to 16kHz
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return waveform.squeeze()


# Function to parse transcription.txt file
def parse_transcription_file(transcription_file):
    audio_files = []
    transcriptions = []

    with open(transcription_file, 'r', encoding='utf-8') as file:
        for line in file.readlines():
            # Split by ': ' assuming format 'Predicted Transcription for {audio_file}: {transcription}'
            if 'Predicted Transcription for' in line:
                parts = line.split(': ')
                audio_file = parts[0].split('for ')[1]  # Extract audio file name
                transcription = parts[1].strip()  # Extract transcription
                audio_files.append(audio_file)
                transcriptions.append(transcription)

    return audio_files, transcriptions

# Load the audio files and transcriptions from the transcription file
audio_files, transcriptions = parse_transcription_file(transcription_file)

# Create a Dataset from the loaded data
data = {'audio': audio_files, 'transcription': transcriptions}
dataset = Dataset.from_dict(data)

# Preprocess the dataset
def preprocess_data(example):
    # Load audio and process transcription
    example['audio'] = load_audio(os.path.join(audio_directory, example['audio']))
    example['transcription'] = process_transcription(example['transcription'])
    return example

# Apply preprocessing
dataset = dataset.map(preprocess_data)

# Print a sample from the dataset to check
print(dataset[0])  # Print the first sample from the dataset

Map:   0%|          | 0/509 [00:00<?, ? examples/s]

NameError: name 'process_transcription' is not defined

In [13]:
import os
from datasets import Dataset
import torchaudio
from transformers import Wav2Vec2Processor

# Define paths
audio_directory = '/content/clips'  # Directory containing the audio files
transcription_file = '/content/test.tsv'  # Path to the transcription file

# Define a function to load and preprocess audio
def load_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return waveform.squeeze()

# Function to parse transcription.txt file
def parse_transcription_file(transcription_file):
    audio_files = []
    transcriptions = []

    with open(transcription_file, 'r', encoding='utf-8') as file:
        for line in file.readlines():
            if 'Predicted Transcription for' in line:
                parts = line.split(': ')
                audio_file = parts[0].split('for ')[1]
                transcription = parts[1].strip()
                audio_files.append(audio_file)
                transcriptions.append(transcription)

    return audio_files, transcriptions

# Load audio files and transcriptions
audio_files, transcriptions = parse_transcription_file(transcription_file)

# Create a dataset
data = {'audio': audio_files, 'transcription': transcriptions}
dataset = Dataset.from_dict(data)

# Load processor
processor = Wav2Vec2Processor.from_pretrained("infinitejoy/Wav2Vec2-Large-XLSR-53-Assamese")

# Define preprocess_data function
def preprocess_data(example):
    # Load and process audio
    waveform = load_audio(os.path.join(audio_directory, example['audio']))
    input_values = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding=True).input_values[0]
    example['input_values'] = input_values

    # Process transcription
    transcription = example['transcription'].lower()  # Lowercase transcription
    labels = processor.tokenizer(transcription, return_tensors="pt", padding=True).input_ids[0]
    example['labels'] = labels

    return example

# Apply preprocessing
dataset = dataset.map(preprocess_data)

# Print a sample from the dataset
print(dataset[0])

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/705 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

IndexError: Invalid key: 0 is out of bounds for size 0

In [3]:
if "labels" in dataset[0]:
  print("yes")
else:
  print("no")

yes


In [4]:
for n in dataset[0]:
  print(n)

audio
transcription
input_values
labels


In [5]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the pre-trained model
model_name = "infinitejoy/Wav2Vec2-Large-XLSR-53-Assamese"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned-as",  # Directory to save checkpoints
    group_by_length=True,                 # Group audio samples of similar lengths
    per_device_train_batch_size=4,        # Adjust based on your GPU memory
    evaluation_strategy="steps",          # Evaluate during training
    save_steps=500,                       # Save checkpoint every 500 steps
    eval_steps=500,                       # Evaluate every 500 steps
    logging_steps=100,                    # Log metrics every 100 steps
    learning_rate=3e-4,                   # Learning rate for fine-tuning
    warmup_steps=500,                     # Warmup steps
    num_train_epochs=3,                   # Number of training epochs
)




In [7]:
!pip install --upgrade transformers



In [8]:
class CustomDataCollator:
    def __init__(self, processor, padding=True):
        self.processor = processor
        self.padding = padding

    def __call__(self, features):
        # Separate the features (audio and transcription)
        audio_features = [feature['input_values'] for feature in features]
        labels = [feature['labels'] for feature in features]

        # Apply padding to audio features and labels
        # Convert audio_features list to a dictionary for padding
        audio_features = {'input_values': audio_features}
        audio_features = self.processor.pad(
            audio_features, padding=self.padding, return_tensors="pt"
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(label) for label in labels], batch_first=True, padding_value=-100
        )

        return {'input_values': audio_features['input_values'], 'labels': labels}

In [9]:
dataset

Dataset({
    features: ['audio', 'transcription', 'input_values', 'labels'],
    num_rows: 509
})

In [15]:
class CustomDataCollator:
    def __init__(self, processor, padding=True):
        self.processor = processor
        self.padding = padding

    def __call__(self, features):
        # Separate the features (audio and transcription)
        import torch # Import torch here
        audio_features = [feature['input_values'] for feature in features]
        labels = [feature['labels'] for feature in features]

        # Apply padding to audio features and labels
        # Convert audio_features list to a dictionary for padding
        audio_features = {'input_values': audio_features}
        audio_features = self.processor.pad(
            audio_features, padding=self.padding, return_tensors="pt"
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(label) for label in labels], batch_first=True, padding_value=-100
        )

        return {'input_values': audio_features['input_values'], 'labels': labels}

In [20]:
!pip install -q jiwer

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/3.1 MB[0m [31m43.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m66.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
import evaluate

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = pred_logits.argmax(-1)
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    # Compute Word Error Rate (WER)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [13]:
# Split the dataset into training and validation sets (80% for training, 20% for validation)
dataset = dataset.train_test_split(test_size=0.2)

In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",  # Directory to store the results
    evaluation_strategy="epoch",  # Evaluate after each epoch
    per_device_train_batch_size=16,  # Adjust based on your available GPU memory
    per_device_eval_batch_size=16,  # Adjust based on your available GPU memory
    num_train_epochs=3,  # Number of epochs for training
    logging_dir="./logs",  # Directory to store logs
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=CustomDataCollator(processor=processor, padding=True),
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=processor.feature_extractor,  # Link the processor
)


trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.345903
2,No log,1.206136
3,No log,1.174783


TrainOutput(global_step=78, training_loss=1.4566136873685396, metrics={'train_runtime': 629.7762, 'train_samples_per_second': 1.939, 'train_steps_per_second': 0.124, 'total_flos': 3.382171663706899e+17, 'train_loss': 1.4566136873685396, 'epoch': 3.0})

In [17]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 1.1747831106185913, 'eval_runtime': 23.464, 'eval_samples_per_second': 4.347, 'eval_steps_per_second': 0.298, 'epoch': 3.0}


In [18]:
model.save_pretrained("fine_tuned_model")
processor.save_pretrained("fine_tuned_model")

[]

In [4]:
from transformers import pipeline

asr_pipeline = pipeline("automatic-speech-recognition", model="fine_tuned_model")
result = asr_pipeline("/content/common_voice_as_41243064.mp3")
print(result["text"])

ValueError: Unrecognized model in fine_tuned_model. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, graphormer, grounding-dino, groupvit, hiera, hubert, ibert, idefics, idefics2, idefics3, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, time_series_transformer, timesformer, timm_backbone, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zoedepth

In [22]:
import evaluate

wer_metric = evaluate.load("wer")

# Assuming `predictions` and `references` are lists of strings
wer = wer_metric.compute(predictions=predictions, references=references)
print(f"Word Error Rate (WER): {wer:.2%}")

NameError: name 'predictions' is not defined

In [8]:
import pandas as pd

df = pd.read_csv("/content/test.tsv", sep="\t")
df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,4d49bfe22b9c07fce9de4613cb2b50506a859d8e2f95ad...,common_voice_as_25111102.mp3,আমাৰ ঘৰত তলা মৰা অৱস্থাত ওছৰৰে ঘৰত মানুহ নথকা ...,2,0,,,,,as,
1,7f7e3d4885ca37cffd2823d99e37149d6edbd02c7e82ae...,common_voice_as_25375265.mp3,"মন ভালো লগা নাই, বেয়াও, নগা নাই।",2,0,,,,,as,
2,968b64cab86cb279cea4b0e444c89e8a729aae489411c4...,common_voice_as_27252115.mp3,ইয়াৰ পাচত সূৰ্য্যই যেতিয়া ৰোগীক কেনে দেখিলে সু...,2,0,,,,,as,
3,d35e12b7e4479c77a2b5685595d0146d9c965d39536dcd...,common_voice_as_23664176.mp3,তেওঁ ৰামৰ এজন ভাল অনুগামী।,2,0,,,,,as,
4,a3e26ba721c76f6e9863a24aa42fb5c900855d5bd70a41...,common_voice_as_21808341.mp3,যদিও এই উদ্যোগৰ ইতিহাস পুৰণি।,2,1,,,,,as,


In [9]:
df.columns

Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',
       'gender', 'accents', 'variant', 'locale', 'segment'],
      dtype='object')

In [10]:
df = df.drop(columns = ['client_id', 'up_votes', 'down_votes', 'age',
       'gender', 'accents', 'variant', 'locale', 'segment'], axis = 1)

In [11]:
df.columns

Index(['path', 'sentence'], dtype='object')

In [12]:
df.to_csv("test.tsv", sep="\t", index=False)

In [14]:
import os
import pandas as pd
from datasets import Dataset
import torchaudio
from transformers import Wav2Vec2Processor

# Define paths
audio_directory = '/content/clips'  # Directory containing the audio files
transcription_file = '/content/test.tsv'  # Path to the transcription file

# Define a function to load and preprocess audio
def load_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return waveform.squeeze()

# Function to load the TSV file
def load_transcription_file(transcription_file):
    df = pd.read_csv(transcription_file, sep='\t')
    audio_files = df['path'].tolist()  # List of audio file paths
    transcriptions = df['sentence'].tolist()  # List of transcriptions
    return audio_files, transcriptions

# Load audio files and transcriptions
audio_files, transcriptions = load_transcription_file(transcription_file)

# Create a dataset
data = {'audio': audio_files, 'transcription': transcriptions}
dataset = Dataset.from_dict(data)

# Load processor
processor = Wav2Vec2Processor.from_pretrained("infinitejoy/Wav2Vec2-Large-XLSR-53-Assamese")

# Define preprocess_data function
def preprocess_data(example):
    # Load and process audio
    waveform = load_audio(os.path.join(audio_directory, example['audio']))
    input_values = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding=True).input_values[0]
    example['input_values'] = input_values

    # Process transcription
    transcription = example['transcription'].lower()  # Lowercase transcription
    labels = processor.tokenizer(transcription, return_tensors="pt", padding=True).input_ids[0]
    example['labels'] = labels

    return example

# Apply preprocessing
dataset = dataset.map(preprocess_data)

# Print a sample from the dataset
print(dataset[0])



Map:   0%|          | 0/548 [00:00<?, ? examples/s]

{'audio': 'common_voice_as_25111102.mp3', 'transcription': 'আমাৰ ঘৰত তলা মৰা অৱস্থাত ওছৰৰে ঘৰত মানুহ নথকা সময়ত চুৰি হৈছিল।', 'input_values': [0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499753686133772, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.00022499755141325295, 0.0002249975805170834, 0.00022499755141325295, 0.00022499755141325295, 0.0002249975805170834, 0.00022499755141325295, 0.00022499753686133772, 0.00022499744954984635, 0.00022499755141325295, 0.00022499759506899863, 0.00022499753686133772, 0.0002249976241728291, 0.0002249976096209138

In [15]:
for n in dataset[0]:
  print(n)

audio
transcription
input_values
labels


In [16]:
dataset

Dataset({
    features: ['audio', 'transcription', 'input_values', 'labels'],
    num_rows: 548
})

In [17]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the pre-trained model
model_name = "infinitejoy/Wav2Vec2-Large-XLSR-53-Assamese"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)



pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned-as",  # Directory to save checkpoints
    group_by_length=True,                 # Group audio samples of similar lengths
    per_device_train_batch_size=4,        # Adjust based on your GPU memory
    evaluation_strategy="steps",          # Evaluate during training
    save_steps=500,                       # Save checkpoint every 500 steps
    eval_steps=500,                       # Evaluate every 500 steps
    logging_steps=100,                    # Log metrics every 100 steps
    learning_rate=3e-4,                   # Learning rate for fine-tuning
    warmup_steps=500,                     # Warmup steps
    num_train_epochs=3,                   # Number of training epochs
)



In [28]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import os

In [29]:
class CustomDataCollator:
    def __init__(self, processor, padding=True):
        self.processor = processor
        self.padding = padding

    def __call__(self, features):
        # Separate the features (audio and transcription)
        audio_features = [feature['input_values'] for feature in features]
        labels = [feature['labels'] for feature in features]

        # Apply padding to audio features and labels
        # Convert audio_features list to a dictionary for padding
        audio_features = {'input_values': audio_features}
        audio_features = self.processor.pad(
            audio_features, padding=self.padding, return_tensors="pt"
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(label) for label in labels], batch_first=True, padding_value=-100
        )

        return {'input_values': audio_features['input_values'], 'labels': labels}

In [26]:
import evaluate

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = pred_logits.argmax(-1)
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    # Compute Word Error Rate (WER)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


In [27]:
# Split the dataset into training and validation sets (80% for training, 20% for validation)
dataset = dataset.train_test_split(test_size=0.2)

AttributeError: 'DatasetDict' object has no attribute 'train_test_split'

In [23]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'input_values', 'labels'],
        num_rows: 438
    })
    test: Dataset({
        features: ['audio', 'transcription', 'input_values', 'labels'],
        num_rows: 110
    })
})

In [30]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",  # Directory to store the results
    evaluation_strategy="epoch",  # Evaluate after each epoch
    per_device_train_batch_size=16,  # Adjust based on your available GPU memory
    per_device_eval_batch_size=16,  # Adjust based on your available GPU memory
    num_train_epochs=3,  # Number of epochs for training
    logging_dir="./logs",  # Directory to store logs
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=CustomDataCollator(processor=processor, padding=True),
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=processor.feature_extractor,  # Link the processor
)


trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.952172
2,No log,0.871976
3,No log,0.862258


TrainOutput(global_step=84, training_loss=1.1990020388648623, metrics={'train_runtime': 644.1297, 'train_samples_per_second': 2.04, 'train_steps_per_second': 0.13, 'total_flos': 3.450960208738862e+17, 'train_loss': 1.1990020388648623, 'epoch': 3.0})

In [31]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.862257719039917, 'eval_runtime': 14.8686, 'eval_samples_per_second': 7.398, 'eval_steps_per_second': 0.471, 'epoch': 3.0}


In [32]:
model.save_pretrained("fine_tuned_model")
processor.save_pretrained("fine_tuned_model")

[]

In [33]:
from transformers import pipeline

asr_pipeline = pipeline("automatic-speech-recognition", model="fine_tuned_model")
result = asr_pipeline("/content/common_voice_as_41243064.mp3")
print(result["text"])

Device set to use cuda:0


এতিয়া দদায়েৰ লাগিছে শুঁ মদন মন্দলৰ ভাযয়েকলৈ দিবলোৈ


In [2]:
import pandas as pd
df_test = pd.read_csv("/content/other.tsv", sep="\t")
df_test.head(5)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,e0dffcb0aad1d31b211d8d83e8f483c5fa43be3da81c79...,common_voice_as_32546055.mp3,নীলাচল পাহাৰত অৱস্থিত শক্তিপীঠ কামাখ্যা মন্দিৰ।,1,0,,,,,as,
1,e0dffcb0aad1d31b211d8d83e8f483c5fa43be3da81c79...,common_voice_as_32546167.mp3,সেই দুখেৰে ভৰা মুখৰ ভিতৰৰ পৰাও সৌন্দৰ্য ফুটি ও...,1,0,twenties,male,,,as,
2,e0dffcb0aad1d31b211d8d83e8f483c5fa43be3da81c79...,common_voice_as_32546230.mp3,এখন ৰাজ্যত এটা বৰ ডাঙৰ হাতী আৰু এটা বাঘ আছিল।,1,0,twenties,male,,,as,
3,e0dffcb0aad1d31b211d8d83e8f483c5fa43be3da81c79...,common_voice_as_32546232.mp3,যেতিয়া বাঘ মৰাৰ দিন পৰিলহি।,1,0,twenties,male,,,as,
4,e0dffcb0aad1d31b211d8d83e8f483c5fa43be3da81c79...,common_voice_as_32546233.mp3,এদিন এটা দুখীয়া মানুহে জকাই বাবলৈ গৈ দিনৰ দিন...,1,0,twenties,male,,,as,


In [3]:
df_test.columns

Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',
       'gender', 'accents', 'variant', 'locale', 'segment'],
      dtype='object')

In [4]:
df_test = df_test.drop(columns=['client_id', 'up_votes', 'down_votes', 'age',
       'gender', 'accents', 'variant', 'locale', 'segment'], axis =1)

In [5]:
df_test.to_csv("other.tsv", sep = '\t', index=False)

In [1]:
!pip install -q evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m35.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
!pip install -q jiwer

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/3.1 MB[0m [31m16.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m2.0/3.1 MB[0m [31m28.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m34.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
from transformers import pipeline
import evaluate

# Load the fine-tuned model using the ASR pipeline
model_path = "/content/fine_tuned_model"  # Replace with the actual directory where your model is saved
asr_pipeline = pipeline("automatic-speech-recognition", model=model_path)

# Load the WER metric from the evaluate library
wer_metric = evaluate.load("wer")

# Path to the audio file
audio_file = "/content/common_voice_as_35924720.mp3"  # Replace with the path to your audio file

# Reference transcription (ground truth)
reference_transcription = "তথাপি যে ছোৱালীজনীৰ মূৰ খাৰলৈ ওলাইছে, সেইটো অকল সম্পত্তিৰ লোভত।" # Replace with the actual transcription

# Use the ASR pipeline to transcribe the audio file
result = asr_pipeline(audio_file)
predicted_transcription = result["text"]

# Calculate Word Error Rate (WER)
wer = wer_metric.compute(predictions=[predicted_transcription], references=[reference_transcription])

# Print the predicted and reference transcriptions along with the WER
print("Predicted Transcription:", predicted_transcription)
print("Reference Transcription:", reference_transcription)
print("WER:", wer)

Device set to use cpu


Predicted Transcription: তচাকিযী চোৱালী যেনৰ নৰখাৰ লৈ ওলাইছিল সেইতটো অসল সম্পগ্তেিলোব
Reference Transcription: তথাপি যে ছোৱালীজনীৰ মূৰ খাৰলৈ ওলাইছে, সেইটো অকল সম্পত্তিৰ লোভত।
WER: 1.0


In [9]:
wer_metric = evaluate.load("wer")

# Normalize the transcriptions by lowercasing and stripping whitespace
predicted_transcription = predicted_transcription.strip().lower()
reference_transcription = reference_transcription.strip().lower()

# Compute WER after normalizing text
wer = wer_metric.compute(predictions=[predicted_transcription], references=[reference_transcription])

print(f"Predicted Transcription: {predicted_transcription}")
print(f"Reference Transcription: {reference_transcription}")
print(f"WER: {wer}")


Predicted Transcription: তেতিয়া মালিনিয়ে গৰু ওচৰ লৈ গৈ গৰু সধিলে
Reference Transcription: তেতিয়া মালিনীয়ে গৰুৰ ওচৰলৈ গৈ গৰুক সুধিলে
WER: 1.0


In [12]:
from transformers import pipeline
import evaluate
import torch

# Load the fine-tuned model using the ASR pipeline
model_path = "/content/fine_tuned_model"  # Replace with the actual directory where your model is saved

# Check if GPU is available
device = 0 if torch.cuda.is_available() else -1

# Load the ASR pipeline
asr_pipeline = pipeline("automatic-speech-recognition", model=model_path, device=device)

# Load the WER metric from the evaluate library
wer_metric = evaluate.load("wer")

# Path to the audio file
audio_file = "/content/common_voice_as_35924720.mp3"  # Replace with the path to your audio file

# Reference transcription (ground truth)
reference_transcription = "তথাপি যে ছোৱালীজনীৰ মূৰ খাৰলৈ ওলাইছে, সেইটো অকল সম্পত্তিৰ লোভত।"  # Replace with the actual transcription

# Use the ASR pipeline to transcribe the audio file
result = asr_pipeline(audio_file)
predicted_transcription = result["text"]

# Calculate Word Error Rate (WER)
wer = wer_metric.compute(predictions=[predicted_transcription], references=[reference_transcription])

# Print the predicted and reference transcriptions along with the WER
print("Predicted Transcription:", predicted_transcription)
print("Reference Transcription:", reference_transcription)
print("WER:", wer)


Device set to use cpu


Predicted Transcription: তচাকিযী চোৱালী যেনৰ নৰখাৰ লৈ ওলাইছিল সেইতটো অসল সম্পগ্তেিলোব
Reference Transcription: তথাপি যে ছোৱালীজনীৰ মূৰ খাৰলৈ ওলাইছে, সেইটো অকল সম্পত্তিৰ লোভত।
WER: 1.0
