In [None]:
# !gdown https://drive.google.com/uc?id=1LRO9eSJazJ9WDjjuOu2gcUh07FNwsE8I

In [None]:
# !unzip checkpoint-8000.zip

In [None]:
# fileid = '1XbQCdf2piJYSgEmxcRBXCDr_ZDykSr9W'

In [None]:
# !gdown --id fileid -O data.zip

# Downloading the dataset. gdown makes it easy to download files that are hosted on Google Drive


In [None]:
# downloading the files in the runtime
!gdown https://drive.google.com/uc?id=1l-kq6-KGOt4mG3OvfamzkotMjRheP2Sh

In [None]:
!rm -r dataset/

### Unzipping the file

In [None]:
!unzip dataset.zip -d dataset/

# Installing Necessary Packages

In [None]:
# installing necessary pacakges
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]
!pip install jiwer
!pip install transformers==4.42.3
!pip install tensorboard
!pip install torch torchvision torchaudio


# Importing All the Packages

In [None]:
# importing necessary packages
import pandas as pd
import torchaudio
from datasets import load_dataset, Dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer
import numpy as np
import torch

# Setting the paths according to where the dataset is located on the drive

In [None]:
# Locations of the transcript index and of the audio files it refers to.
csv_file_path = '/content/dataset/dataset_detail.csv'
# prefix used later to turn the CSV's relative audio paths into absolute ones
directory_path = "/content/dataset/"

# Load the CSV holding relative audio paths and transcriptions, then drop the
# leftover "Unnamed..." columns and every row without a transcription.
df = (
    pd.read_csv(csv_file_path)
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .dropna(subset=['Transcriptions'])
)



In [None]:
to_remove = ['01f71cec1d.wav',
 '01fbdfe8ca.wav',
 '04ae522dfc.wav',
 '051c4e3e24.wav',
 '057c3d0e25.wav',
 '05a5716243.wav',
 '08323d719c.wav',
 '10387db7e0.wav',
 '12259dcba4.wav',
 '136bc20fe4.wav',
 '139b690b70.wav',
 '151fb2a8e4.wav',
 '15afdea887.wav',
 '15eec3a40a.wav',
 '15fe8ea284.wav',
 '1847a4298a.wav',
 '1b7145cdf5.wav',
 '1bb3be4e96.wav',
 '1c0975c58d.wav',
 '1c1a59b5f3.wav',
 '1c455ae232.wav',
 '1c687a5c26.wav',
 '1d17425ea4.wav',
 '1e812de253.wav',
 '1f5d72d3b5.wav',
 '1fc01decd0.wav',
 '2282ffca23.wav',
 '22c94b7e72.wav',
 '25187ac404.wav',
 '296d0cb877.wav',
 '2a072732e1.wav',
 '2b224b6039.wav',
 '2ebbb58188.wav',
 '2f129fb2e8.wav',
 '2f576e2987.wav',
 '2f590de05c.wav',
 '32913511ce.wav',
 '337b4e94ac.wav',
 '33a693fff1.wav',
 '343d6de904.wav',
 '347c7b330e.wav',
 '36620057c1.wav',
 '3ba92920d8.wav',
 '3c7458aa2d.wav',
 '3cc1d55bd9.wav',
 '3df39d8d40.wav',
 '3f05d05549.wav',
 '3ff9557310.wav',
 '426c2599ae.wav',
 '4405236e0f.wav',
 '4515570fb0.wav',
 '45497ed527.wav',
 '4556010d4c.wav',
 '4593271b99.wav',
 '4783c4bd88.wav',
 '49ed390b25.wav',
 '4c68fc3fd0.wav',
 '4d01514a53.wav',
 '4d4df321a9.wav',
 '4ef2053f92.wav',
 '4f2721693c.wav']

In [None]:
df = df[~df['Audio'].isin(to_remove)]

In [None]:
# Prefix every relative audio file name with the dataset directory. Paths that
# already carry the prefix are left untouched, so re-running this cell does not
# produce "/content/dataset//content/dataset/..." paths. assign() builds a new
# frame instead of writing into the filtered slice created by the previous cell.
df = df.assign(
    Audio=df['Audio'].map(
        lambda path: path if path.startswith(directory_path) else directory_path + path
    )
)

In [None]:
df[df['Audio'] == '/content/dataset/01f71cec1d.wav']

# Creating the dataset object from the dataframe

In [None]:
# creating a dataset object
# NOTE(review): only the rows at positions 17000-20999 of the filtered frame are
# used here; presumably one shard of a larger run — confirm the intended range.
dataset = Dataset.from_pandas(df.iloc[17000:21000])
dataset

# Creating the Vocabulary

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters used across a batch of transcriptions.

    The transcriptions are joined with single spaces before the characters are
    gathered, so the space character is part of the result whenever the batch
    holds more than one entry.

    Args:
        batch: mapping whose "Transcriptions" entry is an iterable of strings.

    Returns:
        ``{"vocab": [chars]}`` where ``chars`` is a list of the unique
        characters (in arbitrary order) wrapped in a one-element outer list.
    """
    unique_chars = set()
    unique_chars.update(" ".join(batch["Transcriptions"]))
    return {"vocab": [list(unique_chars)]}

vocab_all = dataset.map(extract_all_chars, batched=True,
                        batch_size=-1, keep_in_memory=True,
                        remove_columns=dataset.column_names)


In [None]:
vocab_all

In [None]:
vocab_list = sorted(list(set(vocab_all["vocab"][0])))

In [None]:
vocab_list

In [None]:
# adding unknown and padding tokens -> the unknown token is needed when an unknown character is encountered.
# -> the padding token is added to make the lengths equal.

UNK_TOKEN = '__UNK__'
PAD_TOKEN = '__PAD__'

vocab_list = [PAD_TOKEN, UNK_TOKEN, *vocab_list]

In [None]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}

# for printing vocab in single line
', '.join([f"{k}: {v}" for k, v in (vocab_dict.items())])

In [None]:
WORD_DELIMITER = '|'

# The tokenizer below is configured with "|" as its word delimiter token, so the
# space entry hands its id over to "|". The membership check keeps the cell
# re-runnable: on a second run the space key is already gone and an
# unconditional lookup/delete would raise KeyError.
if " " in vocab_dict:
    vocab_dict[WORD_DELIMITER] = vocab_dict.pop(" ")
len(vocab_dict)


In [None]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

# Creating the tokenizer from the vocabulary

In [None]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token=UNK_TOKEN, pad_token=PAD_TOKEN, word_delimiter_token=WORD_DELIMITER)

In [None]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000,
                                             padding_value=0.0, do_normalize=True,
                                             return_attention_mask=True)


In [None]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
)


In [None]:
dataset[45]

# Importing the model

In [None]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)


In [None]:
# from transformers import Wav2Vec2Processor

# processor = Wav2Vec2Processor.from_pretrained('spktsagar/wav2vec2-large-xls-r-300m-nepali-openslr')

In [None]:
# from transformers import Wav2Vec2ForCTC

# model = Wav2Vec2ForCTC.from_pretrained("spktsagar/wav2vec2-large-xls-r-300m-nepali-openslr",ctc_loss_reduction="mean",pad_token_id=processor.tokenizer.pad_token_id)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
model = model.to(device)

In [None]:
# with open('file.txt', 'w') as file:
    # file.write(str(model))
    # file.close()

In [None]:
df['Audio']

In [None]:
print(dataset.column_names)


# Creating the batch of dataset.

In [None]:
import os

def speech_file_to_array_fn(batch, target_sampling_rate=16000):
    """Load one example's audio file and attach the waveform to the example.

    Args:
        batch: a single dataset row holding an "Audio" file path and a
            "Transcriptions" string.
        target_sampling_rate: rate in Hz the waveform is resampled to when the
            file was recorded at a different rate. Defaults to 16000, the rate
            the notebook's feature extractor is configured with.

    Returns:
        The same row with three added keys: "speech" (first channel of the
        waveform as a NumPy array), "sampling_rate" (the rate of "speech") and
        "target_text" (copy of the transcription).

    Raises:
        FileNotFoundError: if the audio file does not exist.
    """
    file_path = batch['Audio']
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")
    speech_array, sampling_rate = torchaudio.load(file_path)

    # Only the first channel is used.
    waveform = speech_array[0]

    # Bring every file to the rate the feature extractor expects instead of
    # handing it audio recorded at some other rate.
    if sampling_rate != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=target_sampling_rate)
        waveform = resampler(waveform)
        sampling_rate = target_sampling_rate

    batch["speech"] = waveform.numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["Transcriptions"]
    return batch

def prepare_dataset(batch):
    """Turn one raw dataset row into model-ready inputs and CTC labels.

    Args:
        batch: a single dataset row as accepted by ``speech_file_to_array_fn``.

    Returns:
        The row extended with "input_values" (feature-extractor output for the
        waveform) and "labels" (token ids of the transcription), in addition
        to the keys added by ``speech_file_to_array_fn``.
    """
    batch = speech_file_to_array_fn(batch)
    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"]).input_values[0]
    # Call the tokenizer directly: the as_target_processor() context manager
    # only redirects processor(...) to this same tokenizer and is deprecated.
    batch["labels"] = processor.tokenizer(batch["target_text"]).input_ids
    return batch



In [None]:
dataset

In [None]:
# Save the updated DataFrame to a new CSV file
# df.to_csv(output_csv, index=False)

In [None]:
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

In [None]:
dataset

### Splitting the data
The full dataset is split into 80% training and 20% testing data. \\
The training portion is then split into 90% training and 10% validation data.

In [None]:
# A fixed seed makes the shuffled splits identical on every run, so reported
# validation/test numbers are comparable between runs of the notebook.
SPLIT_SEED = 42

# splitting the dataset into 80% train and 20% test sets
train_test_split = dataset.train_test_split(test_size=0.2, shuffle=True, seed=SPLIT_SEED)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Split the training dataset into 90% training and 10% validation
train_eval_split = train_dataset.train_test_split(test_size=0.1, shuffle=True, seed=SPLIT_SEED)
train_dataset = train_eval_split['train']
val_dataset = train_eval_split['test']

In [None]:
train_dataset[0]['target_text']

In [None]:
test_dataset

In [None]:
val_dataset

In [None]:
# model.freeze_feature_encoder()


# Defining the metrics

In [None]:
import numpy as np
import jiwer


class _JiwerWerMetric:
    """Word-error-rate metric backed by jiwer.

    Replaces ``datasets.load_metric("wer")``: ``load_metric`` is deprecated and
    no longer present in recent ``datasets`` releases, and the install cell does
    not pin ``datasets``. The ``compute(predictions=..., references=...)`` call
    shape of the old metric object is kept.
    """

    def compute(self, predictions, references):
        """Return the WER of ``predictions`` measured against ``references``."""
        return jiwer.wer(list(references), list(predictions))


wer_metric = _JiwerWerMetric()

# Decoded hypotheses and references of every evaluation pass, appended in
# order; the error-analysis cells further down read these lists.
predictions_list = []
references_list = []

def compute_metrics(pred):
    """Decode an evaluation pass and compute its word error rate.

    Args:
        pred: object with ``predictions`` (logits) and ``label_ids`` arrays,
            as handed to ``compute_metrics`` by the Trainer.

    Returns:
        ``{"wer": value}``.

    Side effects:
        Extends the module-level ``predictions_list`` / ``references_list``
        with the decoded strings of this pass.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_str = processor.batch_decode(pred_ids)
    pred_str = [s.replace(processor.tokenizer.pad_token, '') for s in pred_str]

    # -100 is the fill value the data collator puts on padded label positions;
    # map it back to the pad id so the labels can be decoded. np.where builds a
    # new array, leaving pred.label_ids itself unmodified.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)
    # group_tokens=False: references are plain text, repeated characters must
    # not be collapsed the way CTC output is.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    predictions_list.extend(pred_str)
    references_list.extend(label_str)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Defining the Hyper-Parameters

In [None]:
# defining the training arguments
training_args = TrainingArguments(
  output_dir="./wav2vec2-nepali-asr",
  group_by_length=True,
  per_device_train_batch_size=2,
  per_device_eval_batch_size=2,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  save_strategy="steps",
  # max_steps takes precedence over num_train_epochs when it is positive
  max_steps= 1000,
  # mixed precision needs a GPU; the notebook otherwise falls back to CPU, where
  # a hard-coded fp16=True makes TrainingArguments fail
  fp16=torch.cuda.is_available(),
  num_train_epochs = 1,
  save_steps=100,
  eval_steps=100,
  logging_steps=100,
  learning_rate=1e-5,
  warmup_steps=100,
  save_total_limit=2,
  # keep the extra columns produced by prepare_dataset in the dataset
  remove_unused_columns=False,
  logging_dir='./logs',
  # NOTE(review): per the TrainingArguments documentation Trainer does not act
  # on this field by itself; to actually resume, pass resume_from_checkpoint to
  # trainer.train(...)
  resume_from_checkpoint='./wav2vec2-nepali-asr',
  load_best_model_at_end = True,
  metric_for_best_model= 'wer',
  # lower WER is better
  greater_is_better = False,
)

In [None]:
from dataclasses import dataclass
from typing import Union, Optional, List, Dict

import torch
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of CTC training examples into one batch of tensors.

    Audio inputs and label sequences are padded separately, each with its own
    length settings. Padded label positions are set to -100.
    """

    # processor providing the feature extractor (audio padding) and tokenizer (label padding)
    processor: Wav2Vec2Processor
    # padding strategy forwarded to both pad calls
    padding: Union[bool, str] = True
    # optional maximum length for the audio inputs
    max_length: Optional[int] = None
    # optional maximum length for the label sequences
    max_length_labels: Optional[int] = None
    # optional multiple the audio inputs are padded to
    pad_to_multiple_of: Optional[int] = None
    # optional multiple the label sequences are padded to
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into a padded batch.

        Args:
            features: examples that each provide "input_values" and "labels".

        Returns:
            The padded audio batch with an added "labels" tensor in which every
            padded position holds -100.
        """
        # Inputs and labels have different lengths and padding rules, so they
        # are split and padded independently.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly; the as_target_processor()
        # context manager merely routed processor.pad to it and is deprecated.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Mark every padded label position with -100.
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Defining the Tensor Board

In [None]:
from torch.utils.tensorboard import SummaryWriter
import os

# Directory that receives the TensorBoard event files; created when missing.
log_dir = './logs'
os.makedirs(log_dir, exist_ok=True)

writer = SummaryWriter(log_dir)

In [None]:
from transformers import TrainerCallback
import csv


class CustomTensorBoardCallback(TrainerCallback):
    """Trainer callback that mirrors training/evaluation logs to TensorBoard and CSV.

    Scalars are sent to the supplied ``writer``; (step, value) rows are appended
    to CSV files inside ``log_dir``; the same values are also kept in lists on
    the instance so they can be plotted after training.
    """

    def __init__(self, writer, metric, log_dir='./logs'):
        """Store the writer and prepare the CSV file paths.

        Args:
            writer: object exposing ``add_scalar(tag, value, step)``.
            metric: kept as ``self.metric``; the callback itself never calls it.
            log_dir: directory for the CSV files; created when missing.
        """
        self.writer = writer
        self.metric = metric
        self.log_dir = log_dir
        self.training_loss = []
        self.validation_loss = []
        self.epoch_training_loss = []
        self.epoch_validation_loss = []
        self.wer = []
        self.learning_rate_data = []
        self.steps = []

        os.makedirs(log_dir, exist_ok=True)
        self.training_file = os.path.join(log_dir, "training_loss.csv")
        self.validation_file = os.path.join(log_dir, "validation_loss.csv")
        self.epoch_training_file = os.path.join(log_dir, "epoch_training_loss.csv")
        self.epoch_validation_file = os.path.join(log_dir, "epoch_validation_loss.csv")
        self.wer_file = os.path.join(log_dir, "wer_record.csv")
        self.learning_rate_file = os.path.join(log_dir, "learning_rate_file.csv")

    @staticmethod
    def _append_row(path, row):
        """Append a single row to the CSV file at ``path``."""
        # newline='' is the mode the csv module asks for; without it some
        # platforms write an empty line after every row.
        with open(path, "a", newline='') as f:
            csv.writer(f).writerow(row)

    def on_evaluate(self, args, state, control, **kwargs):
        """Log validation WER and loss after an evaluation pass."""
        eval_results = kwargs.get('metrics') or {}
        step = state.global_step

        # The evaluation metrics are keyed with an "eval_" prefix (the rest of
        # this notebook reads "eval_wer" / "eval_loss"); a plain "wer" key is
        # still accepted. Checking only "wer" never logged anything.
        wer_value = eval_results.get("eval_wer", eval_results.get("wer"))
        if wer_value is not None:
            self.writer.add_scalar("Validation/WER", wer_value, step)

        if "eval_loss" in eval_results:
            self.writer.add_scalar("Validation/Loss", eval_results["eval_loss"], step)
            self.validation_loss.append((step, eval_results["eval_loss"]))
            self._append_row(self.validation_file, [step, eval_results["eval_loss"]])

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record WER, learning rate and training loss from a Trainer log event."""
        if logs is None:
            return

        print(logs)
        step = state.global_step

        if "eval_wer" in logs:
            self.steps.append(step)
            self.wer.append(logs["eval_wer"])
            self._append_row(self.wer_file, [step, logs['eval_wer']])

        if 'learning_rate' in logs:
            self.learning_rate_data.append(logs['learning_rate'])
            self.writer.add_scalar('Training/LearningRate', logs['learning_rate'], step)
            self._append_row(self.learning_rate_file, [step, logs["learning_rate"]])

        if 'loss' in logs:
            self.writer.add_scalar('Training/Loss', logs['loss'], step)
            self.training_loss.append((step, logs['loss']))
            self._append_row(self.training_file, [step, logs['loss']])

        for key, value in logs.items():
            if key == 'loss':  # Avoid logging the training loss twice
                continue
            # add_scalar only makes sense for numbers; skip anything else
            # instead of letting one odd log entry abort training.
            if isinstance(value, (int, float)):
                self.writer.add_scalar(f'Training/{key}', value, step)
tensorboard_callback = CustomTensorBoardCallback(writer, wer_metric)

# Initializing the Trainer

In [None]:
# Wire model, data, metrics and the logging callback into the Trainer.
# Evaluation during training runs on the validation split; the test split is
# evaluated separately after training.
# NOTE(review): the `tokenizer` argument receives the feature extractor, not the
# text tokenizer — presumably so that it is saved with checkpoints; confirm.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.feature_extractor,
    callbacks=[tensorboard_callback]
)



In [None]:
torch.cuda.empty_cache()

In [None]:
!ls

In [None]:
device.type

# Freezing the feature Extractor

In [None]:
# freeze_feature_extractor() is the deprecated name of this method;
# freeze_feature_encoder() is its replacement.
model.freeze_feature_encoder()

# Training Begins

In [None]:
trainer.train()

In [None]:
!ls

In [None]:
!zip -r logs.zip logs/


In [None]:
!zip -r checkpoint.zip wav2vec2-nepali-asr/checkpoint-16000/

In [None]:
# analyzing top word that are errors
import csv
import difflib
from collections import Counter

# Reference words the model got wrong (substituted or dropped). Each
# prediction is aligned with its own reference sentence: pairing flattened
# token lists position by position goes out of step after the first inserted
# or deleted word and then reports correct words as errors.
errors = []
for ref_sentence, pred_sentence in zip(references_list, predictions_list):
    ref_words = ref_sentence.split()
    pred_words = pred_sentence.split()
    matcher = difflib.SequenceMatcher(None, ref_words, pred_words, autojunk=False)
    for tag, i1, i2, _j1, _j2 in matcher.get_opcodes():
        if tag in ('replace', 'delete'):
            errors.extend(ref_words[i1:i2])

error_counts = Counter(errors)

top_errors = error_counts.most_common(10)  # Adjust the number as needed

print("Top Error Words and Their Frequencies:")
with open('training_top_error_rate.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Word", "Frequency"])
    for word, count in top_errors:
        writer.writerow([word, count])
        print(f"{word}: {count}")


In [None]:
!pip install seaborn

In [None]:
import csv
import difflib
from collections import Counter

true_labels = references_list
predicted_labels = predictions_list

# (true word, predicted word) substitution pairs. Words are aligned sentence by
# sentence: zipping the flattened token lists positionally shifts every pair
# after the first inserted or deleted word.
errors = []
for true_sentence, pred_sentence in zip(true_labels, predicted_labels):
    true_words = true_sentence.split()
    pred_words = pred_sentence.split()
    matcher = difflib.SequenceMatcher(None, true_words, pred_words, autojunk=False)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'replace':
            errors.extend(zip(true_words[i1:i2], pred_words[j1:j2]))

error_counts = Counter(errors)

top_errors = error_counts.most_common(10)

print("Top Error Words and Their Frequencies:")

with open('training_top_error_rate_frequency.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["True Word", "Predicted Word", "Count"])
    for (true_word, pred_word), count in top_errors:
        writer.writerow([true_word, pred_word, count])
        print(f"True: {true_word} | Predicted: {pred_word} | Count: {count}")


In [None]:
%load_ext tensorboard
%tensorboard --logdir ./logs


In [None]:
from matplotlib import pyplot as plt

In [None]:
# Validation WER recorded by the callback, plotted against the training step.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(tensorboard_callback.steps, tensorboard_callback.wer, label='WER')
ax.set_xlabel('Steps')
ax.set_ylabel('WER')
ax.set_title('WER vs. Steps')
ax.legend()
fig.savefig('wer vs steps.png')
plt.show()




In [None]:
# plt.figure(figsize=(10, 5))
# plt.plot(tensorboard_callback.steps, tensorboard_callback.learning_rate_data, label='Learning Rate')
# plt.xlabel('Steps')
# plt.ylabel('Learning Rate')
# plt.title('Learning Rate vs. Steps')
# plt.savefig('learning_rate vs steps.png')
# plt.legend()
# plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


# The loss CSVs written by the callback have no header row: column 0 is the
# global step, column 1 the loss value.
loss_columns = ['step', 'loss']
training_loss_df = pd.read_csv('./logs/training_loss.csv', header=None, names=loss_columns)
validation_loss_df = pd.read_csv('./logs/validation_loss.csv', header=None, names=loss_columns)
# epoch_training_loss_df = pd.read_csv('./logs/epoch_training_loss.csv', header=None, names=['epoch', 'loss'])
# epoch_validation_loss_df = pd.read_csv('./logs/epoch_validation_loss.csv', header=None, names=['epoch', 'loss'])

# Training and validation loss on one set of axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(training_loss_df['step'], training_loss_df['loss'], label='Training Loss')
ax.plot(validation_loss_df['step'], validation_loss_df['loss'], label='Validation Loss')
ax.set_xlabel('Step')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss per Step')
ax.legend()
fig.savefig('training and validation.png')
plt.show()

# Plot training and validation loss per epoch
# plt.figure(figsize=(10, 5))
# plt.plot(epoch_training_loss_df['epoch'], epoch_training_loss_df['loss'], label='Training Loss')
# plt.plot(epoch_validation_loss_df['epoch'], epoch_validation_loss_df['loss'], label='Validation Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Training and Validation Loss per Epoch')
# plt.legend()
# plt.show()


In [None]:
!ls

In [None]:
!unzip ./satisfactory.zip

In [None]:
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test WER: {test_results['eval_wer']:.4f}")

In [None]:
# model.freeze_feature_encoder()

In [None]:
!pip install jiwer

# Testing on Test Dataset

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch
from jiwer import wer
from safetensors import safe_open
from collections import Counter
import difflib

# Base model that supplies the processor and the architecture/config, and the
# local checkpoint whose weights are loaded into it.
model_path = 'spktsagar/wav2vec2-large-xls-r-300m-nepali-openslr'
checkpoint = './checkpoint-2760/model.safetensors'

# NOTE: this rebinds the notebook-level `processor` and `model`.
processor = Wav2Vec2Processor.from_pretrained(model_path)
with safe_open(checkpoint, framework="pt") as f:
    # With framework="pt" get_tensor() already yields torch tensors; wrapping
    # them in torch.tensor(...) only copies them and triggers a warning.
    model_state_dict = {key: f.get_tensor(key) for key in f.keys()}
model = Wav2Vec2ForCTC.from_pretrained(model_path, state_dict=model_state_dict)
model.eval()


def transcribe_audio(waveform):
    """Transcribe one waveform with the notebook-level ``processor`` and ``model``.

    Args:
        waveform: audio samples of a single utterance; passed to the processor
            with a sampling rate of 16000.

    Returns:
        The decoded transcription of the most likely token at every frame.
    """
    features = processor(waveform, return_tensors="pt", sampling_rate=16000)
    with torch.no_grad():
        output = model(features.input_values)
    best_ids = torch.argmax(output.logits, dim=-1)
    return processor.decode(best_ids[0])


def get_detailed_errors(reference, hypothesis):
    """Classify the word-level differences between a reference and a hypothesis.

    Both strings are split on whitespace and aligned with
    ``difflib.SequenceMatcher``.

    Args:
        reference: the ground-truth sentence.
        hypothesis: the predicted sentence.

    Returns:
        A dict with three lists:
        'substitutions' -- ``(reference_word, hypothesis_word)`` tuples,
        'insertions'    -- hypothesis words with no reference counterpart,
        'deletions'     -- reference words with no hypothesis counterpart.
    """
    reference_words = reference.split()
    hypothesis_words = hypothesis.split()
    sm = difflib.SequenceMatcher(None, reference_words, hypothesis_words)

    detailed_errors = {
        'substitutions': [],
        'insertions': [],
        'deletions': []
    }

    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        ref_span = reference_words[i1:i2]
        hyp_span = hypothesis_words[j1:j2]
        if tag == 'replace':
            # A replaced span may have different lengths on the two sides. Pair
            # up as many words as both sides have; the surplus counts as
            # deletions (reference side) or insertions (hypothesis side)
            # instead of being silently dropped by zip().
            paired = min(len(ref_span), len(hyp_span))
            detailed_errors['substitutions'].extend(zip(ref_span[:paired], hyp_span[:paired]))
            detailed_errors['deletions'].extend(ref_span[paired:])
            detailed_errors['insertions'].extend(hyp_span[paired:])
        elif tag == 'insert':
            detailed_errors['insertions'].extend(hyp_span)
        elif tag == 'delete':
            detailed_errors['deletions'].extend(ref_span)

    return detailed_errors


# Tally of (reference word, hypothesis word) error pairs plus running totals
# for the pass over the test split.
confusion_matrix = Counter()
total_wer = 0
total_examples = 0

for example in test_dataset:
    predicted_text = transcribe_audio(example['speech'])
    reference_text = example["target_text"]

    # per-utterance WER, accumulated into the running total
    error_rate = wer(reference_text, predicted_text)
    total_wer += error_rate
    total_examples += 1

    detailed_errors = get_detailed_errors(reference_text, predicted_text)

    # Substitutions are counted as-is; insertions and deletions are keyed with
    # the '<ins>' / '<del>' placeholders on the missing side.
    confusion_matrix.update(detailed_errors['substitutions'])
    confusion_matrix.update(('<ins>', word) for word in detailed_errors['insertions'])
    confusion_matrix.update((word, '<del>') for word in detailed_errors['deletions'])

In [None]:
total_examples = len(test_dataset)

# confusion_matrix maps (reference, hypothesis) pairs to how often they
# occurred. The totals therefore add up the stored counts; iterating over the
# Counter alone would only count the number of *distinct* pairs.
substitutions = sum(
    freq for (ref, hyp), freq in confusion_matrix.items()
    if ref != '<ins>' and hyp != '<del>'
)
deletions = sum(freq for (ref, hyp), freq in confusion_matrix.items() if hyp == '<del>')
insertions = sum(freq for (ref, hyp), freq in confusion_matrix.items() if ref == '<ins>')

# Mean of the per-utterance WER values; NaN instead of a ZeroDivisionError when
# the test set is empty.
average_wer = total_wer / total_examples if total_examples else float('nan')
print(f"Overall WER: {average_wer:.4f}")

print(f"Total examples: {total_examples}")
print(f"Substitutions: {substitutions}")
print(f"Deletions: {deletions}")
print(f"Insertions: {insertions}")


print("\nTop confused words:")
for (ref_word, hyp_word), freq in confusion_matrix.most_common(10):
    print(f"{ref_word} -> {hyp_word}: {freq}")

In [None]:
test_dataset

In [None]:
import torch
from torch.utils.data import DataLoader

# Example dataset and DataLoader (replace with your actual dataset and DataLoader)
class ExampleDataset(torch.utils.data.Dataset):
    """Tiny in-memory dataset of three Nepali sentences, used as a stand-in."""

    def __init__(self):
        """Store the fixed example sentences in ``self.data``."""
        sentences = (
            "यो पहिलो वाक्य हो।",
            "यहाँ अर्को वाक्य छ।",
            "र अन्तिम वाक्य।",
        )
        self.data = list(sentences)

    def __len__(self):
        """Number of stored sentences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sentence at position ``idx``."""
        return self.data[idx]

# Create the dataset and dataloader.
# They get their own names so the real `test_dataset` split built earlier in
# the notebook is not overwritten by this toy example.
example_dataset = ExampleDataset()
example_dataloader = DataLoader(example_dataset, batch_size=1, shuffle=False)

# With batch_size=1 every batch holds exactly one sentence; collect them all.
all_texts = [batch[0] for batch in example_dataloader]

# Concatenate all sentences into one string
all_text = ' '.join(all_texts)

# Split the concatenated string into words
words = all_text.split()

# Count the number of words
word_count = len(words)

# The count refers to the example sentences above, not to the test split, so
# the message says so.
print(f"Number of words in the example dataset: {word_count}")


In [None]:
all_text

In [None]:
words = all_text.split()

# Count the number of words
word_count = len(words)

In [None]:
word_count

In [None]:
def transcribe_audio(audio_file):
    """Transcribe an audio file with the notebook-level ``processor`` and ``model``.

    Note: this definition replaces the earlier ``transcribe_audio`` that takes
    an in-memory waveform; from this cell on the name expects a file path.

    Args:
        audio_file: path of the audio file to transcribe.

    Returns:
        The decoded transcription string.
    """
    speech, sample_rate = torchaudio.load(audio_file)

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        speech = resampler(speech)

    # Use the first channel, as the dataset preparation does. squeeze() would
    # leave multi-channel audio two-dimensional.
    inputs = processor(speech[0].numpy(), sampling_rate=16000, return_tensors="pt", padding=True)

    # Send the inputs to wherever the model currently lives. Picking "cuda"
    # whenever it is available breaks when the model itself is still on the CPU.
    device = model.device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Put the model in evaluation mode
    model.eval()

    # Make predictions
    with torch.no_grad():
        logits = model(**inputs).logits

    # Decode the predictions
    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids)

    return transcription[0]

In [None]:

# Example usage
audio_file = "./data/Voice10.wav"
transcription = transcribe_audio(audio_file)
print(f"Transcription: {transcription}")