In [None]:
! pip install transformers datasets 
! pip install torchaudio
! pip install librosa

In [None]:
import torch
# GPU sanity check: the first output line is True when CUDA is usable,
# the second is the number of CUDA devices torch can see.
print(torch.cuda.is_available(), torch.cuda.device_count(), sep="\n")

In [2]:
import pandas as pd
from datasets import load_dataset

# Load the dataset identified by the repo id below. Later cells index the
# result by split name ('train' / 'validation').
dataset = load_dataset("ganga4364/Dilgo-Khyentse-Rinpoche-dataset")

In [3]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments

# Load the processor that belongs to the pre-trained Wav2Vec2 checkpoint.
# NOTE: only the processor is instantiated in this cell; no model is loaded here.
model_name = "ganga4364/mms_300_v4.96000"
processor = Wav2Vec2Processor.from_pretrained(model_name)

In [8]:
from torchaudio.transforms import Resample
import torchaudio

def speech_file_to_array_fn(batch, target_sampling_rate=16000):
    """Load one example's audio file and attach the waveform and transcript.

    Args:
        batch: A single dataset example (dict-like) providing ``"path"``
            (location of the audio file) and ``"uni"`` (the transcript text).
        target_sampling_rate: Sampling rate, in Hz, the returned waveform must
            have. Defaults to 16000, the value that used to be hard-coded, so
            existing ``dataset.map(speech_file_to_array_fn, ...)`` calls behave
            exactly as before.

    Returns:
        The same ``batch`` object with three keys added: ``"speech"``
        (element 0 of the loaded waveform converted with ``.numpy()``),
        ``"sampling_rate"`` (equal to ``target_sampling_rate``) and
        ``"target_text"`` (a copy of ``batch["uni"]``).
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])

    # Resample only when the file is not already at the requested rate.
    if sampling_rate != target_sampling_rate:
        print("resampling")
        resampler = Resample(orig_freq=sampling_rate, new_freq=target_sampling_rate)
        speech_array = resampler(speech_array)
        sampling_rate = target_sampling_rate

    # Only index 0 of the loaded tensor is kept (the first channel, assuming
    # torchaudio's default channels-first layout); other channels are dropped.
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["uni"]
    return batch

In [9]:
import numpy as np
def prepare_dataset(batch):
    """Convert one example's waveform and transcript into model inputs and label ids.

    Args:
        batch: A single dataset example (dict-like) providing ``"speech"``
            (the waveform), ``"sampling_rate"`` and ``"target_text"``.

    Returns:
        The same ``batch`` object with ``"input_values"`` (the processed
        waveform for this example) and ``"labels"`` (the ``input_ids`` the
        processor produces for ``"target_text"``) added.
    """
    features = processor(batch["speech"], sampling_rate=batch["sampling_rate"])

    # Drop only the leading batch axis to get shape (n,). np.squeeze would
    # collapse every singleton axis, turning a one-sample clip into a 0-d scalar.
    # Assumes the processor returns a batch of one, i.e. shape (1, n) — TODO confirm.
    batch["input_values"] = np.asarray(features.input_values)[0]

    batch["labels"] = processor(text=batch["target_text"]).input_ids
    return batch

In [10]:
# Pull the two splits used below out of the loaded dataset.
dataset_train, dataset_val = (dataset[split] for split in ("train", "validation"))


In [None]:
# Inspect the train split; as the cell's last expression, it is shown as the cell output.
dataset_train

In [12]:
# Add a `path` column to both splits: the wav file under /data/volume/wav_16k
# that corresponds to each example's `file_name`.
# Assumes dataset_train and dataset_val are datasets.Dataset objects.
def _wav_path_column(example):
    """Build the ``path`` value for a single example."""
    return {'path': f"/data/volume/wav_16k/{example['file_name']}.wav"}


dataset_val = dataset_val.map(_wav_path_column)
dataset_train = dataset_train.map(_wav_path_column)


In [None]:
# Show the first train example to check the columns present at this point.
dataset_train[0]

In [15]:
import pandas as pd 

# Read the CSV of recordings to exclude; its `file_name` column drives the filtering below.
df = pd.read_csv('04_bad_apples.csv')

In [None]:
# Step 1: Collect the file names to remove as a plain Python list
filenames_to_remove = df['file_name'].tolist()  # values of the `file_name` column of the CSV loaded into `df`

# Step 2: Predicate for filtering — an example is kept only when its
# file_name begins with none of the names collected in filenames_to_remove.
def should_keep(example):
    """Return True when ``example['file_name']`` starts with no entry of ``filenames_to_remove``."""
    # str.startswith accepts a tuple and checks each prefix in turn.
    blocked_prefixes = tuple(filenames_to_remove)
    return not example['file_name'].startswith(blocked_prefixes)

# Step 3: Use the filter method to drop the unwanted recordings from both splits
dataset_train = dataset_train.filter(should_keep)
dataset_val = dataset_val.filter(should_keep)

# Optional: Save the filtered train split
# dataset_train.save_to_disk('filtered_dataset')  # Uncomment to save it to disk


In [18]:
# Run the two preprocessing stages over the train split in order: first load
# the audio, then turn it into model inputs and labels. Each stage drops every
# column that existed before it ran, so only the stage's own outputs remain.
for preprocess_step in (speech_file_to_array_fn, prepare_dataset):
    dataset_train = dataset_train.map(preprocess_step, remove_columns=dataset_train.column_names)

In [None]:
# Persist the fully preprocessed train split so it can be reloaded without redoing the work above.
dataset_train.save_to_disk('/data/volume/wav2vec2/train_prepare_dataset.arrow')

In [None]:
# Same two preprocessing stages for the validation split, each run with
# num_proc=5 to enable multiprocessing.
for preprocess_step in (speech_file_to_array_fn, prepare_dataset):
    dataset_val = dataset_val.map(preprocess_step, remove_columns=dataset_val.column_names, num_proc=5)

# Save the preprocessed validation split to disk.
dataset_val.save_to_disk('/data/volume/wav2vec2/val_prepare_dataset.arrow')