In [None]:
# Mount Google Drive at /content/drive; every path used by the cells below
# lives under /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Data preparation:

In [None]:
import os
import pandas as pd

def _utt_key(value):
    """Return a canonical string key for a CSV ``Id`` cell or a filename stem.

    Integer-like values (``7``, ``"007"``, ``7.0``) all map to ``"7"`` so that
    a filename stem matches its CSV row no matter which dtype pandas inferred
    for the ``Id`` column. Anything that is not integer-like is returned as a
    stripped string.
    """
    if isinstance(value, float):
        # pandas upcasts an integer column to float when some cells are blank.
        return str(int(value)) if value.is_integer() else str(value)
    try:
        return str(int(value))
    except (TypeError, ValueError):
        return str(value).strip()


def generate_audio_text_files(directory_path, output_directory, csv_path, delete_missing=True):
    """Write ``audio_paths`` and ``text`` listing files for a folder of WAV files.

    Every ``*.wav`` file in ``directory_path`` is matched, by its filename
    without the extension, against the ``Id`` column of the CSV. For each
    match one line is appended to both output files:

    * ``audio_paths``: ``<utt_id> <path to the wav file>``
    * ``text``:        ``<utt_id> <transcription>``

    Args:
        directory_path: Folder containing the ``.wav`` files.
        output_directory: Folder that receives ``audio_paths`` and ``text``;
            created if it does not exist. Existing files are overwritten.
        csv_path: CSV file with ``Id`` and ``Transcription`` columns.
        delete_missing: When True (the default, matching the historical
            behaviour) a ``.wav`` file without a transcription is DELETED
            from ``directory_path``. Pass False to leave such files in place
            and only skip them.

    Returns:
        None. Progress and a short summary are printed. If the CSV cannot be
        loaded the error is printed and the function returns before touching
        any file.

    Raises:
        OSError: If ``directory_path`` cannot be listed or the output files
            cannot be written.
    """
    # Load the transcription CSV
    try:
        transcription_df = pd.read_csv(csv_path)
        transcription_dict = {}
        for raw_id, raw_text in zip(transcription_df['Id'], transcription_df['Transcription']):
            # Blank cells are read as NaN; without this check they would be
            # written to the text file as the literal string "nan".
            if pd.isna(raw_id) or pd.isna(raw_text):
                continue
            transcription_dict[_utt_key(raw_id)] = raw_text
        print(f"Loaded {len(transcription_dict)} transcriptions from CSV.")
    except Exception as e:
        print(f"Error loading CSV file: {e}")
        return

    # List the input folder before opening the outputs, so that a wrong input
    # path fails without truncating listing files from an earlier run.
    wav_filenames = sorted(name for name in os.listdir(directory_path) if name.endswith('.wav'))

    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)
    print(f"Output directory created at: {output_directory}")

    # Transcriptions are non-ASCII text, so the encoding is set explicitly
    # instead of relying on the platform default.
    with open(os.path.join(output_directory, 'audio_paths'), 'w', encoding='utf-8') as audio_file, \
         open(os.path.join(output_directory, 'text'), 'w', encoding='utf-8') as text_file:

        file_count = 0  # Track the number of processed files
        examples = []  # Store a few examples to print later

        for filename in wav_filenames:
            utt_id = os.path.splitext(filename)[0]  # removes '.wav' extension
            file_path = os.path.join(directory_path, filename)

            # Look up by normalized key: a non-numeric filename no longer
            # raises, and a string-typed Id column still matches.
            transcription = transcription_dict.get(_utt_key(utt_id))

            if transcription is None:
                if delete_missing:
                    print(f"No transcription found for file {filename} (ID: {utt_id}). Deleting file.")
                    os.remove(file_path)
                else:
                    print(f"No transcription found for file {filename} (ID: {utt_id}). Skipping file.")
                continue

            # Write to audio_paths file with full path
            audio_file.write(f"{utt_id} {file_path}\n")

            # Write to text file
            text_file.write(f"{utt_id} {transcription}\n")

            # Store example
            if file_count < 5:  # Only store the first few examples
                examples.append((utt_id, file_path, transcription))

            # Print status
            print(f"Processed file {filename}: {utt_id} -> {file_path}")
            file_count += 1

        if file_count == 0:
            print("No .wav files were processed.")
        else:
            print(f"Total .wav files processed: {file_count}")

            # Print a few examples
            print("\nExamples of processed files:")
            for example in examples:
                print(f"ID: {example[0]}, Path: {example[1]}, Transcription: {example[2]}")

# Example run: build the audio_paths/text listings for one recording folder on Drive.
csv_path = '/content/drive/MyDrive/Braj_lifecycle(3).csv'
input_directory = '/content/drive/MyDrive/lifecycle/281474976710731'
output_directory = '/content/drive/MyDrive/sample_data_lifecycle'
generate_audio_text_files(
    directory_path=input_directory,
    output_directory=output_directory,
    csv_path=csv_path,
)


Loaded 400 transcriptions from CSV.
Output directory created at: /content/drive/MyDrive/sample_data_lifecycle
Processed file 281474976733315.wav: 281474976733315 -> /content/drive/MyDrive/lifecycle/281474976710731/281474976733315.wav
Processed file 281474976733316.wav: 281474976733316 -> /content/drive/MyDrive/lifecycle/281474976710731/281474976733316.wav
Processed file 281474976733317.wav: 281474976733317 -> /content/drive/MyDrive/lifecycle/281474976710731/281474976733317.wav
Processed file 281474976733318.wav: 281474976733318 -> /content/drive/MyDrive/lifecycle/281474976710731/281474976733318.wav
Processed file 281474976733319.wav: 281474976733319 -> /content/drive/MyDrive/lifecycle/281474976710731/281474976733319.wav
Processed file 281474976733320.wav: 281474976733320 -> /content/drive/MyDrive/lifecycle/281474976710731/281474976733320.wav
Processed file 281474976733321.wav: 281474976733321 -> /content/drive/MyDrive/lifecycle/281474976710731/281474976733321.wav
Processed file 2814749

In [None]:
!python /content/drive/MyDrive/data_prep.py --source_data_dir /content/drive/MyDrive/sample_data_lifecycle --output_data_dir /content/drive/MyDrive/sample_data_lifecycle_1

Casting the dataset: 100% 400/400 [00:00<00:00, 106003.77 examples/s]
Saving the dataset (1/1 shards): 100% 400/400 [00:05<00:00, 73.59 examples/s]
Data preparation done


Fine-tune:

In [None]:
!pip install jiwer


Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.0.5-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.5 rapidfuzz-3.10.1


In [None]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [None]:
!pip install dataset

Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.54-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting alembic>=0.6.2 (from dataset)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting Mako (from alembic>=0.6.2->dataset)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading dataset-1.6.2-py2.py3-none-any.whl (18 kB)
Downloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Downloading SQLAlchemy-1.4.54-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux

In [None]:
!pip install bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [None]:
# Fine-tuning launch: torchrun starts `ngpu` processes per node running just_dw1.py.
# All flags below are parsed by that script, which is not shown in this notebook.
# NOTE(review): --train_datasets points at sample_data_1/, which no cell visible
# here creates; the data-prep cell writes sample_data_lifecycle_1, which is used
# only for --eval_datasets. Confirm the training set exists and is the intended one.
# NOTE(review): --output_dir is on Drive, so checkpoints are written to the mount.
ngpu=1  # Set to 1 for single-GPU training in Colab

!torchrun --nproc_per_node=$ngpu /content/drive/MyDrive/just_dw1.py \
--model_name vasista22/whisper-hindi-small \
--language Hindi \
--sampling_rate 16000 \
--num_proc 2 \
--train_strategy epoch \
--learning_rate 3e-3 \
--warmup 1000 \
--train_batchsize 8 \
--eval_batchsize 8 \
--num_epochs 20 \
--resume_from_ckpt None \
--output_dir /content/drive/MyDrive/dir_op \
--train_datasets /content/drive/MyDrive/sample_data_1/ \
--eval_datasets /content/drive/MyDrive/sample_data_lifecycle_1


2024-11-12 12:31:15.486683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-12 12:31:15.509247: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-12 12:31:15.515212: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-12 12:31:15.529682: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


+++++++++++++++++++++++++++++++++++++++++++

Single audio transcription:

In [None]:
# Transcribe a single WAV file with the external transcribe_audio2.py script.
# Presumably --is_public_repo False makes the script load the model from the
# local --ckpt_dir instead of a hub repository; verify against the script.
# NOTE(review): the checkpoint path (openai-whisper-small-braj/checkpoint-2640)
# is not under the --output_dir used by the fine-tuning cell (dir_op); confirm
# this is the intended model. The recorded output shows an empty transcription.
!python3 /content/drive/MyDrive/transcribe_audio2.py \
--is_public_repo False \
--ckpt_dir "/content/drive/MyDrive/openai-whisper-small-braj/checkpoint-2640" \
--temp_ckpt_folder "temp" \
--path_to_audio /content/drive/MyDrive/translation/281474976710730/281474976714320.wav \
--language "hi" \
--device 0


2024-11-13 16:49:34.052483: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-13 16:49:34.082534: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-13 16:49:34.092073: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-13 16:49:34.113898: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Transcription: 
The attention mask is not set