# INSTALL REQUIRED PACKAGES

In [None]:
# Install the notebook's runtime dependencies via shell-escaped pip calls
# (--quiet / -q keep the installer output short).
# Only transformers is pinned (4.45.0); every other package resolves to whatever
# version pip selects at install time.
# NOTE(review): consider pinning the remaining packages so re-runs are reproducible.
!pip install --quiet torch torchvision torchaudio
!pip install --quiet transformers==4.45.0
!pip install --quiet librosa
!pip install --quiet soundfile
!pip install --quiet accelerate
!pip install -q huggingface_hub

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m106.1 MB/s[0m eta [36m0:00:00[0m
[?25h

# TEST FILE DOWNLOAD FROM HUGGINGFACE

In [None]:
import os
import shutil
import zipfile

from huggingface_hub import list_repo_files, hf_hub_download

# Dataset repository to mirror and the local folder that receives the copies.
repo_id = "bitwisemind/hackathon"
repo_type = "dataset"
download_dir = "./"

files = list_repo_files(repo_id=repo_id, repo_type=repo_type)
print(f"🔍 Found {len(files)} files in the repo.")

os.makedirs(download_dir, exist_ok=True)
downloaded_files = []

for file in files:
    # hf_hub_download returns a local path for the fetched file; a copy of it
    # is then placed under download_dir, keeping the repo-relative layout.
    file_path = hf_hub_download(repo_id=repo_id, filename=file, repo_type=repo_type)

    local_path = os.path.join(download_dir, file)
    # `or "."` keeps makedirs from failing when the joined path has no
    # directory component (os.makedirs("") raises FileNotFoundError).
    os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
    shutil.copy(file_path, local_path)

    downloaded_files.append(local_path)

print("✅ All files downloaded into:", download_dir)

# Extract the test archive with the standard library instead of `!unzip`:
# extractall() overwrites existing files without prompting, so the cell can be
# re-run (Restart & Run All) when Test/ already exists, and it does not flood
# the output with one line per extracted file.
# With the default download_dir ("./") this yields ./Test/, as before.
test_zip_path = os.path.join(download_dir, "Test.zip")
with zipfile.ZipFile(test_zip_path) as test_archive:
    test_archive.extractall(download_dir)
    print(f"📂 Extracted {len(test_archive.namelist())} entries from {test_zip_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


🔍 Found 5 files in the repo.


.gitattributes: 0.00B [00:00, ?B/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Test.zip:   0%|          | 0.00/63.0M [00:00<?, ?B/s]

Train.zip:   0%|          | 0.00/289M [00:00<?, ?B/s]

Train_annotation.zip:   0%|          | 0.00/58.5k [00:00<?, ?B/s]

✅ All files downloaded into: ./
Archive:  ./Test.zip
   creating: Test/
  inflating: Test/test_001.wav       
  inflating: Test/test_002.wav       
  inflating: Test/test_003.wav       
  inflating: Test/test_004.wav       
  inflating: Test/test_005.wav       
  inflating: Test/test_006.wav       
  inflating: Test/test_007.wav       
  inflating: Test/test_008.wav       
  inflating: Test/test_009.wav       
  inflating: Test/test_010.wav       
  inflating: Test/test_011.wav       
  inflating: Test/test_012.wav       
  inflating: Test/test_013.wav       
  inflating: Test/test_014.wav       
  inflating: Test/test_015.wav       
  inflating: Test/test_016.wav       
  inflating: Test/test_017.wav       
  inflating: Test/test_018.wav       
  inflating: Test/test_019.wav       
  inflating: Test/test_020.wav       
  inflating: Test/test_021.wav       
  inflating: Test/test_022.wav       
  inflating: Test/test_023.wav       
  inflating: Test/test_024.wav       
  inflating: Tes

# IMPORT LIBRARIES

In [None]:
# --- Standard library ---
import gc
import os
import time
import warnings

# --- Third-party ---
import pandas as pd
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
from tqdm import tqdm

# Suppress every Python warning from this point on.
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

✅ Libraries imported successfully!


# SYSTEM CONFIGURATION CHECK

In [None]:
# Select the compute device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

separator = "=" * 60
print("\n" + separator)
print("SYSTEM CONFIGURATION")
print(separator)
print(f"Device: {device.upper()}")

if device != "cuda":
    print("⚠️ Warning: Running on CPU. Inference will be slower.")
else:
    # Report the name and total memory (in GiB) of GPU index 0.
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_total_bytes / 1024**3:.2f} GB")



SYSTEM CONFIGURATION
Device: CUDA
GPU: Tesla T4
Memory: 14.74 GB


# DEFINE PATHS AND CONFIGURATION

In [None]:
# Hub id of the Whisper checkpoint, folder holding the test .wav files,
# and the CSV file the predictions are written to.
MODEL_NAME = "bitwisemind/bitwisemind-whisper-medium-bangla"
TEST_AUDIO_PATH = "./Test"
OUTPUT_CSV_PATH = "bitwisemind_predictions_hidden.csv"

separator = "=" * 60
print("\n" + separator)
print("CONFIGURATION")
print(separator)

# Echo the settings so they are visible in the notebook output.
for label, value in (
    ("Model", MODEL_NAME),
    ("Input Path", TEST_AUDIO_PATH),
    ("Output CSV", OUTPUT_CSV_PATH),
):
    print(f"{label}: {value}")


CONFIGURATION
Model: bitwisemind/bitwisemind-whisper-medium-bangla
Input Path: ./Test
Output CSV: bitwisemind_predictions_hidden.csv


# LOAD MODEL AND PROCESSOR

In [None]:
separator = "=" * 60
print("\n" + separator)
print("LOADING MODEL")
print(separator)

# Wall-clock timer covering the whole load (download/caching included).
model_load_start = time.time()

try:
    # Free Python garbage and cached CUDA blocks before the load.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("📦 Loading model from HuggingFace Hub...")
    print("   (First run will download model, subsequent runs use cache)")

    # Processor and model are both fetched from the same checkpoint id.
    processor = WhisperProcessor.from_pretrained(MODEL_NAME)
    model = WhisperForConditionalGeneration.from_pretrained(
        MODEL_NAME, low_cpu_mem_usage=True
    )

    # Drop the `language` / `task` attributes from the generation config when
    # present, and clear `forced_decoder_ids`.
    generation_config = model.generation_config
    for forced_attr in ('language', 'task'):
        if hasattr(generation_config, forced_attr):
            delattr(generation_config, forced_attr)
    generation_config.forced_decoder_ids = None

    # Place the weights on the selected device and switch to eval mode.
    model.to(device)
    model.eval()

    model_load_end = time.time()
    model_load_time = model_load_end - model_load_start

    print("✅ Model loaded successfully!")
    print(f"⏱️ Model loading time: {model_load_time:.2f} seconds")

    # Same cleanup once more after the load.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

except Exception as e:
    # Report the failure, then re-raise so the cell stops with the original error.
    print(f"❌ Error loading model: {e}")
    raise


LOADING MODEL
📦 Loading model from HuggingFace Hub...
   (First run will download model, subsequent runs use cache)


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/223 [00:00<?, ?B/s]

✅ Model loaded successfully!
⏱️ Model loading time: 47.33 seconds


# SCAN AUDIO FILES

In [None]:
separator = "=" * 60
print("\n" + separator)
print("SCANNING AUDIO FILES")
print(separator)

audio_files = []

# Guard clause: stop right away when the test folder is missing.
if not os.path.exists(TEST_AUDIO_PATH):
    print(f"❌ Error: Test folder not found at {TEST_AUDIO_PATH}")
    raise FileNotFoundError(f"Directory {TEST_AUDIO_PATH} does not exist")

# Sorted directory listing, narrowed to names ending in .wav (case-insensitive).
all_files = sorted(os.listdir(TEST_AUDIO_PATH))
audio_files = [name for name in all_files if name.lower().endswith('.wav')]

if audio_files:
    print(f"✅ Found {len(audio_files)} audio files")
    print(f"   First: {audio_files[0]}")
    print(f"   Last: {audio_files[-1]}")
else:
    print("⚠️ No .wav files found in the test folder")
    print("   Please ensure your audio files have .wav extension")



SCANNING AUDIO FILES
✅ Found 450 audio files
   First: test_001.wav
   Last: test_450.wav


# DEFINE TRANSCRIPTION FUNCTION

In [None]:
def transcribe_audio(audio_path, processor, model, device,
                     sampling_rate=16000, max_length=225, num_beams=5):
    """
    Transcribe a single audio file using the fine-tuned Whisper model.

    Args:
        audio_path: Path to the audio file
        processor: Whisper processor; it is called on the waveform to build the
            input features and its ``batch_decode`` turns generated ids into text
        model: Fine-tuned Whisper model (only ``generate`` is used)
        device: Device to run inference on
        sampling_rate: Rate (Hz) requested from ``librosa.load`` and reported to
            the processor. Defaults to 16000, the value previously hard-coded.
            NOTE(review): presumably the Whisper feature extractor only accepts
            16 kHz; confirm before passing anything else.
        max_length: ``max_length`` forwarded to ``model.generate`` (default 225)
        num_beams: ``num_beams`` forwarded to ``model.generate`` (default 5)

    Returns:
        The decoded transcription with surrounding whitespace stripped, or an
        empty string if anything raised along the way. Errors are printed, not
        re-raised, so callers must treat "" as "failed or empty".
    """
    try:
        # Load and preprocess audio. The rate returned by librosa is the one
        # handed to the processor, so the two can never disagree.
        audio_array, loaded_rate = librosa.load(audio_path, sr=sampling_rate)
        input_features = processor(
            audio_array,
            sampling_rate=loaded_rate,
            return_tensors="pt"
        ).input_features.to(device)

        # Generate transcription (no autograd bookkeeping needed for inference)
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                max_length=max_length,
                num_beams=num_beams,
            )

        # Decode transcription; a single file was submitted, so take item 0
        transcription = processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]

        return transcription.strip()

    except Exception as e:
        # Deliberate best-effort: one bad file must not abort a long batch run.
        print(f"   ⚠️ Error: {os.path.basename(audio_path)} - {str(e)}")
        return ""

# PROCESS ALL AUDIO FILES (INFERENCE)

In [None]:
print(f"\n{'='*60}")
print(f"INFERENCE STARTED")
print(f"{'='*60}")

# One {"audio", "text"} record per file that reached the model.
results = []
# Files skipped before inference PLUS files whose transcription came back empty.
failed_count = 0

# Start timing inference only
inference_start = time.time()

# Process each audio file with tqdm progress bar
with tqdm(total=len(audio_files), desc="Transcribing", unit="file", ncols=100,
          bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]') as pbar:

    for idx, audio_file in enumerate(audio_files):
        audio_path = os.path.join(TEST_AUDIO_PATH, audio_file)

        # Validate file: missing or zero-byte files are skipped (no record stored)
        if not os.path.exists(audio_path) or os.path.getsize(audio_path) == 0:
            failed_count += 1
            pbar.update(1)
            continue

        # Transcribe audio
        transcription = transcribe_audio(audio_path, processor, model, device)

        # transcribe_audio() swallows exceptions and returns "" instead, so an
        # empty string is the only failure signal available here. Count it;
        # previously such files were silently reported as successes.
        if not transcription:
            failed_count += 1

        # Store result (kept even when empty so every processed file has a row)
        results.append({
            "audio": audio_file,
            "text": transcription
        })

        # Update progress bar
        pbar.update(1)

        # Periodic memory cleanup (every 50 files)
        if (idx + 1) % 50 == 0:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

# End timing
inference_end = time.time()
inference_time = inference_end - inference_start

# Final memory cleanup
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Only records with non-empty text count as successful transcriptions.
successful_count = sum(1 for record in results if record["text"])

print(f"\n{'='*60}")
print(f"INFERENCE COMPLETED")
print(f"{'='*60}")
print(f"✅ Successfully transcribed: {successful_count} files")
print(f"⏱️ Total Inference Time: {inference_time:.2f} seconds ({inference_time/60:.2f} minutes)")
if len(results) > 0:
    # Averaged over every file that reached the model, failed ones included.
    print(f"⚡ Average time per file: {inference_time/len(results):.2f} seconds")
if failed_count > 0:
    print(f"⚠️ Failed/Skipped: {failed_count} files")



INFERENCE STARTED


Transcribing:   0%|                                                                | 0/450 [00:00<?]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Transcribing: 100%|██████████████████████████████████████████████████████████| 450/450 [16:37<00:00]



INFERENCE COMPLETED
✅ Successfully transcribed: 450 files
⏱️ Total Inference Time: 997.21 seconds (16.62 minutes)
⚡ Average time per file: 2.22 seconds


# SAVE RESULTS TO CSV

In [None]:
separator = "=" * 60
print("\n" + separator)
print("SAVING RESULTS")
print(separator)

if results:
    # Build the frame from the collected records with a fixed column order.
    df = pd.DataFrame(results).loc[:, ["audio", "text"]]

    # Write the CSV without the index column, encoded as UTF-8 with a BOM.
    df.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8-sig')

    print(f"✅ CSV file saved: {OUTPUT_CSV_PATH}")
    print(f"   Total records: {len(df)}")
    print(f"   Columns: {', '.join(df.columns)}")
else:
    # Nothing was collected, so no file is written.
    print("❌ No transcriptions generated.")
    print("   Please check your audio files and model configuration.")



SAVING RESULTS
✅ CSV file saved: bitwisemind_predictions_hidden.csv
   Total records: 450
   Columns: audio, text


# DISPLAY STATISTICS

In [None]:
if results:
    separator = "=" * 60
    print("\n" + separator)
    print("STATISTICS")
    print(separator)

    # A transcription is "empty" when it is missing (NaN) or the empty string.
    text_column = df['text']
    empty_count = text_column.isna().sum() + text_column.eq('').sum()
    valid_count = len(df) - empty_count

    print(f"Total files processed: {len(df)}")
    print(f"Valid transcriptions: {valid_count}")
    print(f"Empty transcriptions: {empty_count}")

    if valid_count > 0:
        # Character-length summary over the non-empty transcriptions only.
        valid_df = df[text_column.str.len() > 0]
        text_lengths = valid_df['text'].str.len()
        avg_len = text_lengths.mean()
        min_len = text_lengths.min()
        max_len = text_lengths.max()

        print("\nTranscription length:")
        print(f"   Average: {avg_len:.0f} characters")
        print(f"   Range: {min_len} - {max_len} characters")



STATISTICS
Total files processed: 450
Valid transcriptions: 450
Empty transcriptions: 0

Transcription length:
   Average: 29 characters
   Range: 15 - 65 characters


# DISPLAY SAMPLE TRANSCRIPTIONS

In [None]:
if results:
    separator = "=" * 60
    print("\n" + separator)
    print("SAMPLE TRANSCRIPTIONS")
    print(separator)

    # Preview the first three rows, numbered from the frame index plus one.
    for sample in df.head(3).itertuples():
        print(f"\n{sample.Index+1}. {sample.audio}")
        # Texts longer than 100 characters are cut and marked with an ellipsis.
        if len(sample.text) > 100:
            text_preview = sample.text[:100] + '...'
        else:
            text_preview = sample.text
        print(f"   {text_preview}")


SAMPLE TRANSCRIPTIONS

1. test_001.wav
   তুমি কি খেলতে যাচ্ছো?

2. test_002.wav
   তুমি কি আমাকে কলমটা দেবে?

3. test_003.wav
   আজ দুপুরে রাস্তায় পানি জমেছিল।
