In [None]:
%%capture
!pip install jiwer  datasets librosa pandas pyarrow

In [None]:
import pandas as pd
import jiwer 
import time
import multiprocessing as mp
from datasets import load_dataset, Dataset, Audio, Value, Features
import gc
import psutil
import os

In [None]:
# Process-wide settings, applied once before any heavy work starts.
# Both variables are plain strings read by the libraries that honour them;
# presumably they are set to avoid thread oversubscription once worker
# processes are spawned -- TODO confirm against the target machine.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "OMP_NUM_THREADS": "1",
})

# Switch pandas to copy-on-write mode.
pd.set_option('mode.copy_on_write', True)

In [None]:
# Garbage-collector thresholds for generations 0, 1 and 2, in that order.
GC_THRESHOLDS = (100, 10, 10)
gc.set_threshold(*GC_THRESHOLDS)

In [None]:
# Schema of the full dataset: the source columns plus the transcription and
# the WER fields computed by this notebook.
features = Features(dict(
    id=Value("string"),
    audio=Audio(),
    audio_language=Value("string"),
    text=Value("string"),
    transcription=Value("string"),
    wer=Value("float32"),
    wer_range=Value("string"),
    prompt=Value("string"),
    duration=Value("float32"),
    speaker_id=Value("string"),
))

# Schema of the cleaned dataset: same as above without the
# transcription / wer / wer_range columns.
features1 = Features(dict(
    id=Value("string"),
    audio=Audio(),
    audio_language=Value("string"),
    text=Value("string"),
    prompt=Value("string"),
    duration=Value("float32"),
    speaker_id=Value("string"),
))

In [None]:
# CSV with the Whisper transcriptions; it is read below with `id` and
# `transcription` columns.
csv_file = "/kin/whisper_transcriptions.csv"

# Hugging Face access token. It is taken from the HF_TOKEN environment
# variable so the secret never has to be written into the notebook. When the
# variable is unset or empty the value is None (not an empty string), which
# lets `load_dataset` / `push_to_hub` fall back to the token cached by a
# prior Hugging Face login.
HF_TOKEN = os.environ.get("HF_TOKEN") or None

In [None]:
# Start the wall-clock timer for the CSV loading stage; the elapsed time is
# reported a few cells below.
start_time = time.time()

In [None]:
# Read the transcription CSV (forcing `id` and `transcription` to pandas'
# string dtype), index the rows by `id`, and keep a copy of the id as a
# regular column as well.
df = (
    pd.read_csv(
        csv_file,
        engine='c',
        dtype={'id': 'string', 'transcription': 'string'},
    )
    .set_index('id')
    .assign(id=lambda frame: frame.index)
)

In [None]:
# Per-row lookup table keyed by id: {id: {column: value, ...}}.
df_dict = df.to_dict(orient='index')
# Distinct ids, kept as a set for fast membership tests while filtering.
df_ids = set(pd.unique(df['id']))

In [None]:
# Everything needed from the DataFrame now lives in `df_dict` / `df_ids`, so
# drop the name and run a collection pass. NOTE: `df` is undefined from this
# point on.
del df
gc.collect()

In [None]:
print(f"CSV loaded in {time.time() - start_time:.2f} seconds")

# `df` has already been deleted at this point, so referencing `df.shape`
# would raise NameError. Rebuild the shape from `df_dict` instead: one entry
# per row, and every row dict carries one key per column. (For an empty CSV
# the column count cannot be recovered and is reported as 0.)
n_rows = len(df_dict)
n_cols = len(next(iter(df_dict.values()), {}))
print(f"DataFrame shape: {(n_rows, n_cols)}")

In [None]:
print("Loading dataset...")
start_time = time.time()

# Use one worker per CPU core, capped at 32.
load_workers = min(32, mp.cpu_count())
dataset = load_dataset(
    'jq/kinyarwanda-speech-hackathon',
    split='train',
    token=HF_TOKEN,
    num_proc=load_workers,
)
print(f"Dataset loaded in {time.time() - start_time:.2f} seconds")

In [None]:
def filter_by_df_ids_batch(examples):
    """Batched predicate deciding which examples to keep.

    Args:
        examples: mapping holding an ``'id'`` sequence for the batch.

    Returns:
        A list with one boolean per id, True when that id is a member of the
        module-level ``df_ids`` set.
    """
    batch_ids = examples['id']
    return [candidate in df_ids for candidate in batch_ids]

print("Filtering with memory-efficient batching...")
# Call time.time() -- without the parentheses `start_time` would hold the
# function object itself, and any later `time.time() - start_time` would
# raise TypeError.
start_time = time.time()

In [None]:
# Keep only the examples whose id passes `filter_by_df_ids_batch`.
filter_options = dict(
    batched=True,
    batch_size=2000,
    num_proc=1,  # SINGLE PROCESS - no memory multiplication
    desc="Filtering dataset",
)
filtered_ds = dataset.filter(filter_by_df_ids_batch, **filter_options)

In [None]:
# Inclusive upper bound -> label for each WER bucket, in ascending order.
# The labels are kept exactly as they were (note that the first five use an
# en dash and the remaining ones a hyphen) so previously produced data still
# matches.
_WER_BUCKETS = (
    (0.10, "0–0.10"),
    (0.20, "0.11–0.20"),
    (0.30, "0.21–0.30"),
    (0.40, "0.31–0.40"),
    (0.50, "0.41–0.50"),
    (0.60, "0.51-0.60"),
    (0.70, "0.61-0.70"),
    (0.80, "0.71-0.80"),
    (0.90, "0.81-0.90"),
    (1.00, "0.91-1.00"),
)


def _wer_range_label(wer):
    """Return the label of the first bucket whose upper bound is >= ``wer``."""
    for upper_bound, label in _WER_BUCKETS:
        if wer <= upper_bound:
            return label
    # Anything above 1.0 (including inf and NaN) lands here.
    return ">1.00"


def calculate_wer_batch(examples):
    """Add ``wer``, ``wer_range`` and ``transcription`` columns to a batch.

    For every example the transcription is taken from the module-level
    ``df_dict`` (keyed by example id); if the id is not there, the batch's own
    ``transcription`` column is used when present. The word error rate is
    computed with ``jiwer.wer`` on the lower-cased reference (``text``) and
    transcription.

    An example gets ``wer = inf``, ``wer_range = ">1.00"`` when its reference
    or transcription is missing/empty, or when processing it raises. Exactly
    one value is appended per example to each output column, so the three new
    columns always have the same length as the batch.

    Args:
        examples: mapping of column name -> list of values for the batch;
            must contain ``'id'``.

    Returns:
        The same ``examples`` mapping with the three columns set.
    """
    wers = []
    wer_ranges = []
    transcriptions = []

    has_transcription_column = 'transcription' in examples

    for i, example_id in enumerate(examples['id']):
        try:
            if example_id in df_dict:
                transcription = df_dict[example_id].get('transcription', '')
            elif has_transcription_column:
                transcription = examples['transcription'][i]
            else:
                transcription = ''

            # Missing CSV cells arrive as pd.NA / NaN / None rather than as
            # strings; treat every non-string value as "no transcription".
            if not isinstance(transcription, str):
                transcription = ''

            reference = examples.get('text', [''])[i]

            if not reference or not transcription:
                wer = float('inf')
            else:
                wer = jiwer.wer(reference.lower(), transcription.lower())

            wer_range = _wer_range_label(wer)

        except Exception as e:
            # Best effort: report the failure and record the example as
            # unusable instead of aborting the whole batch.
            print(f"Error processing {example_id}: {e}")
            transcription = ''
            wer = float('inf')
            wer_range = ">1.00"

        # Append once per example, after the outcome is known, so the three
        # lists can never get out of step with each other.
        transcriptions.append(transcription)
        wers.append(wer)
        wer_ranges.append(wer_range)

    # Return the batch with new fields
    examples['wer'] = wers
    examples['wer_range'] = wer_ranges
    examples['transcription'] = transcriptions

    return examples

In [None]:
print("Calculating WER with batch processing...")
start_time = time.time()

# Examples handed to `calculate_wer_batch` per call; tune to available memory.
batch_size = 500
# One worker per CPU core, capped at 32, to balance speed against memory.
num_proc = min(32, mp.cpu_count())

ds_with_wer = filtered_ds.map(
    function=calculate_wer_batch,
    batched=True,
    batch_size=batch_size,
    num_proc=num_proc,
    desc="Calculating WER",
)

print(f"WER calculation completed in {time.time() - start_time:.2f} seconds")

In [None]:
print("Converting to pandas...")
start_time = time.time()

# Turn the WER-annotated dataset into a DataFrame for the final selection.
result_df = ds_with_wer.to_pandas()

elapsed = time.time() - start_time
print(f"Pandas conversion completed in {elapsed:.2f} seconds")

In [None]:
print("Filtering and preparing final dataset...")
start_time = time.time()

# Columns carried over into the cleaned dataset (the transcription and WER
# fields are left out).
cleaned_columns = ['id', 'audio', 'audio_language', 'text', 'prompt', 'duration', 'speaker_id']
# Keep only rows whose WER is strictly below 0.90.
res_df = result_df.loc[result_df['wer'] < 0.90, cleaned_columns]

print(f"Final filtering completed in {time.time() - start_time:.2f} seconds")
print(f"Final dataset size: {len(res_df)}")


In [None]:
print("Creating and uploading datasets...")
start_time = time.time()

# Cleaned subset, cast to the reduced schema.
cleaned_dataset = Dataset.from_pandas(
    df=res_df,
    features=features1,
    preserve_index=False,
)
# Every filtered example with its transcription and WER fields, cast to the
# full schema.
full_dataset = Dataset.from_pandas(
    df=result_df,
    features=features,
    preserve_index=False,
)

In [None]:
# Arguments shared by both uploads.
upload_options = dict(
    split="train",
    token=HF_TOKEN,
    max_shard_size="500MB",
)

# NOTE(review): the two uploads target different repo ids
# ("...-speech-hackathon" vs "...-hackathon") and only the first one is
# marked private -- confirm that both differences are intentional.
cleaned_dataset.push_to_hub(
    "evie-8/kinyarwanda-speech-hackathon",
    config_name='train_cleaned',
    private=True,
    **upload_options,
)

full_dataset.push_to_hub(
    "evie-8/kinyarwanda-hackathon",
    **upload_options,
)

print(f"Upload completed in {time.time() - start_time:.2f} seconds")