In [None]:
import pandas as pd
from datasets import load_dataset

# Fetch the dataset by its repository id
dataset = load_dataset(path="ganga4364/Dilgo-Khyentse-Rinpoche-dataset")

# Flatten the "test" split into a DataFrame and write it out as test.csv
# (without the index column) so that it can be re-read from disk later.
test_split = dataset["test"]
test_df = test_split.to_pandas()
test_df.to_csv('test.csv', index=False)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
from datasets import Dataset, Audio
import logging
import requests

# Route ERROR-level log records to failed_downloads.log, one timestamped
# line per record.
logging.basicConfig(
    level=logging.ERROR,
    filename='failed_downloads.log',
    format='%(asctime)s - %(message)s',
)

# Two speech-recognition pipelines whose transcriptions are stored side by side:
# generator1 fills the "inference_ft" column, generator2 "inference_base_model".
generator1 = pipeline(
    "automatic-speech-recognition",
    model="ganga4364/mms_300_khentse_Rinpoche-Checkpoint-58000",
)
generator2 = pipeline(
    "automatic-speech-recognition",
    model="ganga4364/mms_300_v4.96000",
)

# Download one row's audio file into ./downloads (cached between runs)
def download_audio(row):
    """Fetch the audio file referenced by ``row["url"]`` into ``./downloads``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``"url"`` entry.

    Returns:
        The local path ``./downloads/<basename of url>`` if the file is already
        present or was downloaded successfully, otherwise ``None``. Download
        problems are logged at ERROR level instead of being raised, so a single
        bad row cannot abort an ``apply`` over a whole chunk.
    """
    url = row["url"]

    # A missing URL (e.g. NaN read from CSV) or one without a file name cannot
    # be mapped to a local path; report it like any other failed download.
    if not isinstance(url, str) or not os.path.basename(url):
        logging.error(f"Failed to download {url!r}: missing or unusable URL")
        return None

    file_name = os.path.basename(url)
    save_path = f"./downloads/{file_name}"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    if os.path.exists(save_path):  # Skip if file exists
        return save_path

    # Write to a side file first and rename only once it is complete, so an
    # interrupted run never leaves a truncated file that later looks "cached".
    partial_path = f"{save_path}.part"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Check for HTTP errors
        with open(partial_path, 'wb') as f:
            f.write(response.content)
        os.replace(partial_path, save_path)
        return save_path

    except Exception as e:
        # Log the URL rather than a row column that may not exist: a KeyError
        # raised here would escape the handler and abort the whole run.
        logging.error(f"Failed to download {url}: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return None  # Return None if download failed

# Run both ASR pipelines over one batch of decoded audio clips
def _decoded_clips(batch):
    """Return the batch column that holds decoded audio clips.

    Looks at ``"audio"`` and then ``"path"`` and returns the first one that is
    present and does not contain plain strings (i.e. undecoded file paths).
    """
    for column in ("audio", "path"):
        if column in batch:
            clips = batch[column]
            if len(clips) == 0 or not isinstance(clips[0], (str, bytes)):
                return clips
    raise KeyError("batch has no decoded audio column (expected 'audio' or 'path')")


def process_inference(batch):
    """Transcribe a batch of audio clips with ``generator1`` and ``generator2``.

    Intended for ``Dataset.map(..., batched=True)``: each column of ``batch``
    is a list with one entry per example.

    Args:
        batch: Mapping of column name -> list of values. The audio column
            (``"audio"`` or ``"path"``) must hold decoded clips, each exposing
            ``"array"`` and ``"sampling_rate"``.

    Returns:
        The same ``batch`` object with two added list columns:
        ``"inference_ft"`` (texts from ``generator1``) and
        ``"inference_base_model"`` (texts from ``generator2``).

    Raises:
        KeyError: If the batch contains no decoded audio column.
    """
    clips = _decoded_clips(batch)

    def _pipeline_inputs():
        # Fresh dicts for every call: the pipeline may consume (pop) their keys.
        # The sampling rate travels with the samples so the pipeline can detect
        # a mismatch with the model's expected rate instead of silently
        # transcribing audio at the wrong speed.
        return [
            {"raw": clip["array"], "sampling_rate": clip["sampling_rate"]}
            for clip in clips
        ]

    if len(clips) == 0:
        batch["inference_ft"] = []
        batch["inference_base_model"] = []
        return batch

    # Perform inference with generator1
    results1 = generator1(_pipeline_inputs())
    batch["inference_ft"] = [result["text"] for result in results1]

    # Perform inference with generator2
    results2 = generator2(_pipeline_inputs())
    batch["inference_base_model"] = [result["text"] for result in results2]

    return batch


In [None]:
# Directory that receives one CSV per processed chunk; created up front so
# the processing loop can write into it.
output_dir = "chunks_test"
os.makedirs(output_dir, exist_ok=True)

# CSV with the rows to process, and the merged CSV written at the very end.
input_file = "test.csv"
output_file = "processed_test.csv"

In [None]:

# Register DataFrame.progress_apply once, not once per chunk.
tqdm.pandas(desc="Downloading audio files")

# Decode every clip at the rate the model's feature extractor expects, so the
# samples handed to the pipelines are at the right rate.
# NOTE(review): assumes generator2 expects the same rate as generator1 - confirm.
target_sampling_rate = generator1.feature_extractor.sampling_rate

for i, chunk in enumerate(pd.read_csv(input_file, chunksize=1000), start=1):
    if chunk.empty:
        print(f"Chunk {i} is empty. Skipping.")
        continue

    # The chunk's CSV doubles as the "already done" marker, which makes the
    # loop resumable after an interruption.
    chunk_file = os.path.join(output_dir, f"chunk_{i}.csv")
    if os.path.exists(chunk_file):
        print(f"Skipping chunk {i}, already processed.")
        continue

    chunk["path"] = chunk.progress_apply(download_audio, axis=1)

    # Keep only rows whose download succeeded. Building a fresh frame (instead
    # of an in-place reset on a filtered slice) also gives Dataset.from_pandas
    # a clean 0..n-1 index.
    downloaded = chunk[chunk["path"].notnull()].reset_index(drop=True)
    if downloaded.empty:
        # Do not write a chunk file: that would mark the chunk as processed
        # even though nothing was transcribed.
        print(f"Chunk {i}: no audio could be downloaded. Skipping.")
        continue

    # Convert to Dataset and decode the local files as audio
    chunk_dataset = Dataset.from_pandas(downloaded)
    chunk_dataset = chunk_dataset.cast_column("path", Audio(sampling_rate=target_sampling_rate))

    # Perform batched inference
    chunk_dataset = chunk_dataset.map(process_inference, batched=True, batch_size=8)

    # After the cast, "path" holds audio data rather than the file path. Swap
    # it back to the plain path string (map preserves row order) so the CSV
    # stays small and readable.
    path_position = chunk_dataset.column_names.index("path")
    result_df = chunk_dataset.remove_columns("path").to_pandas()
    result_df.insert(path_position, "path", downloaded["path"].to_numpy())

    # Write to a side file and rename when complete, so an interrupted write
    # is never mistaken for a finished chunk on the next run.
    partial_file = f"{chunk_file}.tmp"
    result_df.to_csv(partial_file, index=False)
    os.replace(partial_file, chunk_file)
    print(f"Saved chunk {i} to {chunk_file}")


In [None]:
# Merge all chunk files into the final output, in chunk order
def _chunk_sort_key(file_name):
    """Sort key placing ``chunk_<N>.csv`` files in numeric order of N.

    A plain lexicographic sort would put ``chunk_10.csv`` before
    ``chunk_2.csv`` and scramble the row order of the merged file. Any other
    CSV in the directory sorts after the chunk files, by name.
    """
    stem = file_name[:-len(".csv")]
    prefix, _, number = stem.partition("_")
    if prefix == "chunk" and number.isdecimal():
        return (0, int(number), file_name)
    return (1, 0, file_name)


chunk_files = sorted(
    (f for f in os.listdir(output_dir) if f.endswith(".csv")),
    key=_chunk_sort_key,
)
if not chunk_files:
    # pd.concat would fail with an opaque "No objects to concatenate".
    raise ValueError(f"No chunk CSV files found in {output_dir!r}; nothing to merge.")

all_chunks = [pd.read_csv(os.path.join(output_dir, f)) for f in chunk_files]
final_df = pd.concat(all_chunks, ignore_index=True)
final_df.to_csv(output_file, index=False)
print(f"All chunks merged and saved to {output_file}")
