# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
# src/preprocess.py
"""
Transcription and preprocessing pipeline:
1. Transcribe audio files to Urdu text (uses whisper if available).
2. Normalize Urdu text (strip diacritics, normalize alef/yeh forms, remove extra spaces).
3. Train SentencePiece model on all transcriptions to build vocabulary.
4. Produce shuffled train/val/test splits and save them as CSV files.
"""

import os
import glob
import csv
import random
import re
from pathlib import Path
import sentencepiece as spm
import pandas as pd
from tqdm import tqdm

# Optional whisper import. If not available, instruct students to install or use other ASR.
# Any failure while importing (not only ImportError) is treated as "not
# available"; the outcome is recorded in WHISPER_AVAILABLE so the rest of the
# module can still be imported and the other pipeline steps can run.
try:
    import whisper
    WHISPER_AVAILABLE = True
except Exception:
    WHISPER_AVAILABLE = False

# Directory whose files are globbed and transcribed by transcribe_with_whisper().
AUDIO_DIR = "data/audio"
# CSV written by transcription, then re-read and rewritten in place by
# normalize_all() with an added 'normalized' column.
TRANSCRIPT_CSV = "data/transcripts.csv"
# Default value for the `prefix` argument of train_sentencepiece(), which is
# passed to the trainer as model_prefix.
SP_MODEL_PREFIX = "data/ur_tokenizer"
# Default value for the `vocab_size` argument of train_sentencepiece().
SP_VOCAB_SIZE = 8000

# ---------- 1) Transcription ----------
def transcribe_with_whisper(model_size="small"):
    """Transcribe every file in AUDIO_DIR to Urdu text and save TRANSCRIPT_CSV.

    Every path matching ``AUDIO_DIR/*.*`` is passed to Whisper with the
    language forced to Urdu. A file that fails to transcribe is reported on
    stdout and recorded with an empty transcript, so one bad file does not
    abort the whole run. The result is a CSV with the columns ``filename``
    (base name of the audio file) and ``transcript``.

    Args:
        model_size: Whisper checkpoint name passed to ``whisper.load_model``.

    Raises:
        AssertionError: if the ``whisper`` package could not be imported.
    """
    assert WHISPER_AVAILABLE, "whisper is not installed. pip install openai-whisper"
    model = whisper.load_model(model_size)  # 'small' is a good compromise
    rows = []
    files = sorted(glob.glob(os.path.join(AUDIO_DIR, "*.*")))
    if not files:
        print(f"No audio files found in {AUDIO_DIR}")
    for fpath in tqdm(files, desc="Transcribing audio"):
        try:
            result = model.transcribe(fpath, language="ur", task="transcribe")
            # Guard against a missing or None "text" entry before stripping.
            text = (result.get("text") or "").strip()
        except Exception as e:
            print(f"Error transcribing {fpath}: {e}")
            text = ""
        rows.append({"filename": os.path.basename(fpath), "transcript": text})
    # Name the columns explicitly: with an empty `rows` list the frame would
    # otherwise have no columns, and the CSV would lack the 'transcript'
    # header that normalize_all() reads back.
    df = pd.DataFrame(rows, columns=["filename", "transcript"])
    # Make sure the output directory exists before writing.
    Path(TRANSCRIPT_CSV).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(TRANSCRIPT_CSV, index=False, encoding="utf-8")
    print(f"Saved transcripts to {TRANSCRIPT_CSV}")

# ---------- 2) Normalization ----------
# Simple Urdu normalization: remove harakat/diacritics and map multiple alef/yeh forms
# Combining marks removed during normalization: U+0610-U+061A and
# U+064B-U+0655 (harakat, shadda, sukun, maddah/hamza marks, ...).
URDU_DIACRITICS = [
    '\u0610', '\u0611', '\u0612', '\u0613', '\u0614', '\u0615', '\u0616', '\u0617',
    '\u0618', '\u0619', '\u061A', '\u064B', '\u064C', '\u064D', '\u064E', '\u064F',
    '\u0650', '\u0651', '\u0652', '\u0653', '\u0654', '\u0655'
]
# A single character class matches the same characters as a 22-way
# alternation but lets the regex engine test them in one step.
DIACRITIC_PATTERN = re.compile("[" + "".join(URDU_DIACRITICS) + "]")

# Letter variants folded to one canonical form, applied in a single pass.
_LETTER_FOLD = str.maketrans({
    '\u0623': '\u0627',  # alef with hamza above -> plain alef
    '\u0625': '\u0627',  # alef with hamza below -> plain alef
    '\u0622': '\u0627',  # alef with madda       -> plain alef
    '\u0671': '\u0627',  # alef wasla            -> plain alef
    '\u064A': '\u06CC',  # Arabic yeh            -> Farsi/Urdu yeh (U+06CC)
    '\u0649': '\u06CC',  # alef maksura          -> Farsi/Urdu yeh (U+06CC)
})
_WHITESPACE_RUN = re.compile(r'\s+')


def normalize_urdu(text: str) -> str:
    """Return *text* with diacritics removed, alef/yeh variants unified and
    whitespace collapsed.

    Steps, in order:
      1. delete every character listed in URDU_DIACRITICS;
      2. map alef variants to U+0627 and yeh variants to U+06CC;
      3. replace each run of whitespace with one space and trim both ends.

    Hamza / ta marbuta and other letters are intentionally left untouched.

    Args:
        text: raw transcript; any falsy value (``""``, ``None``) yields ``""``.

    Returns:
        The normalized string, never with leading or trailing whitespace.
    """
    if not text:
        return ""
    text = DIACRITIC_PATTERN.sub("", text)   # strip diacritics
    text = text.translate(_LETTER_FOLD)      # unify alef / yeh forms
    # Trim *after* the other steps: deleting a diacritic at either end of the
    # string can expose whitespace that an up-front strip() would have missed.
    return _WHITESPACE_RUN.sub(' ', text).strip()

def normalize_all(transcript_csv=TRANSCRIPT_CSV):
    """Add a 'normalized' column to the transcript CSV, rewriting it in place.

    The 'transcript' column is read, missing values are treated as empty
    strings, every entry is passed through normalize_urdu(), and the table is
    saved back to the same path.

    Args:
        transcript_csv: path of the CSV to read and overwrite.
    """
    frame = pd.read_csv(transcript_csv, encoding="utf-8")
    # Missing transcripts come back from pandas as NaN; turn them into "".
    raw_text = frame['transcript'].fillna("").astype(str)
    frame['normalized'] = raw_text.apply(normalize_urdu)
    frame.to_csv(transcript_csv, index=False, encoding="utf-8")
    print(f"Normalized transcripts saved to {transcript_csv}")

# ---------- 3) SentencePiece training ----------
def train_sentencepiece(input_csv=TRANSCRIPT_CSV, prefix=SP_MODEL_PREFIX, vocab_size=SP_VOCAB_SIZE):
    """Train a unigram SentencePiece model on the normalized transcripts.

    The non-empty entries of the 'normalized' column are written one per line
    to ``data/all_transcripts.txt``, which is then used as the training
    corpus. The trainer is given ``prefix`` as its model_prefix.

    Args:
        input_csv: CSV containing a 'normalized' column.
        prefix: output path prefix for the trained model files.
        vocab_size: target vocabulary size (an upper bound, see below).

    Raises:
        ValueError: if the CSV contains no non-empty normalized transcript.
    """
    df = pd.read_csv(input_csv, encoding="utf-8")
    # Empty transcripts (e.g. failed transcriptions) carry no training signal.
    sentences = [s for s in df['normalized'].dropna().astype(str) if s.strip()]
    if not sentences:
        raise ValueError(f"No non-empty normalized transcripts found in {input_csv}")

    temp_text_path = "data/all_transcripts.txt"
    Path(temp_text_path).parent.mkdir(parents=True, exist_ok=True)
    # The model files are written next to `prefix`; make sure that exists too.
    Path(prefix).parent.mkdir(parents=True, exist_ok=True)
    with open(temp_text_path, "w", encoding="utf-8") as f:
        f.write("\n".join(sentences) + "\n")

    spm.SentencePieceTrainer.Train(
        input=temp_text_path,
        model_prefix=prefix,
        vocab_size=vocab_size,
        model_type='unigram',  # unigram often works well for morphologically rich languages
        # "<unk>" is already reserved by SentencePiece as the unknown piece;
        # listing it again as a user-defined symbol makes training abort, so
        # only the genuinely new symbols are declared here.
        user_defined_symbols=["<pad>", "<sos>", "<eos>"],
        # Treat vocab_size as a soft limit so a small corpus yields a smaller
        # vocabulary instead of a "vocabulary size too high" failure.
        hard_vocab_limit=False,
    )
    print("Trained SentencePiece model at", prefix + ".model")

# ---------- 4) Split dataset ----------
def create_splits(transcript_csv=TRANSCRIPT_CSV, out_dir="data/splits", train_frac=0.8, val_frac=0.1, test_frac=0.1, seed=42):
    """Shuffle the transcript table and write train/val/test CSV files.

    Rows are shuffled deterministically with ``seed``. The first
    ``int(n * train_frac)`` rows form the training split, the next
    ``int(n * val_frac)`` rows the validation split, and **all remaining
    rows** the test split, so no row is ever dropped.

    Args:
        transcript_csv: CSV to split.
        out_dir: directory receiving ``train.csv``, ``val.csv``, ``test.csv``.
        train_frac: fraction of rows for training.
        val_frac: fraction of rows for validation.
        test_frac: expected test fraction. The test split always takes the
            remainder; a warning is printed if this value disagrees with
            ``1 - train_frac - val_frac``.
        seed: random seed for the shuffle.

    Raises:
        ValueError: if ``train_frac`` or ``val_frac`` is negative, or if
            together they exceed 1.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            f"train_frac and val_frac must be non-negative and sum to at most 1 "
            f"(got train_frac={train_frac}, val_frac={val_frac})"
        )
    remainder_frac = 1.0 - train_frac - val_frac
    if test_frac is not None and abs(test_frac - remainder_frac) > 1e-6:
        # Keep the historical behavior (test = everything left over) but make
        # the mismatch visible instead of silently ignoring the argument.
        print(
            f"Warning: test_frac={test_frac} is ignored; the test split receives "
            f"the remaining {remainder_frac:.3f} of the rows."
        )

    df = pd.read_csv(transcript_csv, encoding="utf-8")
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    n = len(df)
    n_train = int(n * train_frac)
    n_val = int(n * val_frac)
    train = df.iloc[:n_train]
    val = df.iloc[n_train:n_train + n_val]
    test = df.iloc[n_train + n_val:]

    Path(out_dir).mkdir(parents=True, exist_ok=True)
    for split_name, split_df in (("train", train), ("val", val), ("test", test)):
        split_df.to_csv(os.path.join(out_dir, f"{split_name}.csv"), index=False, encoding="utf-8")
    print(f"Saved splits to {out_dir} (train/val/test sizes: {len(train)}/{len(val)}/{len(test)})")

# ---------- main ----------
if __name__ == "__main__":
    # 1. Transcribe
    if WHISPER_AVAILABLE:
        transcribe_with_whisper(model_size="small")  # change as necessary
    else:
        print("Whisper not available. Please transcribe audio by other means and create data/transcripts.csv")
        # Without whisper the remaining steps can only run on a transcript
        # file prepared by hand; stop cleanly if it is not there instead of
        # failing later with a FileNotFoundError inside normalize_all().
        if not os.path.exists(TRANSCRIPT_CSV):
            raise SystemExit(f"{TRANSCRIPT_CSV} not found; nothing to preprocess.")

    # 2. Normalize text
    normalize_all()

    # 3. Train SentencePiece
    train_sentencepiece()

    # 4. Create splits
    create_splits()
