In [None]:
# Install uv, which the following cells use to install packages.
!pip install uv

In [None]:
!uv pip install --quiet transformers
!uv pip install --quiet librosa
!uv pip install --quiet torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu128
!uv pip install --quiet datasets

In [None]:
# Show the installed torch package metadata to confirm that PyTorch is present
!pip show torch

In [None]:
# Install pandas, used below to read and reshape the CSV data
!uv pip install pandas

## Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive so files under MyDrive can be read and written
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

In [None]:
# Load the source CSV from Google Drive; later cells use its
# 'rec_url_gcp' and 'transcription_url_gcp' columns.
df = pd.read_csv('/content/drive/MyDrive/Whisper_Finetune/FT Data - data.csv')

In [None]:
# Preview the first rows of the loaded table
df.head()

In [None]:
# Give the two URL columns the short names used by the rest of the notebook
df = df.rename(
    {'rec_url_gcp': 'audio', 'transcription_url_gcp': 'transcription'},
    axis='columns',
)

In [None]:
# Keep only the two columns needed downstream. The explicit .copy() makes df2 an
# independent frame, so the later in-place assignment to df2["audio"] does not
# trigger pandas' SettingWithCopyWarning on a selection of df.
df2 = df[['audio','transcription']].copy()
df2.head(5)

## Downloading the audio files from their URLs for audio processing

In [None]:
import os
import requests
import pandas as pd

def download_audio(row, audio_col="audio", download_dir="audio_files", timeout=30):
    """
    Download the audio file referenced by one row, reusing a cached copy if present.

    Args:
        row: Mapping-like row (e.g. a pandas Series) holding the audio URL.
        audio_col: Key in ``row`` under which the audio URL is stored.
        download_dir: Directory the file is saved into; created if missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The local path ``<download_dir>/<basename of the URL>``. The path is
        returned even when the download fails (the failure is printed); in
        that case no file is left at that path.
    """
    os.makedirs(download_dir, exist_ok=True)
    audio_url = row[audio_col]
    local_path = os.path.join(download_dir, os.path.basename(audio_url))

    # Download only if not already present
    if os.path.exists(local_path):
        return local_path

    # Write to a temporary name first so an interrupted download is never
    # mistaken for a complete cached file on the next run.
    partial_path = local_path + ".part"
    try:
        response = requests.get(audio_url, timeout=timeout)
        # Without this check an HTTP error page would be saved as if it were audio.
        response.raise_for_status()
        with open(partial_path, 'wb') as f:
            f.write(response.content)
        os.replace(partial_path, local_path)
    except Exception as e:
        # Best effort: report the failure and let the caller continue with other rows.
        print(f"Failed to download {audio_url}: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return local_path

# Download every row's audio and overwrite the 'audio' column (a URL) with the local file path.
# download_audio returns the path even when a download failed, so a path here
# does not guarantee that the file exists.
df2["audio"] = df2.apply(download_audio, axis=1)

In [None]:
# Check that the 'audio' column now holds local file paths
df2.head(5)

In [None]:
# Switch to the column names that process_dataframe uses as its defaults
df2 = df2.rename(columns=dict(audio='audio_file_path', transcription='transcription_data'))

In [None]:
import json
import os
from pydub import AudioSegment
import re
import pandas as pd
import requests

def is_valid_hindi_text(text):
    """
    Report whether text consists solely of allowed characters.

    Allowed are characters of the Devanagari Unicode block (U+0900 to U+097F),
    whitespace, and the punctuation marks । , . ! ? ; : ( ) -
    Empty text is not accepted because at least one character is required.
    """
    # One or more allowed characters, anchored at both ends of the text
    allowed_only = re.compile(r'^[\u0900-\u097F\s।,.!?;:()\-]+$')
    if allowed_only.match(text) is None:
        return False
    return True

def split_segment_if_needed(segment, max_duration=30):
    """
    Split a segment into equal parts when it is longer than max_duration.

    Args:
        segment: Dict with numeric 'start' and 'end' keys (same unit as
            max_duration); all other keys are copied unchanged into each part.
        max_duration: Maximum allowed length of a single part. Must be > 0.

    Returns:
        ``[segment]`` (the same object) when it already fits, otherwise a list
        of shallow copies covering consecutive, equally long sub-ranges.

    Raises:
        ValueError: If max_duration is not positive.

    Note:
        Every part keeps the remaining fields of the original segment
        unchanged (for example its full 'text'); nothing but 'start' and
        'end' is adjusted here.
    """
    import math  # local import: only this function needs it

    if max_duration <= 0:
        raise ValueError("max_duration must be positive")

    duration = segment['end'] - segment['start']
    if duration <= max_duration:
        return [segment]

    # Smallest number of parts that keeps every part within max_duration.
    # (int(duration / max_duration) + 1 produced one part too many for exact
    # multiples, e.g. a 60 s segment became three 20 s parts instead of two 30 s.)
    num_parts = math.ceil(duration / max_duration)
    part_duration = duration / num_parts

    segments = []
    for i in range(num_parts):
        new_segment = segment.copy()
        new_segment['start'] = segment['start'] + (i * part_duration)
        if i == num_parts - 1:
            # Pin the final boundary to the original end to avoid float drift.
            new_segment['end'] = segment['end']
        else:
            new_segment['end'] = segment['start'] + ((i + 1) * part_duration)
        segments.append(new_segment)

    return segments

def clip_audio_from_transcription(audio_file_path, transcription_data, output_base_dir="output"):
    """
    Cut one audio file into per-segment WAV clips described by its transcription.

    Each transcription segment is first passed through split_segment_if_needed
    (max 30 seconds per part). Every resulting part is exported to the
    ``valid_clips`` or ``invalid_clips`` folder, depending on whether the
    segment text passes is_valid_hindi_text. Clips are named
    ``<speaker_id>_<n>.wav`` where n counts that speaker's clips in this file.

    Args:
        audio_file_path: Path to the input audio file (local path)
        transcription_data: Iterable of dicts with 'start', 'end' (seconds),
            'speaker_id' and 'text'
        output_base_dir: Directory joined onto /content/drive/MyDrive/ under
            which a folder named after the audio file is created

    Returns:
        Tuple of (stats_dict, segments_list)
        - stats_dict: Dictionary with statistics about the processing. On a
          load failure it has status "error" and zero counts.
        - segments_list: One dict per exported clip with keys clip_path,
          transcript, speaker_id, start_time, end_time, duration,
          is_valid_hindi and original_audio.

    Note:
        When a segment is split into several parts, each part is stored with
        the segment's full text; the text is not divided between the parts.
    """
    # Get audio filename without extension for folder naming
    audio_filename = os.path.splitext(os.path.basename(audio_file_path))[0]

    # Create output directories specific to this audio file within the Google Drive base directory
    audio_output_dir = os.path.join("/content/drive/MyDrive/", output_base_dir, audio_filename)
    valid_dir = os.path.join(audio_output_dir, "valid_clips")
    invalid_dir = os.path.join(audio_output_dir, "invalid_clips")
    os.makedirs(valid_dir, exist_ok=True)
    os.makedirs(invalid_dir, exist_ok=True)

    # Load the audio file
    print(f"\nProcessing: {audio_file_path}")
    try:
        audio = AudioSegment.from_file(audio_file_path)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return {
            "audio_file": audio_filename,
            "status": "error",
            "error": str(e),
            "valid_clips": 0,
            "invalid_clips": 0
        }, []

    # Track clip counts per speaker
    speaker_clip_counts = {}
    valid_count = 0
    invalid_count = 0
    segments_info = []

    # Process each transcription segment
    for segment in transcription_data:
        speaker_id = segment['speaker_id']
        text = segment['text']

        # Split segment if longer than 30 seconds
        split_segments = split_segment_if_needed(segment, max_duration=30)

        for sub_segment in split_segments:
            start_time = sub_segment['start']
            end_time = sub_segment['end']

            # Number the clip within its speaker (numbers are consumed even if the export fails)
            speaker_clip_counts[speaker_id] = speaker_clip_counts.get(speaker_id, 0) + 1
            clip_number = speaker_clip_counts[speaker_id]

            # Create clip filename
            clip_filename = f"{speaker_id}_{clip_number}.wav"

            # Check if text is valid Hindi and pick the output directory accordingly
            is_valid = is_valid_hindi_text(text)
            if is_valid:
                output_path = os.path.join(valid_dir, clip_filename)
            else:
                output_path = os.path.join(invalid_dir, clip_filename)
                print(f"  Invalid text in {clip_filename}: {text[:50]}...")

            # Extract audio clip (convert seconds to milliseconds)
            start_ms = int(start_time * 1000)
            end_ms = int(end_time * 1000)

            try:
                audio_clip = audio[start_ms:end_ms]
                audio_clip.export(output_path, format="wav")
            except Exception as e:
                print(f"  Error creating clip {clip_filename}: {e}")
                continue

            # Count a clip only once it has actually been written, so the
            # statistics always agree with the returned segment list.
            if is_valid:
                valid_count += 1
            else:
                invalid_count += 1

            # Store segment information
            segments_info.append({
                "clip_path": output_path,
                "transcript": text,
                "speaker_id": speaker_id,
                "start_time": start_time,
                "end_time": end_time,
                "duration": end_time - start_time,
                "is_valid_hindi": is_valid,
                "original_audio": audio_filename
            })

    stats = {
        "audio_file": audio_filename,
        "status": "success",
        "valid_clips": valid_count,
        "invalid_clips": invalid_count,
        "total_clips": valid_count + invalid_count,
        "speakers": len(speaker_clip_counts),
        "output_dir": audio_output_dir
    }

    print(f"  ✓ Created {valid_count} valid clips, {invalid_count} invalid clips")
    print(f"  Output: {audio_output_dir}")

    return stats, segments_info

def fetch_transcription_from_url(url):
    """
    Download a JSON document from a URL and return its parsed content.

    Args:
        url: Address of the JSON file holding the transcription data
    Returns:
        The parsed JSON value, or None when the request or the parsing
        fails (the reason is printed).
    """
    try:
        reply = requests.get(url, timeout=30)
        reply.raise_for_status()
        return reply.json()
    except requests.exceptions.RequestException as fetch_error:
        print(f"  Error fetching transcription from {url}: {fetch_error}")
    except json.JSONDecodeError as parse_error:
        print(f"  Error parsing JSON from {url}: {parse_error}")
    # Reached only after one of the handlers above has reported a failure
    return None

def _error_result(audio_path, message):
    """Build the per-file stats record used when a file cannot be processed."""
    return {
        "audio_file": os.path.basename(audio_path),
        "status": "error",
        "error": message,
        "valid_clips": 0,
        "invalid_clips": 0
    }


def process_dataframe(df, audio_col="audio_file_path", transcription_col="transcription_data",
                     output_base_dir="output"):
    """
    Process a pandas DataFrame with audio files and transcription data.

    For every row the transcription is resolved (fetched when it is an
    http(s) URL, parsed when it is a JSON string, used as-is when it is
    already a list) and handed to clip_audio_from_transcription. Rows whose
    transcription cannot be resolved to a list of segments are recorded as
    errors and skipped instead of aborting the whole run.

    Args:
        df: Pandas DataFrame
        audio_col: Name of column containing audio file paths
        transcription_col: Name of column containing transcription data URLs or data
        output_base_dir: Base directory in Google Drive for all output files

    Returns:
        Tuple of (summary_df, segments_df)
        - summary_df: DataFrame with processing statistics per audio file
        - segments_df: DataFrame with each audio segment and its transcript
        Both are empty when df has no rows; summary_df then still carries the
        audio_file, status, valid_clips and invalid_clips columns.
    """
    results = []
    all_segments = []

    print(f"Processing {len(df)} audio files...")
    print("=" * 60)

    for _, row in df.iterrows():
        audio_path = row[audio_col]
        transcription = row[transcription_col]

        # Check if transcription is a URL
        if isinstance(transcription, str) and transcription.startswith(('http://', 'https://')):
            print(f"\nFetching transcription from URL...")
            transcription = fetch_transcription_from_url(transcription)
            if transcription is None:
                results.append(_error_result(audio_path, "Failed to fetch transcription from URL"))
                continue

        # Handle transcription data if it's a JSON string (not URL)
        elif isinstance(transcription, str):
            try:
                transcription = json.loads(transcription)
            except json.JSONDecodeError as e:
                print(f"Error parsing transcription for {audio_path}: {e}")
                results.append(_error_result(audio_path, "Invalid JSON in transcription"))
                continue

        # A missing value (NaN) or a JSON object/scalar cannot be iterated as
        # segments; record the problem rather than crashing inside the clipper.
        if not isinstance(transcription, (list, tuple)):
            print(f"Unexpected transcription format for {audio_path}: {type(transcription).__name__}")
            results.append(_error_result(audio_path, "Transcription is not a list of segments"))
            continue

        # Process the audio file and collect segment info
        stats, segments = clip_audio_from_transcription(audio_path, transcription, output_base_dir)
        results.append(stats)
        all_segments.extend(segments)

    print("\n" + "=" * 60)
    print("Processing Complete!")

    # Create summary DataFrame
    results_df = pd.DataFrame(results)
    if results_df.empty:
        # Keep the columns the summary below reads, so an empty input does not raise KeyError.
        results_df = pd.DataFrame(columns=["audio_file", "status", "valid_clips", "invalid_clips"])

    # Create segments DataFrame
    segments_df = pd.DataFrame(all_segments)

    # Print summary
    print(f"\nSummary:")
    print(f"  Total audio files processed: {len(results_df)}")
    print(f"  Successful: {(results_df['status'] == 'success').sum()}")
    print(f"  Errors: {(results_df['status'] == 'error').sum()}")
    print(f"  Total valid clips: {results_df['valid_clips'].sum()}")
    print(f"  Total invalid clips: {results_df['invalid_clips'].sum()}")
    print(f"  Total segments: {len(segments_df)}")

    return results_df, segments_df

# Run the clipping pipeline over the downloaded recordings in df2
if __name__ == "__main__":

    data = df2

    # Process all audio files in the DataFrame
    # output_base_dir is joined onto /content/drive/MyDrive/ inside clip_audio_from_transcription
    results_df,segment_df = process_dataframe(data, output_base_dir="Whisper_Finetune/output_clips")

    # Save results to CSV in Google Drive; segments_results.csv is the file
    # the checkpoint section below reloads
    results_df.to_csv("/content/drive/MyDrive/Whisper_Finetune/processing_results.csv", index=False)
    segment_df.to_csv("/content/drive/MyDrive/Whisper_Finetune/segments_results.csv", index=False)

    print(f"\nResults saved to: /content/drive/MyDrive/Whisper_Finetune/processing_results.csv and segments_results.csv")

    # Display the per-file summary
    print("\nDetailed Results:")
    print(results_df)

## **Data Processing Checkpoint**
### Preprocessing is complete and the audio clips for training have been generated.
### New model training runs can resume from this point, since the audio data has been processed according to the model configuration and saved to disk for future use.

In [None]:
# Checkpoint entry point: reload the segment table that the clipping step saved to Drive
import pandas as pd
segment_df = pd.read_csv('/content/drive/MyDrive/Whisper_Finetune/segments_results.csv')

In [None]:
# Preview the reloaded segment table
segment_df.head(10)

In [None]:
# Keep only the segments flagged as valid Hindi text
final_df = segment_df[segment_df['is_valid_hindi'] == True]
final_df.head(20)

In [None]:
# Log in to the Hugging Face Hub (needed for the push_to_hub cells at the end)
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Keep the clip path and transcript, renamed to 'audio' and 'sentence',
# which are the column names prepare_dataset reads.
data_df = final_df[['clip_path', 'transcript']]
data_df = data_df.rename(columns={
    'clip_path': 'audio',
    'transcript': 'sentence'
})
data_df.head(5)

In [None]:
# Install the extra packages for the training section (evaluate is imported below)
!uv pip install --quiet evaluate tensorboard librosa gradio

In [None]:
# Install jiwer ahead of loading the "wer" metric below
!uv pip install --quiet jiwer

In [None]:
# !pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio librosa

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
import torch
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import librosa

# 1. Clearing CUDA cache before model training
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# 2. Google Drive folder used for the training outputs (Drive itself is mounted earlier in the notebook)
gdrive_path = "/content/drive/MyDrive/Whisper_Finetune/"

# 3. Create Hugging Face Dataset from the pandas DataFrame
# preserve_index=False keeps the filtered frame's pandas index from being
# stored as an extra dataset column.
dataset = Dataset.from_pandas(data_df, preserve_index=False)

# data_df is deliberately kept (it only holds paths and transcripts) so that
# this cell can be re-run without first re-running the cell that builds it.

# Splitting the dataset into train and test datasets
# A fixed seed makes the split, and therefore the reported WER, reproducible.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
common_voice_datasets = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"],
})

# 4. Loading Processor
model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name, language="hindi", task="transcribe")

# 5. Pre-process Data (with on-the-fly audio loading)
def prepare_dataset(batch):
    """
    Add model inputs and label ids to one dataset example.

    Reads the clip at batch["audio"] with librosa at 16 kHz and stores the
    processor's first input-feature entry in batch["input_features"]; if
    loading or feature extraction raises, the error is printed and an empty
    list is stored instead. batch["sentence"] is tokenized (truncated to at
    most 448 tokens) into batch["labels"].

    Returns:
        The same batch mapping with "input_features" and "labels" added.
    """
    clip_path = batch["audio"]
    try:
        # librosa.load gives back the samples together with the sampling rate
        waveform, rate = librosa.load(clip_path, sr=16000)
        extracted = processor(waveform, sampling_rate=rate)
        features = extracted.input_features[0]
    except Exception as load_error:
        print(f"Error processing {clip_path}: {load_error}")
        # Empty placeholder marks an example whose audio could not be processed
        features = []
    batch["input_features"] = features

    # 448 matches Whisper's max length
    encoded = processor.tokenizer(batch["sentence"], truncation=True, max_length=448)
    batch["labels"] = encoded.input_ids

    return batch

# Use .map() to apply the function. Audio is loaded and processed here.
# The original 'audio' and 'sentence' columns are kept.
print("Mapping dataset and processing audio on-the-fly (with a single process for debugging)...")
common_voice_datasets = common_voice_datasets.map(prepare_dataset) # single process (num_proc defaults to 1)
# prepare_dataset stores an empty list as input_features when a clip cannot be
# loaded; drop those rows here so they never reach the data collator during training.
common_voice_datasets = common_voice_datasets.filter(
    lambda features: len(features) > 0,
    input_columns=["input_features"],
)
print("Dataset mapping complete.")


# 6. Data Collator and Metrics (same as before)
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Collate a list of examples into one padded batch.

    Input features and labels are padded separately: the former through
    processor.feature_extractor.pad, the latter through processor.tokenizer.pad.
    Label positions outside the attention mask are set to -100, and a leading
    BOS column is removed when every row starts with it.
    """
    # Object exposing feature_extractor.pad, tokenizer.pad and tokenizer.bos_token_id
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Separate the two kinds of data because each needs its own padding
        audio_inputs = []
        token_inputs = []
        for example in features:
            audio_inputs.append({"input_features": example["input_features"]})
            token_inputs.append({"input_ids": example["labels"]})

        feature_extractor = self.processor.feature_extractor
        tokenizer = self.processor.tokenizer

        batch = feature_extractor.pad(audio_inputs, return_tensors="pt")
        padded_labels = tokenizer.pad(token_inputs, return_tensors="pt")

        # -100 marks padded label positions
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Strip the first column only when all rows begin with the BOS token
        starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance passed to the trainer, and the word-error-rate metric used by compute_metrics
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
metric = evaluate.load("wer")

def compute_metrics(pred):
    """
    Compute the word error rate, in percent, for one set of evaluation predictions.

    Args:
        pred: Object with ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, where -100 marks positions
            that are replaced by the pad token before decoding).

    Returns:
        Dict with the single key "wer".
    """
    pred_ids = pred.predictions
    # Work on a copy so the caller's label array keeps its -100 markers.
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# 7. Loading Model
model = WhisperForConditionalGeneration.from_pretrained(model_name)
# model.config.use_cache is left at its default; generation is set to Hindi
# transcription and any preset forced decoder ids are cleared.
model.generation_config.language = "hindi"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None
# 8. Define Training Arguments (checkpoints are written below gdrive_path)
training_args = Seq2SeqTrainingArguments(
    output_dir=gdrive_path + "whisper-small-hi-custom",
    # 1. Reduced batch size to prevent out-of-memory errors.
    per_device_train_batch_size=8,
    # 2. Gradient checkpointing is disabled because it caused an error.
    gradient_checkpointing=False,
    # 3. Gradient accumulation keeps the effective batch size at 16 (8 * 2).
    gradient_accumulation_steps=2,
    fp16=True,
    learning_rate=1e-5,
    warmup_steps=500,
    # 4. Reduced from 4000 to 2000 steps to avoid overfitting and shorten training time.
    max_steps=2000,
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    report_to=["tensorboard"],
    # Lower WER is better, so the checkpoint with the lowest WER is reloaded at the end.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

# 9. Initializing Trainer
# Newer transformers releases take the processor object under `processing_class`
# and deprecate the older `tokenizer` keyword. transformers is installed
# unpinned, so pick whichever keyword the installed version accepts.
import inspect

_processing_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Seq2SeqTrainer.__init__).parameters
    else "tokenizer"
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice_datasets["train"],
    eval_dataset=common_voice_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_processing_kwarg: processor.feature_extractor},
)

# 10. Starting Training (runs for the max_steps configured in training_args)
print("\nStarting fine-tuning on Colab GPU...")
trainer.train()
print("Fine-tuning completed.")

# 11. Saving the final model to Google Drive, next to the checkpoint folder
trainer.save_model(gdrive_path + "whisper-small-hi-custom-final")
print("Model saved to Google Drive.")

In [None]:
# After training, save the model and the processor together in one folder.
# NOTE: this rebinds gdrive_path to a different Drive folder (Whisper_Finetune_NEW)
# than the one used by the training cell.
gdrive_path = "/content/drive/MyDrive/Whisper_Finetune_NEW/"
trainer.save_model(gdrive_path + "whisper-small-hi-custom-final-new")
processor.save_pretrained(gdrive_path + "whisper-small-hi-custom-final-new")
print("Model and processor saved to Google Drive.")


In [None]:
# Trainer.push_to_hub() takes the commit message as its first positional
# argument, not a repository id; the target repository is read from
# args.hub_model_id, so set that before pushing.
trainer.args.hub_model_id = "Pranav13/whisper-small-hi-custom-final-new"
trainer.push_to_hub()
processor.push_to_hub("Pranav13/whisper-small-hi-custom-final-new")

In [None]:
# Push the model weights and the processor directly to the given Hub repository
model.push_to_hub("Pranav13/whisper-small-hi-custom-final-new")
processor.push_to_hub("Pranav13/whisper-small-hi-custom-final-new")