In [None]:
import pandas as pd
# Read the Hindi-only dataset CSV into a DataFrame.
# NOTE(review): this absolute path sits under a different home directory than the
# paths used by the other cells, and `df_audio` is not referenced again in the
# visible cells — confirm whether this preview cell is still needed.
csv_file_path = '/home/priyank/Dataset_Folder/csv/dataset_hindi_only_updated.csv'
df_audio = pd.read_csv(csv_file_path)

# Preview the first rows; the bare expression is rendered as the cell output
df_audio.head()

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, Audio
import numpy as np

def create_dataset_from_csv(
    csv_path,
    train_test_split=0.99,
    seed=42,
    audio_column='local_audio_path',
    text_column='transcripts-flash-2.0-001',
):
    """
    Create a DatasetDict from a CSV file, keeping only audio and sentence columns.

    Rows whose audio path or transcript is missing are dropped before the dataset
    is built, because such rows can be neither decoded nor tokenised later on.

    Args:
        csv_path (str): Path to the CSV file
        train_test_split (float): Proportion of data to use for training
        seed (int): Random seed for reproducibility
        audio_column (str): Name of the CSV column holding the audio file paths
        text_column (str): Name of the CSV column holding the transcripts

    Returns:
        DatasetDict: 'train' and 'test' splits containing only the 'audio' and
        'sentence' features

    Raises:
        KeyError: If the CSV lacks ``audio_column`` or ``text_column``
        ValueError: If no row has both an audio path and a transcript
    """
    # Read the CSV file
    df = pd.read_csv(csv_path)

    # Fail early with a message that names the file and the missing columns
    missing = [col for col in (audio_column, text_column) if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing required column(s): {missing}")

    # Empty CSV cells are read as NaN; drop those rows instead of failing downstream
    df = df.dropna(subset=[audio_column, text_column])
    if df.empty:
        raise ValueError(f"{csv_path} has no rows with both an audio path and a transcript")

    # Keep only the two required columns: each path is wrapped as {'path': ...}
    # and the transcript column is exposed as 'sentence'
    df = pd.DataFrame({
        'audio': [{'path': path} for path in df[audio_column]],
        'sentence': df[text_column].tolist(),
    })

    # preserve_index=False keeps the pandas index out of the dataset columns
    dataset = Dataset.from_pandas(df, preserve_index=False)

    # Cast the audio column to Audio feature
    dataset = dataset.cast_column('audio', Audio())

    # Split the dataset
    dataset = dataset.train_test_split(
        train_size=train_test_split,
        seed=seed
    )

    # Create DatasetDict
    dataset_dict = DatasetDict({
        'train': dataset['train'],
        'test': dataset['test']
    })

    return dataset_dict

# Build the train/test DatasetDict from the training CSV
csv_path = "/home/ruchirverma/Dataset_Folder/csv/whisper_train_dataset.csv"
dataset_dict = create_dataset_from_csv(csv_path)

# Overall structure of the DatasetDict
print("Dataset structure:", dataset_dict, sep="\n")

# Schema and size of the training split
print("\nTrain split features:", dataset_dict['train'].features, sep="\n")
print(f"Number of training examples: {len(dataset_dict['train'])}")

# Schema and size of the test split
print("\nTest split features:", dataset_dict['test'].features, sep="\n")
print(f"Number of test examples: {len(dataset_dict['test'])}")

# Fetch one training example and show the fields of its audio entry
print("\nSample entry from training set:")
sample = dataset_dict['train'][0]
sample_audio = sample['audio']
print("Audio info:")
print(f"Path: {sample_audio['path']}")
print(f"Sampling rate: {sample_audio['sampling_rate']}")
print(f"Array shape: {sample_audio['array'].shape}")
print("\nSentence:", sample['sentence'], sep="\n")

In [None]:
# Show the first training example; the bare expression is rendered as the cell output
dataset_dict['train'][0]

In [None]:
from transformers import WhisperFeatureExtractor

# Load the feature extractor configuration stored in the local checkpoint-6000 directory
feature_extractor = WhisperFeatureExtractor.from_pretrained("/home/ruchirverma/whisper_tests/whisper-collabora-finetuned/checkpoint-6000")

In [None]:
from transformers import WhisperTokenizer

# Load the tokenizer from the same local checkpoint, configured for Hindi transcription
tokenizer = WhisperTokenizer.from_pretrained("/home/ruchirverma/whisper_tests/whisper-collabora-finetuned/checkpoint-6000", language="Hindi", task="transcribe")

In [None]:
def filter_long_sequences(batch, max_label_length=448):
    """
    Decide whether one example's transcript is short enough to keep.

    Intended as the predicate for ``Dataset.filter``: it receives a single
    example and returns a bool.

    Args:
        batch (dict): One example; its "sentence" entry is the transcript
        max_label_length (int): Maximum number of token ids allowed. The default
            of 448 is the limit this notebook originally hard-coded — presumably
            the Whisper decoder's maximum target length; confirm against the
            model config.

    Returns:
        bool: True if the tokenised transcript has at most ``max_label_length``
        ids. False if it is longer, or if the transcript is not a string
        (e.g. None/NaN coming from an empty CSV cell).
    """
    sentence = batch["sentence"]

    # A missing transcript cannot be tokenised; filter the row out instead of crashing
    if not isinstance(sentence, str):
        return False

    # Encode the text to get token length
    token_ids = tokenizer(sentence).input_ids
    return len(token_ids) <= max_label_length

# Drop over-long examples from both splits using the same predicate
for split_name in ("train", "test"):
    dataset_dict[split_name] = dataset_dict[split_name].filter(filter_long_sequences)

In [None]:
from transformers import WhisperProcessor

# Load the processor from the local checkpoint, configured for Hindi transcription.
# Its .feature_extractor and .tokenizer attributes are what the data collator uses.
processor = WhisperProcessor.from_pretrained("/home/ruchirverma/whisper_tests/whisper-collabora-finetuned/checkpoint-6000", language="Hindi", task="transcribe")

In [None]:
# Print the first training example before the audio column is re-cast to 16 kHz
print(dataset_dict["train"][0])

In [None]:
from datasets import Audio

# Re-cast the audio column of every split with a 16000 Hz sampling rate
dataset_dict = dataset_dict.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Print the same example again to inspect it after the 16 kHz cast
print(dataset_dict["train"][0])

In [None]:
def prepare_dataset(batch):
    """
    Turn one raw example into model inputs.

    Reads the "audio" entry (its "array" and "sampling_rate") and the "sentence"
    entry of ``batch``, then stores two new keys on the same dict:

    - "input_features": first element of the feature extractor's
      ``input_features`` for the audio array
    - "labels": token ids of the transcript

    Returns the same ``batch`` object it was given.
    """
    sample = batch["audio"]

    # Feature extraction uses the sampling rate recorded on the example itself
    extracted = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Target text -> label ids
    encoded_text = tokenizer(batch["sentence"])
    batch["labels"] = encoded_text.input_ids

    return batch

In [None]:
# Run prepare_dataset over every example of each split in a single process.
# The columns that exist on the train split beforehand are removed, so only the keys
# added by prepare_dataset ("input_features", "labels") remain.
dataset_dict = dataset_dict.map(prepare_dataset, remove_columns=dataset_dict.column_names["train"], num_proc=1)

In [None]:
from transformers import WhisperForConditionalGeneration

# Load the model weights to continue training from the local checkpoint-6000 directory
model = WhisperForConditionalGeneration.from_pretrained("/home/ruchirverma/whisper_tests/whisper-collabora-finetuned/checkpoint-6000")

In [None]:
# Check which columns the training split has after preprocessing
print(dataset_dict["train"].column_names)


In [None]:
# Pin the language and task used when the model generates text
model.generation_config.language = "hindi"
model.generation_config.task = "transcribe"

# Clear forced_decoder_ids — presumably so generation relies on the language/task
# settings above rather than on ids stored with the checkpoint; confirm against the
# installed transformers version.
model.generation_config.forced_decoder_ids = None

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Collate preprocessed examples into one padded batch.

    Audio features and label ids are padded separately (through the processor's
    feature extractor and tokenizer respectively), padded label positions are set
    to -100, and a leading decoder-start token is removed from the labels when
    every row begins with it.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`
    processor: Any
    # Token id compared against the first label column
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Inputs and labels differ in length and padding method, so gather them separately
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            label_items.append({"input_ids": example["labels"]})

        # Audio side: the feature extractor returns the batch as torch tensors
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad to the longest label sequence in the batch
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding; -100 marks them
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the first column only when every row starts with the decoder start token
        starts_with_bos = labels[:, 0] == self.decoder_start_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch





In [None]:
# Build the collator with the loaded processor and the model's decoder start token id
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [None]:
import evaluate

# Load the "wer" metric through the evaluate library; used by compute_metrics
metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    """
    Compute the word error rate (as a percentage) for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids in which -100 marks ignored
            positions).

    Returns:
        dict: ``{"wer": <100 * metric value>}``
    """
    pred_ids = pred.predictions

    # Work on a copy: the array belongs to the caller, and overwriting -100 in place
    # would leak the pad-token substitution into whatever reads pred.label_ids later.
    label_ids = pred.label_ids.copy()

    # Replace -100 with pad_token_id so the labels can be decoded
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Decode both predictions and labels
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Calculate Word Error Rate (WER), scaled to a percentage
    wer_percent = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer_percent}


In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration: step-based schedule (max_steps) with periodic evaluation,
# checkpointing and TensorBoard logging; evaluation uses generation so WER can be scored.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-collabora_tiny_234hrs",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=16000,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    # NOTE(review): with load_best_model_at_end, transformers presumably expects
    # save_steps to be a round multiple of eval_steps (2000 vs 1000 here) — confirm
    # before changing either value.
    save_steps=2000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    # Lower WER is better, hence greater_is_better=False
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, preprocessed splits, collator and metric function into the trainer.
# NOTE(review): newer transformers releases reportedly rename the `tokenizer` argument
# to `processing_class` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.tokenizer,
)

In [None]:
# Write the processor files into the training output directory before training starts
processor.save_pretrained(training_args.output_dir)

In [None]:
# Start fine-tuning with the configuration held by `trainer`
trainer.train()

In [None]:
# Descriptive metadata about the dataset, language and model.
# NOTE(review): nothing in the visible cells consumes `kwargs` — presumably it was
# meant to be passed to a Hub upload call; confirm.
# NOTE(review): "model_name" and "finetuned_from" refer to whisper-large-v3-turbo,
# while the training cells load a local whisper-collabora-finetuned checkpoint and
# write to ./whisper-collabora_tiny_234hrs — verify the metadata matches the run.
kwargs = {
    "dataset_tags": "allen",
    "dataset": "allen-tts",  # a 'pretty' name for the training dataset
    "dataset_args": "config: hi, split: test",
    "language": "hi",
    "model_name": "Whisper large v3 Turbo Hi - Ruchir Verma",  # a 'pretty' name for our model
    "finetuned_from": "openai/whisper-large-v3-turbo",
    "tasks": "automatic-speech-recognition",
}

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import torch
from jiwer import wer,cer
import time
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): torch_dtype is computed here but is not passed to from_pretrained
# below or used anywhere else in the visible cells — confirm whether half precision
# was intended.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Load the processor of the base Whisper model (only the processor is loaded from here;
# the model weights come from the checkpoint directory further down)
model_name = "openai/whisper-large-v3-turbo"  # or another base Whisper model
processor = WhisperProcessor.from_pretrained(model_name)

# Write the base processor's files into the checkpoint directory.
# NOTE(review): this writes into an existing checkpoint folder on every run of the cell.
processor.save_pretrained('/home/ruchirverma/test_models/artpark_whisper/whisper-large-turbo-hi/checkpoint-4000')

# Load the fine-tuned weights from that checkpoint directory and move them to `device`
model = WhisperForConditionalGeneration.from_pretrained('/home/ruchirverma/test_models/artpark_whisper/whisper-large-turbo-hi/checkpoint-4000').to(device)

# Perform inference (for example, transcribing an audio file)
def transcribe_audio(audio_path):
    """
    Transcribe a single audio file and time the whole call.

    The file is loaded with librosa at 16000 Hz, converted to input features by
    the module-level ``processor``, moved to ``device`` and decoded with the
    module-level ``model`` under ``torch.no_grad()``.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        tuple: ``(transcription, elapsed_seconds)`` where the elapsed time spans
        loading, feature extraction, generation and decoding.
    """
    import librosa  # imported only when a transcription is actually requested

    started_at = time.time()

    waveform, rate = librosa.load(audio_path, sr=16000)
    encoded = processor(waveform, return_tensors="pt", sampling_rate=rate)
    features = encoded.input_features.to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        token_ids = model.generate(features)

    # Only the first generated sequence is decoded
    text = processor.decode(token_ids[0], skip_special_tokens=True)
    elapsed = time.time() - started_at
    return text, elapsed

# Example usage
audio_path = '/home/anirbanmajumder/ds-prototypes/voice/data/recorded_audio_allen_molecule_inheritance/L12-Mole-Bas-Of-Inher-15092021086.wav'  # Replace with your audio file path
start=time.time()
transcription,fc = transcribe_audio(audio_path)
ground_truth="दोनों के बीच में आपको क्या ध्यान रखना है, इनमें डिफ़रेंसेज़ क्या है? डीएनए लेवल पर, प्रोकैरियोटिक लेवल पर किसने काम किया था? मेज़ेल्सन और स्टाल ने। क्रोमोसोम लेवल पर।"
# Score the transcription against the reference text (reference first, hypothesis second)
werror=wer(ground_truth,transcription)
print(f"WER: {werror}")
print(f"Transcription: {transcription}")
# NOTE(review): this latency runs from `start` until this line, so it also covers the
# WER computation and the prints above; the `fc` value returned by transcribe_audio
# is not reported here.
print(f"Latency: {time.time()-start} seconds")


In [None]:
import re

# Function to remove Hindi punctuation
def remove_hindi_punctuation(input_string):
    """
    Delete a fixed set of punctuation characters from ``input_string``.

    Removed characters: the danda, comma, exclamation mark, question mark,
    semicolon, colon, curly double quotes, apostrophe, parentheses, en dash and
    hyphen. They are deleted outright (not replaced by a space). Characters
    outside this set — for example the ASCII double quote and the full stop —
    are left untouched.

    Returns the cleaned string.
    """
    # Character class of marks to delete; the trailing "-" is a literal hyphen
    punctuation_class = re.compile(r"[।,!?;:”“'()–-]")
    return punctuation_class.sub('', input_string)

# Example usage: clean the reference text defined in the inference cell and show
# the text before and after
input_text = ground_truth
cleaned_text = remove_hindi_punctuation(input_text)
print("Original Text:", input_text)
print("Cleaned Text:", cleaned_text)

In [None]:
import pandas as pd
import torch
import time
import librosa
import numpy as np
import soundfile as sf
from transformers import pipeline
from pydub import AudioSegment
from pydub.silence import split_on_silence
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Function to calculate WER (Word Error Rate)
def calculate_wer(predicted, ground_truth):
    """Return the word error rate of ``predicted`` measured against ``ground_truth``.

    Note the argument order: callers pass the prediction first, while the
    underlying ``wer`` function receives the reference first.
    """
    reference, hypothesis = ground_truth, predicted
    return wer(reference, hypothesis)

# Function to calculate CER (Character Error Rate)
def calculate_cer(predicted, ground_truth):
    """Return the character error rate of ``predicted`` measured against ``ground_truth``.

    Note the argument order: callers pass the prediction first, while the
    underlying ``cer`` function receives the reference first.
    """
    reference, hypothesis = ground_truth, predicted
    return cer(reference, hypothesis)



# Load the Hindi STT test set
df = pd.read_csv('/home/ruchirverma/database_scripts/test_set_hindi_stt.csv')

# Set the number of random samples you want
n_samples = 100  # Change this to your desired number of samples

# Remember the full size before sampling: once `df` is replaced by the sample,
# len(df) would report the sample size instead of the dataset size.
total_rows = len(df)

# Randomly sample n rows from the DataFrame
# (random_state below fixes the seed, so the same rows are picked on every run)
if n_samples >= total_rows:
    # Use all rows if n_samples is larger than the dataset
    print(f"Using all {total_rows} rows as the sample size exceeds the dataset size")
else:
    df = df.sample(n=n_samples, random_state=42)  # random_state for reproducibility
    print(f"Randomly sampled {n_samples} rows out of {total_rows} total rows")

# Display the sampled rows
print("\nSampled rows:")
print(df[['audio_path']].head())


# Per-file metrics collected over the sampled rows
v3_first_chunk_times = []
v3_wer_list = []
v3_cer_list = []

# Transcribe every sampled file and score it against its reference transcript
for index, row in df.iterrows():
    audio_path = row['audio_path']
    ground_truth_transcript = row['transcript']
    print(f"Transcribing {audio_path}")

    # `fc` is the elapsed time measured inside transcribe_audio, so no separate
    # timer is needed in this loop
    complete, fc = transcribe_audio(audio_path)

    # Calculate WER and CER for the current row
    current_wer = calculate_wer(complete, ground_truth_transcript)
    current_cer = calculate_cer(complete, ground_truth_transcript)
    print(f"WER: {current_wer}")
    print(f"CER: {current_cer}")
    print(f"Latency: {fc}")

    # Append the metrics to the respective lists
    v3_first_chunk_times.append(fc)
    v3_wer_list.append(current_wer)
    v3_cer_list.append(current_cer)

# Mean of the per-file latencies returned by transcribe_audio
v3_average_first_chunk_time = np.mean(v3_first_chunk_times)

# NOTE(review): these are unweighted means of the per-file scores, so short and long
# utterances count equally. That is not the same figure as a corpus-level WER/CER
# computed over all references at once — confirm which one is intended.
v3_overall_wer = np.mean(v3_wer_list)
v3_overall_cer = np.mean(v3_cer_list)

# Print the results (error rates shown as percentages)
print(f"Average First Chunk Time: {v3_average_first_chunk_time:.2f} seconds")
print(f"Overall WER: {v3_overall_wer * 100:.2f}%")
print(f"Overall CER: {v3_overall_cer * 100:.2f}%")

In [None]:
# Run the pseudo-labelling script over the train+test splits of the local dataset.
# The leading "!" hands the command to the system shell; without it the cell is
# parsed as Python and fails with a SyntaxError.
!accelerate launch run_pseudo_labelling.py \
  --model_name_or_path "/home/ruchirverma/whisper_tests/whisper-medium-ft_full/checkpoint-8000" \
  --dataset_name "/home/ruchirverma/dstil_whisper/dataset" \
  --dataset_config_name "default" \
  --dataset_split_name "train+test" \
  --text_column_name "sentence" \
  --id_column_name "path" \
  --output_dir "./allen_voice_dataset" \
  --wandb_project "distil-whisper-labelling" \
  --per_device_eval_batch_size 64 \
  --dtype "bfloat16" \
  --attn_implementation "sdpa" \
  --logging_steps 500 \
  --max_label_length 256 \
  --concatenate_audio \
  --preprocessing_batch_size 500 \
  --preprocessing_num_workers 8 \
  --dataloader_num_workers 8 \
  --report_to "wandb" \
  --language "hi" \
  --task "transcribe" \
  --return_timestamps \
  --streaming False \
  --generation_num_beams 1 \
  --push_to_hub

In [None]:
# Initialise the student model from the teacher checkpoint with the encoder/decoder
# layer counts given below, saving it to --save_dir.
# The leading "!" hands the command to the system shell; without it the cell is
# parsed as Python and fails with a SyntaxError.
!python create_student_model.py \
  --teacher_checkpoint "/home/ruchirverma/whisper_tests/whisper-medium-ft_full/checkpoint-8000" \
  --encoder_layers 24 \
  --decoder_layers 12 \
  --save_dir "./distil-large-medium-allen-init"

In [None]:
# Run the distillation script: trains the initialised student model against the
# teacher checkpoint on the pseudo-labelled dataset.
# The leading "!" hands the command to the system shell; without it the cell is
# parsed as Python and fails with a SyntaxError.
!accelerate launch run_distillation.py \
  --model_name_or_path "./distil-large-medium-allen-init" \
  --teacher_model_name_or_path "/home/ruchirverma/whisper_tests/whisper-medium-ft_full/checkpoint-8000" \
  --train_dataset_name "/home/ruchirverma/dstil_whisper/distil-whisper/training/allen_voice_dataset" \
  --train_split_name "train" \
  --text_column_name "sentence" \
  --train_dataset_samples "10000" \
  --eval_dataset_name "/home/ruchirverma/dstil_whisper/distil-whisper/training/allen_voice_dataset" \
  --eval_split_name "test" \
  --eval_text_column_name "sentence" \
  --eval_steps 1000 \
  --save_steps 500 \
  --warmup_steps 50 \
  --learning_rate 0.0001 \
  --lr_scheduler_type "constant_with_warmup" \
  --timestamp_probability 0.2 \
  --condition_on_prev_probability 0.2 \
  --language "hi" \
  --task "transcribe" \
  --logging_steps 25 \
  --save_total_limit 1 \
  --max_steps 20000 \
  --wer_threshold 20 \
  --per_device_train_batch_size 32 \
  --per_device_eval_batch_size 32 \
  --dataloader_num_workers 8 \
  --preprocessing_num_workers 8 \
  --ddp_timeout 7200 \
  --dtype "bfloat16" \
  --attn_implementation "sdpa" \
  --output_dir "./allen-whisper-medium" \
  --do_train \
  --do_eval \
  --gradient_checkpointing \
  --overwrite_output_dir \
  --predict_with_generate \
  --freeze_encoder \
  --freeze_embed_positions \
  --streaming False \
  --push_to_hub


24*16 and 24*12

In [None]:
# Shell escape: report filesystem disk usage in human-readable units
!df -h

In [None]:
# Shell escape: show the current GPU status reported by nvidia-smi
!nvidia-smi

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict, Audio
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing
from functools import partial
import threading
from queue import Queue
# Number of CPUs reported for this machine; the bare expression is the cell output
multiprocessing.cpu_count()

12