In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git datasets torchaudio accelerate librosa jiwer

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m91.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:

In [None]:
from datasets import Dataset, Audio, DatasetDict
from transformers import WhisperProcessor
import pandas as pd
import os
import torch

from google.colab import drive
# Make Google Drive available; both the transcript and the output folder live there.
drive.mount("/content/drive")

# Drive locations: the input transcript table and the destination of the processed dataset.
csv_path = "/content/drive/MyDrive/converted-dataset/transcript.csv"
save_path = "/content/drive/MyDrive/whisper_mapped_dataset"

# Read the transcript table, rename its two relevant columns to the names used
# further down, and keep only those columns.
df = (
    pd.read_csv(csv_path)
    .rename(columns={"Arabic": "transcription", "Location": "audio"})
    .loc[:, ["audio", "transcription"]]
)

# Wrap the frame as a Hugging Face dataset and type the "audio" column as 16 kHz audio
# (the original author notes the recordings are already at that rate).
dataset = Dataset.from_pandas(df).cast_column("audio", Audio(sampling_rate=16000))

# Pretrained Whisper processor; it is called on audio and exposes `.tokenizer` below.
processor = WhisperProcessor.from_pretrained("openai/whisper-base")

# Mapping function
def prepare_whisper_format(example):
    """Turn one dataset row into Whisper training inputs.

    Relies on the notebook-level ``processor``.

    Args:
        example: Row mapping with an "audio" entry (providing "array" and
            "sampling_rate") and a "transcription" string.

    Returns:
        The same ``example`` mapping with two keys added:
        "input_features" (first item of the processor's feature batch) and
        "labels" (token ids of the transcription, truncated to 448 tokens).
    """
    audio = example["audio"]

    # Hand the processor the rate that came with the decoded clip instead of a
    # literal 16000, so the call never asserts a rate the audio does not have.
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"])

    # Tokenize the reference text; truncation keeps label length at 448 ids or fewer.
    labels = processor.tokenizer(
        example["transcription"], truncation=True, max_length=448
    ).input_ids

    example["input_features"] = inputs.input_features[0]
    example["labels"] = labels
    return example

# Run the preparation function over every row (no batching is configured) and
# drop the raw columns so only the derived fields remain.
mapped_dataset = dataset.map(
    prepare_whisper_format,
    remove_columns=["audio", "transcription"],
)

# Write the processed dataset to Drive, creating the folder if it is missing.
os.makedirs(save_path, exist_ok=True)
mapped_dataset.save_to_disk(save_path)
print(f"✅ Dataset mapped and saved to {save_path}")


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Map:   0%|          | 0/608 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/608 [00:00<?, ? examples/s]

✅ Dataset mapped and saved to /content/drive/MyDrive/whisper_mapped_dataset


In [None]:
from datasets import load_from_disk
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader
from google.colab import drive
import torch

# 1. Make Google Drive reachable from this runtime
drive.mount("/content/drive")

# 2. Reload the preprocessed examples (each row carries input_features and labels)
dataset = load_from_disk("/content/drive/MyDrive/whisper_mapped_dataset")

# 3. Pull the pretrained checkpoint: model weights, then the matching processor
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
processor = WhisperProcessor.from_pretrained("openai/whisper-base")

# 4. Custom Trainer that works with input_features
class WhisperCustomTrainer(Trainer):
    """Trainer subclass whose loss is the one reported by the model's forward pass."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on a batch and return its loss.

        Args:
            model: Model to call; invoked with the batch fields as keyword arguments.
            inputs: Batch mapping; its "labels" entry is popped (the mapping is mutated).
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted and ignored.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        target_ids = inputs.pop("labels")
        result = model(**inputs, labels=target_ids)
        batch_loss = result.loss
        if return_outputs:
            return (batch_loss, result)
        return batch_loss


# 5. Batch collator — stacks the precomputed features and pads the label sequences
def dummy_data_collator(features, decoder_start_token_id=None):
    """Assemble one training batch from preprocessed examples.

    Args:
        features: Sequence of mappings, each with "input_features" (nested
            numbers of identical shape across examples) and "labels" (token ids).
        decoder_start_token_id: Token id the model prepends on its own when it
            derives decoder inputs from ``labels``. When omitted, it is read
            from the notebook-level ``model.config.decoder_start_token_id``.

    Returns:
        Dict with "input_features" stacked along a new leading dimension and
        "labels" right-padded with -100. If every label sequence begins with
        ``decoder_start_token_id``, that leading token is removed.
    """
    if decoder_start_token_id is None:
        decoder_start_token_id = model.config.decoder_start_token_id

    input_features = torch.stack(
        [torch.tensor(f["input_features"]) for f in features]
    )
    labels = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(f["labels"]) for f in features],
        batch_first=True,
        padding_value=-100,  # -100 positions are ignored by the loss
    )

    # Labels produced by the Whisper tokenizer already start with the
    # start-of-transcript token, and WhisperForConditionalGeneration prepends
    # decoder_start_token_id again when it shifts labels into decoder inputs.
    # Drop the duplicate so the model is not trained on a doubled start token.
    if labels.size(1) > 0 and bool((labels[:, 0] == decoder_start_token_id).all()):
        labels = labels[:, 1:]

    return {"input_features": input_features, "labels": labels}


# 6. Training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/whisper-training-output",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    fp16=True,
    logging_steps=10,
    save_steps=500,
    report_to="none"
)

# 7. Initialize Trainer (notice: no tokenizer!)
trainer = WhisperCustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=dummy_data_collator
)

# 8. Train!
train_result = trainer.train()

# 9. Export the result. The inference cell loads from the "whisper-base-finetuned"
#    subfolder of the output directory, and nothing else in this notebook writes the
#    model there, so save the weights and the processor files explicitly.
finetuned_model_path = f"{training_args.output_dir}/whisper-base-finetuned"
trainer.save_model(finetuned_model_path)
processor.save_pretrained(finetuned_model_path)

# Keep the training summary as the cell's displayed output.
train_result


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,3.3386
20,2.0896
30,1.5359
40,1.3141
50,1.2947
60,1.1518
70,1.1369
80,1.0939
90,1.086
100,1.034




TrainOutput(global_step=456, training_loss=0.9352264184700815, metrics={'train_runtime': 378.6035, 'train_samples_per_second': 4.818, 'train_steps_per_second': 1.204, 'total_flos': 1.1830463299584e+17, 'train_loss': 0.9352264184700815, 'epoch': 3.0})

In [None]:
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, GenerationConfig
from google.colab import files, drive

# Mount Drive (the fine-tuned model is loaded from there) and upload the clip to transcribe
drive.mount("/content/drive")

uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Load and resample if necessary
waveform, sr = torchaudio.load(filename)
if sr != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
    waveform = resampler(waveform)

# torchaudio.load yields a (channels, samples) tensor. Average over the channel axis so
# a stereo upload becomes one 1-D signal instead of being passed on as a 2-D array;
# for a mono file this gives the same samples as squeezing the channel axis.
mono_waveform = waveform.mean(dim=0)

# Load model and processor
model_path = "/content/drive/MyDrive/whisper-training-output/whisper-base-finetuned/"
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path)

# 🔥 REMOVE forced_decoder_ids from generation config
model.generation_config.forced_decoder_ids = None
model.generation_config.suppress_tokens = []

# Extract input features from the mono signal (handed over as a NumPy array)
inputs = processor(mono_waveform.numpy(), sampling_rate=16000, return_tensors="pt")

# Generate transcription without tracking gradients
with torch.no_grad():
    predicted_ids = model.generate(inputs.input_features)

# Decode the first (only) sequence, dropping special tokens
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print("\n📄 Transcription:\n", transcription)


Mounted at /content/drive


Saving temp_audio.wav to temp_audio.wav


`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> to see related `.generate()` flags.
A custom logits processor of type <class 'transformers.generation.logits_process.Supp


📄 Transcription:
 َرْزُبِ ٱللَّهِ مِنَ ٱلشَّيْطَـٰنَ رَجِيمِ ١٠٤ بِسْمِ ٱللَّهُ رَحْمَـٰنَ رَحِيمِ ١٠٥ ٱلْحَمْدُ لِلَّهِ رَبِّ ٱللَّهِ رَبِّ ٱللَّهَ لَمِينَ ١٠٦ ٱلرَّحْمَـٰنَ رَحِيمِ ١٠٧ مَا لِيَوْمِ ٱلدِّينَ ١٠٨ إِيَّا كَنَا بُدُوَءِىَّ كَرَسْتَنِينَ ١٠٩ وَمَا لِيَوْمَتَهُمُ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱلْقَوْمَةِ ٱل


In [None]:
from transformers import WhisperProcessor

# Destination: the folder of the fine-tuned model on Drive
finetuned_model_path = "/content/drive/MyDrive/whisper-training-output/whisper-base-finetuned"

# Take the processor of the base checkpoint and write its processor/tokenizer
# files into that folder so the folder can be loaded on its own.
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
processor.save_pretrained(finetuned_model_path)

print("✅ Processor files successfully saved to your fine-tuned model folder!")


✅ Processor files successfully saved to your fine-tuned model folder!
