In [1]:
!pip install accelerate==0.26.1 --upgrade --no-cache-dir

Collecting accelerate==0.26.1
  Downloading accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.10.0->accelerate==0.26.1)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.10.0->accelerate==0.26.1)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.10.0->accelerate==0.26.1)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.10.0->accelerate==0.26.1)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.10.0->accelerate==0.26.1)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

In [2]:
# Completely delete broken transformers packages and peft
!rm -rf /usr/local/lib/python3.11/dist-packages/transformers
!rm -rf /usr/local/lib/python3.11/dist-packages/transformers-*
!pip uninstall -y transformers
!pip uninstall -y peft

# Reinstall transformers and its dependencies
!pip install transformers==4.37.2 --no-cache-dir
!pip install datasets librosa jiwer soundfile wandb pyarrow git-lfs --quiet

# Explicitly install a compatible numpy version before jax
!pip install numpy>=1.21.0

!pip install jax jaxlib # Ensure jax is installed, which transformers might depend on

[0mFound existing installation: peft 0.15.2
Uninstalling peft-0.15.2:
  Successfully uninstalled peft-0.15.2
Collecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.2)
  Downloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m107.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m322.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
  

In [3]:
# Set up and clone the dataset
!git lfs install
!git clone https://huggingface.co/datasets/ylacombe/english_dialects


Git LFS initialized.
Cloning into 'english_dialects'...
remote: Enumerating objects: 157, done.[K
remote: Total 157 (delta 0), reused 0 (delta 0), pack-reused 157 (from 1)[K
Receiving objects: 100% (157/157), 24.24 KiB | 12.12 MiB/s, done.
Resolving deltas: 100% (47/47), done.
Filtering content: 100% (28/28), 8.36 GiB | 160.32 MiB/s, done.


In [4]:
import pyarrow.parquet as pq
from datasets import Dataset, Audio

# Load the exact Parquet files
irish_table = pq.read_table("/content/english_dialects/irish_male/train-00000-of-00001-876ed4aebc6599d3.parquet")
scottish_table = pq.read_table([
    "/content/english_dialects/scottish_male/train-00000-of-00002-c0ace91149bc30ae.parquet",
    "/content/english_dialects/scottish_male/train-00001-of-00002-58d01ae306d0a012.parquet"
])

# Convert to Hugging Face Datasets
irish = Dataset.from_dict(irish_table.to_pydict()).cast_column("audio", Audio(sampling_rate=16000))
scottish = Dataset.from_dict(scottish_table.to_pydict()).cast_column("audio", Audio(sampling_rate=16000))


In [5]:
import librosa
import soundfile as sf
import tempfile
from datasets import Dataset

def prepare_dataset_for_whisper(irish, scottish, max_samples=100):
    data = {"audio": [], "transcription": []}

    for sample in irish.select(range(min(max_samples, len(irish)))):
        audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp:
            sf.write(temp.name, audio, 16000)
            data["audio"].append(temp.name)
        data["transcription"].append(sample["text"])

    for sample in scottish.select(range(min(max_samples, len(scottish)))):
        audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp:
            sf.write(temp.name, audio, 16000)
            data["audio"].append(temp.name)
        data["transcription"].append(sample["text"])

    dataset = Dataset.from_dict(data)
    return dataset.train_test_split(test_size=0.2, seed=42)


In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load Whisper model and processor
model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer  # Optional, useful for tokenizing text labels
model = WhisperForConditionalGeneration.from_pretrained(model_name)

print("✅ Whisper model and processor loaded successfully.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

✅ Whisper model and processor loaded successfully.


In [7]:
def prepare_dataset(dataset):
    def preprocess_function(examples):
        audio, sr = librosa.load(examples["audio"], sr=16000)
        inputs = processor.feature_extractor(audio, sampling_rate=sr, return_tensors="pt")
        labels = tokenizer(
            examples["transcription"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=448,
        ).input_ids
        return {
            "input_features": inputs.input_features[0],
            "labels": labels[0],
        }

    return dataset.map(preprocess_function, remove_columns=["audio", "transcription"])


In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.nn.utils.rnn import pad_sequence
import torch
import wandb

def fine_tune_whisper(dataset):
    # Initialize W&B explicitly with project and run name
    wandb.init(project="whisper-finetuning", name="whisper-finetuning-run", reinit=True)

    training_args = Seq2SeqTrainingArguments(
        output_dir="./whisper-finetuned",
        run_name="whisper-finetuning-run",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=1e-5,
        warmup_steps=50,
        max_steps=500,
        fp16=False,  # disable FP16 for CPU / non-CUDA
        evaluation_strategy="steps",
        eval_steps=50,
        logging_dir="./logs",
        logging_steps=25,
        save_steps=100,
        save_total_limit=2,
        push_to_hub=False,
        report_to="wandb",  # enables logging to W&B
        load_best_model_at_end=True,
    )

    def data_collator(batch):
        input_features = [torch.tensor(ex["input_features"]) for ex in batch]
        labels = [ex["labels"] for ex in batch]
        max_len = max(f.shape[1] for f in input_features)
        padded_input_features = [torch.nn.functional.pad(f, (0, max_len - f.shape[1])) for f in input_features]
        padded_labels = pad_sequence([torch.tensor(l) for l in labels], batch_first=True, padding_value=-100)
        return {"input_features": torch.stack(padded_input_features), "labels": padded_labels}

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=processor.feature_extractor,
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model("./whisper-finetuned")
    processor.save_pretrained("./whisper-finetuned")
    wandb.finish()


In [9]:
dataset = prepare_dataset_for_whisper(irish, scottish, max_samples=100)
dataset = prepare_dataset(dataset)

print("Training...")
fine_tune_whisper(dataset)
print("Training complete!")


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Training...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabdelrhmanabdelaziz2003[0m ([33mabdelrhmanabdelaziz2003-eslsca-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
50,0.06,0.053183
100,0.0298,0.042176
150,0.0109,0.017491
200,0.0005,0.01685
250,0.0001,0.016878
300,0.0001,0.016877
350,0.0001,0.016897
400,0.0001,0.016908
450,0.0001,0.016921
500,0.0001,0.016927


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618

0,1
eval/loss,█▆▁▁▁▁▁▁▁▁
eval/runtime,█▄▄▆▃▅▄▅▁▅
eval/samples_per_second,▁▅▅▃▆▄▅▄█▄
eval/steps_per_second,▁▅▅▂▆▄▅▄█▄
train/epoch,▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇████
train/learning_rate,▅██▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▁▁
train/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.01693
eval/runtime,6.6683
eval/samples_per_second,5.999
eval/steps_per_second,0.75
train/epoch,25.0
train/global_step,500.0
train/learning_rate,0.0
train/loss,0.0001
train/total_flos,1.15434160128e+18
train/train_loss,0.03672


Training complete!


In [10]:
from jiwer import wer

def evaluate_wer(test_data):
    model = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned")
    processor = WhisperProcessor.from_pretrained("./whisper-finetuned")
    model.eval().to("cuda" if torch.cuda.is_available() else "cpu")

    references, predictions = [], []
    for sample in test_data["test"]:
        audio_path = sample["audio"]
        true = sample["transcription"]
        try:
            audio, sr = librosa.load(audio_path, sr=16000)
            inputs = processor(audio, sampling_rate=sr, return_tensors="pt").to(model.device)
            with torch.no_grad():
                pred_ids = model.generate(input_features=inputs.input_features)
            pred = processor.decode(pred_ids[0], skip_special_tokens=True)
            if pred.strip() and true.strip():
                references.append(true.strip())
                predictions.append(pred.strip())
        except Exception as e:
            print("Skipped:", e)
            continue

    # Compute WER
    score = wer(references, predictions)

    # Compute val_acc (exact match accuracy)
    correct = sum(1 for r, p in zip(references, predictions) if r.strip().lower() == p.strip().lower())
    val_acc = correct / len(references) if references else 0.0

    return score, val_acc, references, predictions


In [11]:
test_data = prepare_dataset_for_whisper(irish, scottish, max_samples=20)
wer_score, val_acc, refs, preds = evaluate_wer(test_data)

print(f"WER: {wer_score * 100:.2f}%")
print(f"Validation Accuracy: {val_acc * 100:.2f}%")

for r, p in zip(refs, preds):
    print(f"GT: {r}\nPR: {p}\n")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


WER: 0.77%
Validation Accuracy: 100.00%
GT: All Scotch must be aged in oak barrels for at least three years
PR: All Scotch must be aged in oak barrels for at least three years

GT: You can view and manage your entire Google account history from my activity
PR: You can view and manage your entire google account history from my activity

GT: A brochure is an informative paper document that can be folded into a template pamphlet or leaflet
PR: A brochure is an informative paper document that can be folded into a template pamphlet or leaflet

GT: Well it's no fun having a pet that moves at the speed of light anyway
PR: Well it's no fun having a pet that moves at the speed of light anyway

GT: The powers of the appointed chair are limited so that the chair can't adjourn a meeting at any point without the majority vote
PR: The powers of the appointed chair are limited so that the chair can't adjourn a meeting at any point without the majority vote

GT: You have messages from Jessica
PR: You 

In [12]:
!pip install evaluate --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import evaluate

# Create dataset from your texts
texts = [sample['text'] for sample in irish] + [sample['text'] for sample in scottish]
labels = [0]*len(irish) + [1]*len(scottish)  # 0 = Irish, 1 = Scottish
dataset = Dataset.from_dict({'text': texts, 'label': labels}).train_test_split(test_size=0.2)

# Load tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)
dataset = dataset.map(tokenize, batched=True)

# Evaluation metric
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

# Training config
args = TrainingArguments(
    output_dir="dialect_classifier",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1679 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.516021,0.797619
2,No log,0.509923,0.797619
3,No log,0.514117,0.797619


TrainOutput(global_step=315, training_loss=0.5222009083581349, metrics={'train_runtime': 21.0394, 'train_samples_per_second': 239.408, 'train_steps_per_second': 14.972, 'total_flos': 46915192056816.0, 'train_loss': 0.5222009083581349, 'epoch': 3.0})

In [14]:
trainer.save_model("dialect_classifier")
tokenizer.save_pretrained("dialect_classifier")


('dialect_classifier/tokenizer_config.json',
 'dialect_classifier/special_tokens_map.json',
 'dialect_classifier/vocab.txt',
 'dialect_classifier/added_tokens.json',
 'dialect_classifier/tokenizer.json')

In [15]:
tokenizer = AutoTokenizer.from_pretrained("dialect_classifier")
model_classifier = AutoModelForSequenceClassification.from_pretrained("dialect_classifier")


In [16]:
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load your fine-tuned Whisper processor and model only once
processor = WhisperProcessor.from_pretrained("./whisper-finetuned")
model_whisper = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned")

# Load your trained text classifier
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("dialect_classifier")
model_classifier = AutoModelForSequenceClassification.from_pretrained("dialect_classifier")

# Predict transcription + dialect
def predict_dialect_from_audio(audio_path):
    # Transcribe using your fine-tuned Whisper model
    audio, sr = librosa.load(audio_path, sr=16000)
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    pred_ids = model_whisper.generate(inputs.input_features)
    transcription = processor.decode(pred_ids[0], skip_special_tokens=True)

    # Predict dialect using your trained classifier
    encoded = tokenizer(transcription, return_tensors="pt", truncation=True, padding=True)
    output = model_classifier(**encoded)
    pred = output.logits.argmax(dim=1).item()
    label = "Irish" if pred == 0 else "Scottish"

    return transcription, label


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
from google.colab import files

uploaded = files.upload()
filename = list(uploaded.keys())[0]

transcription, dialect = predict_dialect_from_audio(filename)

print("🗣️ Transcription:", transcription)
print("🌍 Predicted Dialect:", dialect)


Saving Scottish .wav to Scottish .wav
🗣️ Transcription: As fresh snow blanketed the grounds of the kingdom the white knight gazed out upon the sprawling valley side to himself and said I must be the loneliest knight in all the land All of a sudden the white knight spotted a strange creature wandering up the snowy path towards him As the distance between the knight and the creature shrank he saw that it was a cow uh who goes there the white knight stammered to his surprise a gentle voice responded it is I Maria a calf who has found herself far from home
🌍 Predicted Dialect: Scottish
