In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install datasets



In [3]:
from datasets import load_dataset, Audio
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
import torch

In [4]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
hg_key = user_secrets.get_secret("hg-main")
login(token=hg_key, add_to_git_credential=False)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
dataset_name = "PhanithLIM/asr-wmc-evaluate"
dataset = load_dataset(dataset_name, split="test")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
print(dataset)

README.md:   0%|          | 0.00/599 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/154M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/334 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'text', 'mms', 'whisper-tiny-aug-7-may-lightning-v1', 'whisper-base-aug-20-april-lightning-v1', 'whisper-small-khmer', 'google_api'],
    num_rows: 334
})


In [6]:
model_id = 'PhanithLIM/whisper-medium-aug-05-june'
column_name = model_id.split('/')[1]
model = WhisperForConditionalGeneration.from_pretrained(model_id).to('cuda')
processor = WhisperProcessor.from_pretrained(model_id, language='Khmer', task="transcribe")

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [7]:
def map_to_pred(batch):
    audio = batch["audio"]
    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
    with torch.no_grad():
        predicted_ids = model.generate(input_features.to("cuda"), language='Khmer', max_length=2048,num_beams=5)[0]
        # predicted_ids = model.generate(input_features.to("cuda"), language='Khmer')[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)
    batch[f"{column_name}"] = transcription
    return batch

In [8]:
result = dataset.map(map_to_pred)
print(result)

Map:   0%|          | 0/334 [00:00<?, ? examples/s]

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Dataset({
    features: ['audio', 'text', 'mms', 'whisper-tiny-aug-7-may-lightning-v1', 'whisper-base-aug-20-april-lightning-v1', 'whisper-small-khmer', 'google_api', 'whisper-medium-aug-05-june'],
    num_rows: 334
})


In [9]:
result.push_to_hub(dataset_name, split='test')
print("Push to hub")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/334 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Push to hub
