In [5]:
!pip install pandas datasets soundfile

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [1]:
import os
import pandas as pd
from datasets import Dataset, Audio

# Paths to your data
dataset_path = "dataset"
train_audio_path = os.path.join(dataset_path, "train_audio")
test_audio_path = os.path.join(dataset_path, "test_audio")
train_csv_path = os.path.join(dataset_path, "train.csv")
test_csv_path = os.path.join(dataset_path, "test.csv")


def _load_split(csv_path, audio_dir):
    """Read one split's CSV and prefix its 'audio' column with `audio_dir`.

    Args:
        csv_path: Path of the CSV file describing the split. It must contain
            an 'audio' column.
        audio_dir: Directory that each 'audio' entry is joined onto.

    Returns:
        A DataFrame whose 'audio' column holds `os.path.join(audio_dir, entry)`
        for every original entry; all other columns are left as read.

    Raises:
        FileNotFoundError: If any resulting audio path is not an existing file.
    """
    split_df = pd.read_csv(csv_path)
    split_df['audio'] = split_df['audio'].apply(lambda name: os.path.join(audio_dir, name))

    # Fail fast with a readable message instead of surfacing a broken path
    # later, when the audio files are actually opened.
    missing = [path for path in split_df['audio'] if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError(
            f"{len(missing)} audio file(s) listed in {csv_path} were not found, "
            f"e.g. {missing[0]}"
        )
    return split_df


# Load CSV files and add full paths to the audio files in the dataframes
train_df = _load_split(train_csv_path, train_audio_path)
test_df = _load_split(test_csv_path, test_audio_path)

# Combine datasets (optional: skip if uploading splits separately)
full_df = pd.concat([train_df, test_df], ignore_index=True)

# Verify the dataset structure (bare last expression -> rich notebook display)
full_df.head()


  from .autonotebook import tqdm as notebook_tqdm


                                   audio  \
0  dataset\train_audio\sample-005811.mp3   
1  dataset\train_audio\sample-009916.mp3   
2  dataset\train_audio\sample-011187.mp3   
3  dataset\train_audio\sample-013150.mp3   
4  dataset\train_audio\sample-013557.mp3   

                                                text  start   end  
0       in alchemy it's called the soul of the world    0.0  2.00  
1  from the railway station in the distance came ...    0.0  6.24  
2  it was starlight and i explained the signs of ...    0.0  6.00  
3     it's what you have always wanted to accomplish    0.0  3.00  
4  i just don't trust anyone who says they've nev...    0.0  7.00  


In [4]:
from huggingface_hub import HfApi
from datasets import DatasetDict

def _as_audio_dataset(frame):
    """Turn a dataframe into a Dataset whose "audio" column is cast to Audio at 16000 Hz."""
    tabular = Dataset.from_pandas(frame)
    return tabular.cast_column("audio", Audio(sampling_rate=16000))


# One Dataset per split, built from the dataframes prepared earlier
train_dataset = _as_audio_dataset(train_df)
test_dataset = _as_audio_dataset(test_df)

# Bundle the splits under their split names
dataset_dict = DatasetDict()
dataset_dict["train"] = train_dataset
dataset_dict["test"] = test_dataset

# Upload both splits to the Hugging Face Hub; the call's return value is the cell output
dataset_dict.push_to_hub("Tarakeshwaran/Whisper-Train-data")

Map: 100%|██████████| 80/80 [00:00<00:00, 4111.46 examples/s]?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 70.38ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.74s/it]
Map: 100%|██████████| 20/20 [00:00<00:00, 2276.36 examples/s]?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 497.49ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.29s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Tarakeshwaran/Whisper-train-data/commit/72eff7f255fc8c8a2e4c1e95606ff8fc5d843b93', commit_message='Upload dataset', commit_description='', oid='72eff7f255fc8c8a2e4c1e95606ff8fc5d843b93', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Tarakeshwaran/Whisper-train-data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Tarakeshwaran/Whisper-train-data'), pr_revision=None, pr_num=None)

In [7]:
# Sanity check: display the first example of the "train" split
train_split = dataset_dict["train"]
train_split[0]

{'audio': {'path': 'dataset\\train_audio\\sample-005811.mp3',
  'array': array([-3.10192730e-25,  8.27180613e-25, -4.65289095e-24, ...,
         -1.07697597e-05,  2.15554501e-05, -2.88033916e-05]),
  'sampling_rate': 16000},
 'text': "in alchemy it's called the soul of the world",
 'start': 0.0,
 'end': 2.0}