In [5]:
!pip install pandas datasets soundfile

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [13]:
import os
import pandas as pd
from datasets import Dataset, Audio

# Locations of the raw data, relative to the notebook's working directory
dataset_path = "dataset"
train_audio_path = os.path.join(dataset_path, "train_audio")
test_audio_path = os.path.join(dataset_path, "test_audio")
train_csv_path = os.path.join(dataset_path, "train.csv")
test_csv_path = os.path.join(dataset_path, "test.csv")


def _load_split(csv_path, audio_dir):
    """Read one split's CSV and turn its 'audio' column into paths under audio_dir.

    Args:
        csv_path: Path of the CSV file to read; it must contain an 'audio' column.
        audio_dir: Directory that each 'audio' value is joined onto.

    Returns:
        The loaded DataFrame with 'audio' rewritten to os.path.join(audio_dir, value).

    Raises:
        FileNotFoundError: If any rewritten audio path does not point to an existing file.
    """
    frame = pd.read_csv(csv_path)
    frame['audio'] = frame['audio'].map(lambda name: os.path.join(audio_dir, name))

    # Fail fast with the offending paths instead of erroring later when the audio is read
    missing = [path for path in frame['audio'] if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError(
            f"{len(missing)} audio file(s) listed in {csv_path} were not found, "
            f"e.g. {missing[:5]}"
        )
    return frame


# Load each split with full audio paths
train_df = _load_split(train_csv_path, train_audio_path)
test_df = _load_split(test_csv_path, test_audio_path)

# Combine datasets (optional: skip if uploading splits separately)
full_df = pd.concat([train_df, test_df], ignore_index=True)

# Verify the dataset structure
print(full_df.head())


                                   audio  \
0  dataset\train_audio\sample-005811.mp3   
1  dataset\train_audio\sample-009916.mp3   
2  dataset\train_audio\sample-011187.mp3   
3  dataset\train_audio\sample-013150.mp3   
4  dataset\train_audio\sample-013557.mp3   

                                                text  start   end  
0       in alchemy it's called the soul of the world    0.0  2.00  
1  from the railway station in the distance came ...    0.0  6.24  
2  it was starlight and i explained the signs of ...    0.0  6.00  
3     it's what you have always wanted to accomplish    0.0  3.00  
4  i just don't trust anyone who says they've nev...    0.0  7.00  


In [14]:
# Show the combined train + test frame; the rendered table elides the middle rows
full_df

Unnamed: 0,audio,text,start,end
0,dataset\train_audio\sample-005811.mp3,in alchemy it's called the soul of the world,0.0,2.00
1,dataset\train_audio\sample-009916.mp3,from the railway station in the distance came ...,0.0,6.24
2,dataset\train_audio\sample-011187.mp3,it was starlight and i explained the signs of ...,0.0,6.00
3,dataset\train_audio\sample-013150.mp3,it's what you have always wanted to accomplish,0.0,3.00
4,dataset\train_audio\sample-013557.mp3,i just don't trust anyone who says they've nev...,0.0,7.00
...,...,...,...,...
95,dataset\test_audio\sample-002614.mp3,he rose and made his way back toward the palm ...,0.0,7.00
96,dataset\test_audio\sample-002945.mp3,an alchemist said the alchemist,0.0,4.00
97,dataset\test_audio\sample-003014.mp3,the englishman was disappointed,0.0,2.00
98,dataset\test_audio\sample-003335.mp3,what had been next to him five minutes ago was...,0.0,7.00


In [15]:
# Wrap the combined frame as a Hugging Face Dataset and declare 'audio' as an Audio feature
dataset = Dataset.from_pandas(full_df).cast_column("audio", Audio())

# Keep a local copy on disk so the result can be inspected
dataset.save_to_disk("huggingface_dataset")


Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 4040.95 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 3883.04 examples/s]


In [16]:
from huggingface_hub import HfApi
from datasets import DatasetDict

# Build one Dataset per split, each with its 'audio' column cast to the Audio feature
splits = {
    split_name: Dataset.from_pandas(frame).cast_column("audio", Audio())
    for split_name, frame in (("train", train_df), ("test", test_df))
}
train_dataset, test_dataset = splits["train"], splits["test"]

# Bundle the splits under their names
dataset_dict = DatasetDict(splits)

# Upload every split to the Hugging Face Hub repository
dataset_dict.push_to_hub("Tarakeshwaran/Whisper-Train-data")

Map: 100%|██████████| 80/80 [00:00<00:00, 4701.01 examples/s]?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 184.45ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.10s/it]
Map: 100%|██████████| 20/20 [00:00<00:00, 1453.40 examples/s]?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 578.52ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.81s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Tarakeshwaran/Whisper-train-data/commit/dcc9c3c2e9840103cd67d8f127e836227f84a49d', commit_message='Upload dataset', commit_description='', oid='dcc9c3c2e9840103cd67d8f127e836227f84a49d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Tarakeshwaran/Whisper-train-data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Tarakeshwaran/Whisper-train-data'), pr_revision=None, pr_num=None)