In [68]:
import os
from huggingface_hub import snapshot_download

def download_parquets(
    repo_id: str = "ai4bharat/IndicVoices",
    repo_type: str = "dataset",
    subfolder: str = "telugu",
    target_dir: str = "./IndicVoices_telugu_parquets",
    allow_pattern: str = f"telugu/*.parquet",
    revision: str = "main"
):
    os.makedirs(target_dir, exist_ok=True)
    # snapshot_download will respect allow_patterns to filter files
    snapshot_download(
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        local_dir=target_dir,
        local_dir_use_symlinks=False,
        allow_patterns=[allow_pattern]
    )
    print(f"Downloaded files into {target_dir}")

if __name__ == "__main__":
    download_parquets(target_dir="./data", subfolder="telugu")

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.
Fetching 47 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 47/47 [00:33<00:00,  1.40it/s]

Downloaded files into ./data





In [100]:
from tqdm import tqdm

# Define the columns to group by
group_columns = ['speaker_id', 'scenario', 'task_name', 'gender',
                 'age_group', 'job_type', 'qualification', 'area', 
                 'district', 'state', 'occupation']

# Initialize the ID column
df['id'] = 0

# Initialize the first ID
current_id = 1

# Iterate through the rows of the DataFrame with tqdm
for i in tqdm(range(len(df)), desc="Processing rows"):
    if i == 0:
        # Assign the first ID to the first row
        df.at[i, 'id'] = current_id
    else:
        # Check if the current row matches the previous row in the group columns
        if all(df.at[i, col] == df.at[i - 1, col] for col in group_columns):
            # Assign the same ID as the previous row
            df.at[i, 'id'] = current_id
        else:
            # Increment the ID and assign it to the current row
            current_id += 1
            df.at[i, 'id'] = current_id

Processing rows: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2629/2629 [00:00<00:00, 8455.33it/s]


In [167]:
df.head()

Unnamed: 0,audio_filepath,text,duration,lang,samples,verbatim,normalized,speaker_id,scenario,task_name,...,area,district,state,occupation,verification_report,unsanitized_verbatim,unsanitized_normalized,id,file,segment
0,"{'bytes': b'fLaC\x00\x00\x00""\x10\x00\x10\x00\...",‡∞µ‡∞≤ ‡∞ó‡∞ü‡±ç‡∞ü‡∞ø‡∞ó‡∞æ ‡∞≤‡∞æ‡∞ó‡∞ø ‡∞ö‡±Ç‡∞∏‡±á‡∞∏‡∞∞‡∞ø‡∞ï‡∞ø ‡∞Ü ‡∞µ‡∞≤‡∞≤‡±ã ‡∞™‡±Ü‡∞¶‡±ç‡∞¶ ‡∞ö‡±á‡∞™ ‡∞ï‡∞®‡∞™...,9.184,te,146944,‡∞µ‡∞≤ ‡∞ó‡∞ü‡±ç‡∞ü‡∞ø‡∞ó‡∞æ ‡∞≤‡∞æ‡∞ó‡∞ø ‡∞ö‡±Ç‡∞∏‡±á‡∞∏‡∞∞‡∞ø‡∞ï‡∞ø ‡∞Ü ‡∞µ‡∞≤‡∞≤‡±ã ‡∞™‡±Ü‡∞¶‡±ç‡∞¶ ‡∞ö‡±á‡∞™ ‡∞ï‡∞®‡∞™...,‡∞µ‡∞≤ ‡∞ó‡∞ü‡±ç‡∞ü‡∞ø‡∞ó‡∞æ ‡∞≤‡∞æ‡∞ó‡∞ø ‡∞ö‡±Ç‡∞∏‡±á‡∞∏‡∞∞‡∞ø‡∞ï‡∞ø ‡∞Ü ‡∞µ‡∞≤‡∞≤‡±ã ‡∞™‡±Ü‡∞¶‡±ç‡∞¶ ‡∞ö‡±á‡∞™ ‡∞ï‡∞®‡∞™...,S4258915300327248,Extempore,KYP - Fishing,...,Urban,Guntur,Andhra Pradesh,Business,"{'decision': 'excellent', 'low_volume': False,...",‡∞µ‡∞≤ ‡∞ó‡∞ü‡±ç‡∞ü‡∞ø‡∞ó‡∞æ ‡∞≤‡∞æ‡∞ó‡∞ø ‡∞ö‡±Ç‡∞∏‡±á‡∞∏‡∞∞‡∞ø‡∞ï‡∞ø ‡∞Ü ‡∞µ‡∞≤‡∞≤‡±ã ‡∞™‡±Ü‡∞¶‡±ç‡∞¶ ‡∞ö‡±á‡∞™ ‡∞ï‡∞®‡∞™...,‡∞µ‡∞≤ ‡∞ó‡∞ü‡±ç‡∞ü‡∞ø‡∞ó‡∞æ ‡∞≤‡∞æ‡∞ó‡∞ø ‡∞ö‡±Ç‡∞∏‡±á‡∞∏‡∞∞‡∞ø‡∞ï‡∞ø ‡∞Ü ‡∞µ‡∞≤‡∞≤‡±ã ‡∞™‡±Ü‡∞¶‡±ç‡∞¶ ‡∞ö‡±á‡∞™ ‡∞ï‡∞®‡∞™...,1,1,1
1,"{'bytes': b'fLaC\x00\x00\x00""\x10\x00\x10\x00\...",‡∞§‡±Ä‡∞∏‡±Å‡∞ï‡±ä‡∞ö‡±ç‡∞ö‡∞ø‡∞® ‡∞§‡∞∞‡±ç‡∞µ‡∞æ‡∞§ ‡∞Æ‡∞æ‡∞ï‡±Å ‡∞§‡±Ü‡∞≤‡∞ø‡∞Ø‡∞°‡∞Ç ‡∞ú‡∞∞‡∞ø‡∞ó‡∞ø‡∞Ç‡∞¶‡∞ø,2.646,te,42336,‡∞§‡±Ä‡∞∏‡±Å‡∞ï‡±ä‡∞ö‡±ç‡∞ö‡∞ø‡∞® ‡∞§‡∞∞‡±ç‡∞µ‡∞æ‡∞§ ‡∞Æ‡∞æ‡∞ï‡±Å ‡∞§‡±Ü‡∞≤‡∞ø‡∞Ø‡∞°‡∞Ç ‡∞ú‡∞∞‡∞ø‡∞ó‡∞ø‡∞Ç‡∞¶‡∞ø,‡∞§‡±Ä‡∞∏‡±Å‡∞ï‡±ä‡∞ö‡±ç‡∞ö‡∞ø‡∞® ‡∞§‡∞∞‡±ç‡∞µ‡∞æ‡∞§ ‡∞Æ‡∞æ‡∞ï‡±Å ‡∞§‡±Ü‡∞≤‡∞ø‡∞Ø‡∞°‡∞Ç ‡∞ú‡∞∞‡∞ø‡∞ó‡∞ø‡∞Ç‡∞¶‡∞ø,S4258915300327248,Extempore,KYP - Fishing,...,Urban,Guntur,Andhra Pradesh,Business,"{'decision': 'excellent', 'low_volume': False,...",‡∞§‡±Ä‡∞∏‡±Å‡∞ï‡±ä‡∞ö‡±ç‡∞ö‡∞ø‡∞® ‡∞§‡∞∞‡±ç‡∞µ‡∞æ‡∞§ ‡∞Æ‡∞æ‡∞ï‡±Å [noise] ‡∞§‡±Ü‡∞≤‡∞ø‡∞Ø‡∞°‡∞Ç ‡∞ú‡∞∞‡∞ø‡∞ó‡∞ø‡∞Ç‡∞¶‡∞ø,‡∞§‡±Ä‡∞∏‡±Å‡∞ï‡±ä‡∞ö‡±ç‡∞ö‡∞ø‡∞® ‡∞§‡∞∞‡±ç‡∞µ‡∞æ‡∞§ ‡∞Æ‡∞æ‡∞ï‡±Å [noise] ‡∞§‡±Ü‡∞≤‡∞ø‡∞Ø‡∞°‡∞Ç ‡∞ú‡∞∞‡∞ø‡∞ó‡∞ø‡∞Ç‡∞¶‡∞ø,2,1,2
2,"{'bytes': b'fLaC\x00\x00\x00""\x10\x00\x10\x00\...",‡∞∏‡∞ø‡∞∏‡±ç‡∞ï‡±ã ‡∞∏‡∞ø‡∞∏‡±ç‡∞ü‡∞Ç,2.248,te,35968,‡∞∏‡∞ø‡∞∏‡±ç‡∞ï‡±ã ‡∞∏‡∞ø‡∞∏‡±ç‡∞ü‡±Ü‡∞Æ‡±ç,‡∞∏‡∞ø‡∞∏‡±ç‡∞ï‡±ã ‡∞∏‡∞ø‡∞∏‡±ç‡∞ü‡∞Ç,S4259469500399772,Read,Alexa Commands,...,Rural,Kakinada,Andhra Pradesh,Student,"{'decision': 'excellent', 'low_volume': False,...",‡∞∏‡∞ø‡∞∏‡±ç‡∞ï‡±ã ‡∞∏‡∞ø‡∞∏‡±ç‡∞ü‡±Ü‡∞Æ‡±ç,‡∞∏‡∞ø‡∞∏‡±ç‡∞ï‡±ã [cisco] ‡∞∏‡∞ø‡∞∏‡±ç‡∞ü‡∞Ç [system],3,2,1
3,"{'bytes': b'fLaC\x00\x00\x00""\x10\x00\x10\x00\...",‡∞π‡∞≤‡±ã,0.448,te,7168,‡∞π‡∞≤‡±ã,‡∞π‡∞≤‡±ã,S4257365000302537,Conversation,Conversation,...,Urban,Sangareddy,Telangana,Student,"{'sst': False, 'comments': '', 'decision': 'ex...",‡∞π‡∞≤‡±ã,‡∞π‡∞≤‡±ã [hello],4,3,1
4,"{'bytes': b'fLaC\x00\x00\x00""\x10\x00\x10\x00\...",‡∞π‡∞≤‡±ã ‡∞®‡∞æ ‡∞™‡±á‡∞∞‡±Å ‡∞∂‡∞∞‡∞£‡±ç ‡∞®‡±á‡∞®‡±Å ‡∞Æ‡±Ä ‡∞ï‡∞Ç‡∞™‡±Ü‡∞®‡±Ä‡∞≤‡±ã ‡∞µ‡±á‡∞ï‡±Ü‡∞®‡±ç‡∞∏‡±Ä‡∞∏‡±ç ‡∞â...,6.017,te,96272,‡∞π‡∞≤‡±ã ‡∞®‡∞æ ‡∞™‡±á‡∞∞‡±Å ‡∞∂‡∞∞‡∞£‡±ç ‡∞®‡±á‡∞®‡±Å ‡∞Æ‡±Ä ‡∞ï‡∞Ç‡∞™‡±Ü‡∞®‡±Ä‡∞≤‡±ã ‡∞µ‡±á‡∞ï‡±Ü‡∞®‡±ç‡∞∏‡±Ä‡∞∏‡±ç ‡∞â...,‡∞π‡∞≤‡±ã ‡∞®‡∞æ ‡∞™‡±á‡∞∞‡±Å ‡∞∂‡∞∞‡∞£‡±ç ‡∞®‡±á‡∞®‡±Å ‡∞Æ‡±Ä ‡∞ï‡∞Ç‡∞™‡±Ü‡∞®‡±Ä‡∞≤‡±ã ‡∞µ‡±á‡∞ï‡±Ü‡∞®‡±ç‡∞∏‡±Ä‡∞∏‡±ç ‡∞â...,S4257365000302537,Conversation,Conversation,...,Urban,Sangareddy,Telangana,Student,"{'sst': False, 'comments': '', 'decision': 'ex...",‡∞π‡∞≤‡±ã ‡∞®‡∞æ ‡∞™‡±á‡∞∞‡±Å ‡∞∂‡∞∞‡∞£‡±ç ‡∞®‡±á‡∞®‡±Å ‡∞Æ‡±Ä ‡∞ï‡∞Ç‡∞™‡±Ü‡∞®‡±Ä‡∞≤‡±ã ‡∞µ‡±á‡∞ï‡±Ü‡∞®‡±ç‡∞∏‡±Ä‡∞∏‡±ç ‡∞â...,‡∞π‡∞≤‡±ã [hello] ‡∞®‡∞æ ‡∞™‡±á‡∞∞‡±Å ‡∞∂‡∞∞‡∞£‡±ç ‡∞®‡±á‡∞®‡±Å ‡∞Æ‡±Ä ‡∞ï‡∞Ç‡∞™‡±Ü‡∞®‡±Ä‡∞≤‡±ã ‡∞µ‡±á‡∞ï‡±Ü...,5,3,2


In [110]:
import os
import pandas as pd
from tqdm import tqdm

# Define the group-by columns
group_columns = [
    'speaker_id', 'scenario', 'task_name', 'gender',
    'age_group', 'job_type', 'qualification', 'area', 
    'district', 'state', 'occupation'
]

# Replace with your target directory
target_directory = "data/telugu"

# Function to assign IDs based on group change
def assign_ids(df, group_columns):
    df = df.copy()
    df['id'] = 0
    current_id = 1

    for i in range(len(df)):
        if i == 0:
            df.at[i, 'id'] = current_id
        else:
            if all(df.at[i, col] == df.at[i - 1, col] for col in group_columns):
                df.at[i, 'id'] = current_id
            else:
                current_id += 1
                df.at[i, 'id'] = current_id

    return df

# Walk through all files
for root, _, files in os.walk(target_directory):
    for file in files:
        if file.endswith(".parquet"):
            file_path = os.path.join(root, file)
            print(f"\nProcessing: {file_path}")

            try:
                df = pd.read_parquet(file_path)
                df.sort_values(by=group_columns, inplace=True, ignore_index=True)

                # Assign IDs
                df = assign_ids(df, group_columns)

                # Print the max ID
                print(f"Max ID in {file}: {df['id'].max()}")
            except Exception as e:
                print(f"Error processing {file}: {e}")
        break


Processing: data/telugu/train-00003-of-00046.parquet
Max ID in train-00003-of-00046.parquet: 972


In [122]:
import pandas as pd
from tqdm import tqdm

# Input Parquet file path
parquet_file = "data/telugu/train-00000-of-00046.parquet"  # change this to your actual file path

# Output path
output_file = "data/te/train-00000-of-00046.parquet"

# Read the parquet file
df = pd.read_parquet(parquet_file)

# Define the columns to group by
group_columns = ['speaker_id', 'scenario', 'task_name', 'gender',
                 'age_group', 'job_type', 'qualification', 'area', 
                 'district', 'state', 'occupation']

# Initialize new columns
df['file'] = 0
df['segment'] = 0

# Start counters
file_num = 1
segment_num = 1

# Process with tqdm
for i in tqdm(range(len(df)), desc="Assigning file/segment"):
    df.at[i, 'file'] = file_num
    df.at[i, 'segment'] = segment_num
    
    if i < len(df) - 1:
        same_group = all(df.at[i, col] == df.at[i+1, col] for col in group_columns)
        if same_group:
            segment_num += 1
        else:
            file_num += 1
            segment_num = 1

# Save to new parquet
df.to_parquet(output_file, index=False)
print(f"‚úÖ Saved output to: {output_file}")

Assigning file/segment: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4777/4777 [00:00<00:00, 8265.39it/s]


‚úÖ Saved output to: data/te/train-00000-of-00046.parquet


In [131]:
df = pd.read_parquet(output_file)

# max file number
df['file'].max()

1064

In [160]:
# get all the segments of a given file

file = 14
df[df['file'] == file].drop(columns=['audio_filepath'])['text'].to_list()

['‡∞®‡∞æ‡∞ï‡±Å ‡∞™‡±Ç‡∞≤‡±ç‡∞≤‡±ã ‡∞à‡∞§ ‡∞ï‡±ä‡∞ü‡±ç‡∞ü‡∞°‡∞Ç ‡∞Ö‡∞Ç‡∞ü‡±á ‡∞á‡∞∑‡±ç‡∞ü‡∞Æ‡±Å ‡∞é‡∞Ç‡∞¶‡±Å‡∞ï‡∞Ç‡∞ü‡±á ‡∞Ü ‡∞™‡±Ç‡∞≤‡±ç‡∞≤‡±ã ‡∞®‡±Ä‡∞∞‡±Å ‡∞í‡∞ï‡±á ‡∞¶‡∞ó‡±ç‡∞ó‡∞∞ ‡∞â‡∞Ç‡∞ü‡±Å‡∞Ç‡∞¶‡∞ø ‡∞ï‡∞æ‡∞®‡±Ä ‡∞Æ‡∞®‡∞Æ‡±Å',
 '‡∞¨‡±Ä‡∞ö‡±ç ‡∞≤‡±á‡∞¶‡∞æ ‡∞∏‡∞∞‡∞∏‡±ç‡∞∏‡±Å‡∞≤‡±ã ‡∞à‡∞§ ‡∞ï‡±ä‡∞ü‡±ç‡∞ü‡∞ø‡∞®‡∞™‡±ç‡∞™‡±Å‡∞°‡±Å ‡∞®‡±Ä‡∞ü‡∞ø ‡∞™‡±ç‡∞∞‡∞µ‡∞æ‡∞π‡∞Ç ‡∞Ö‡∞®‡±á‡∞¶‡∞ø ‡∞í‡∞ï‡∞µ‡±à‡∞™‡±Å ‡∞â‡∞Ç‡∞ü‡±Å‡∞Ç‡∞¶‡∞ø ‡∞Æ‡∞∞‡∞ø‡∞Ø‡±Å ‡∞Æ‡∞®‡∞≤‡±ç‡∞®‡∞ø ‡∞Ü ‡∞™‡±ç‡∞∞‡∞µ‡∞æ‡∞π‡∞æ‡∞Ç ‡∞é‡∞ü‡±Å ‡∞µ‡±à‡∞™‡±Å ‡∞â‡∞Ç‡∞ü‡±á ‡∞Ü‡∞ü‡±Å‡∞µ‡±à‡∞™‡±Å',
 '‡∞Æ‡∞®‡∞≤‡±ç‡∞®‡∞ø ‡∞§‡±Ä‡∞∏‡±Å‡∞ï‡±Å ‡∞µ‡±Ü‡∞≥‡±ç‡∞≥‡∞ø‡∞™‡±ã‡∞§‡±Å‡∞Ç‡∞¶‡∞ø ‡∞Ö‡∞Ç‡∞¶‡±Å‡∞ï‡∞®‡∞ø ‡∞Ö‡∞ï‡±ç‡∞ï‡∞° ‡∞à‡∞§ ‡∞ï‡±ä‡∞ü‡±ç‡∞ü‡∞°‡∞Ç ‡∞ö‡∞æ‡∞≤‡∞æ ‡∞ï‡∞∑‡±ç‡∞ü‡∞Ç‡∞ó‡∞æ ‡∞â‡∞Ç‡∞ü‡±Å‡∞Ç‡∞¶‡∞ø ‡∞ï‡∞æ‡∞®‡±Ä ‡∞Æ‡∞®‡∞Ç ‡∞™‡±Ç‡∞≤‡±ç‡∞≤‡±ã ‡∞à‡∞§ ‡∞ï‡±ä‡∞°‡∞ø‡∞§‡±á',
 '‡∞Ö‡∞ï‡±ç‡∞ï‡∞° ‡∞®‡±Ä‡∞ü‡±Å ‡∞®‡±Ä‡∞∞‡±Å ‡∞í‡∞ï‡±á ‡∞í‡∞ï‡±á ‡∞∏‡±ç‡∞•‡∞ø‡∞∞‡∞Æ‡±à‡∞® ‡∞™‡±ç‡∞∞‡∞¶‡±á‡∞∂‡∞Ç‡∞≤‡±ã ‡∞â‡∞Ç‡∞ü‡∞æ‡∞Ø‡∞ø ‡∞®‡±Ä‡∞ü‡∞ø‡∞≤‡±ã ‡∞é‡∞≤‡∞æ‡∞Ç‡∞ü‡∞ø ‡∞ï‡

In [163]:
import os
import pandas as pd
from tqdm import tqdm

# Directory containing the Parquet files
input_dir = "data/telugu"

# Columns to group by
group_columns = ['speaker_id', 'scenario', 'task_name', 'gender',
                 'age_group', 'job_type', 'qualification', 'area',
                 'district', 'state', 'occupation']

# List all .parquet files in the directory
parquet_files = [f for f in os.listdir(input_dir) if f.endswith('.parquet')]

for file in tqdm(parquet_files, desc="Processing files"):
    file_path = os.path.join(input_dir, file)
    
    # Load DataFrame
    df = pd.read_parquet(file_path)
    
    # Initialize new columns
    df['file'] = 0
    df['segment'] = 0

    # Counters
    file_num = 1
    segment_num = 1

    # Assign file and segment numbers
    for i in range(len(df)):
        df.at[i, 'file'] = file_num
        df.at[i, 'segment'] = segment_num

        if i < len(df) - 1:
            same_group = all(df.at[i, col] == df.at[i+1, col] for col in group_columns)
            if same_group:
                segment_num += 1
            else:
                file_num += 1
                segment_num = 1

    # Overwrite the same file
    df.to_parquet(file_path, index=False)

    print(f"‚úÖ Updated: {file_path}")


Processing files:   2%|‚ñè         | 1/47 [00:01<01:07,  1.46s/it]

‚úÖ Updated: data/telugu/train-00003-of-00046.parquet


Processing files:   4%|‚ñç         | 2/47 [00:02<01:02,  1.38s/it]

‚úÖ Updated: data/telugu/train-00022-of-00046.parquet


Processing files:   6%|‚ñã         | 3/47 [00:04<01:13,  1.68s/it]

‚úÖ Updated: data/telugu/train-00036-of-00046.parquet


Processing files:   9%|‚ñä         | 4/47 [00:05<01:02,  1.46s/it]

‚úÖ Updated: data/telugu/train-00008-of-00046.parquet


Processing files:  11%|‚ñà         | 5/47 [00:07<01:01,  1.47s/it]

‚úÖ Updated: data/telugu/train-00029-of-00046.parquet


Processing files:  13%|‚ñà‚ñé        | 6/47 [00:08<00:55,  1.35s/it]

‚úÖ Updated: data/telugu/train-00017-of-00046.parquet


Processing files:  15%|‚ñà‚ñç        | 7/47 [00:09<00:51,  1.30s/it]

‚úÖ Updated: data/telugu/train-00013-of-00046.parquet


Processing files:  17%|‚ñà‚ñã        | 8/47 [00:11<00:59,  1.52s/it]

‚úÖ Updated: data/telugu/train-00032-of-00046.parquet


Processing files:  19%|‚ñà‚ñâ        | 9/47 [00:12<00:54,  1.43s/it]

‚úÖ Updated: data/telugu/train-00018-of-00046.parquet


Processing files:  21%|‚ñà‚ñà‚ñè       | 10/47 [00:15<01:00,  1.64s/it]

‚úÖ Updated: data/telugu/train-00026-of-00046.parquet


Processing files:  23%|‚ñà‚ñà‚ñé       | 11/47 [00:16<01:00,  1.68s/it]

‚úÖ Updated: data/telugu/train-00043-of-00046.parquet


Processing files:  26%|‚ñà‚ñà‚ñå       | 12/47 [00:17<00:48,  1.40s/it]

‚úÖ Updated: data/telugu/valid-00000-of-00001.parquet


Processing files:  28%|‚ñà‚ñà‚ñä       | 13/47 [00:18<00:44,  1.32s/it]

‚úÖ Updated: data/telugu/train-00007-of-00046.parquet


Processing files:  30%|‚ñà‚ñà‚ñâ       | 14/47 [00:20<00:50,  1.54s/it]

‚úÖ Updated: data/telugu/train-00039-of-00046.parquet


Processing files:  32%|‚ñà‚ñà‚ñà‚ñè      | 15/47 [00:22<00:48,  1.51s/it]

‚úÖ Updated: data/telugu/train-00023-of-00046.parquet


Processing files:  34%|‚ñà‚ñà‚ñà‚ñç      | 16/47 [00:23<00:42,  1.38s/it]

‚úÖ Updated: data/telugu/train-00002-of-00046.parquet


Processing files:  36%|‚ñà‚ñà‚ñà‚ñå      | 17/47 [00:25<00:44,  1.48s/it]

‚úÖ Updated: data/telugu/train-00028-of-00046.parquet


Processing files:  38%|‚ñà‚ñà‚ñà‚ñä      | 18/47 [00:26<00:42,  1.47s/it]

‚úÖ Updated: data/telugu/train-00016-of-00046.parquet


Processing files:  40%|‚ñà‚ñà‚ñà‚ñà      | 19/47 [00:28<00:44,  1.59s/it]

‚úÖ Updated: data/telugu/train-00037-of-00046.parquet


Processing files:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 20/47 [00:29<00:39,  1.48s/it]

‚úÖ Updated: data/telugu/train-00009-of-00046.parquet


Processing files:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 21/47 [00:31<00:42,  1.65s/it]

‚úÖ Updated: data/telugu/train-00033-of-00046.parquet


Processing files:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 22/47 [00:32<00:37,  1.50s/it]

‚úÖ Updated: data/telugu/train-00012-of-00046.parquet


Processing files:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 23/47 [00:34<00:35,  1.47s/it]

‚úÖ Updated: data/telugu/train-00006-of-00046.parquet


Processing files:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 24/47 [00:36<00:38,  1.69s/it]

‚úÖ Updated: data/telugu/train-00038-of-00046.parquet


Processing files:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 25/47 [00:38<00:41,  1.90s/it]

‚úÖ Updated: data/telugu/train-00042-of-00046.parquet


Processing files:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 26/47 [00:39<00:35,  1.67s/it]

‚úÖ Updated: data/telugu/train-00019-of-00046.parquet


Processing files:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 27/47 [00:41<00:32,  1.64s/it]

‚úÖ Updated: data/telugu/train-00027-of-00046.parquet


Processing files:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 28/47 [00:42<00:28,  1.50s/it]

‚úÖ Updated: data/telugu/train-00015-of-00046.parquet


Processing files:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 29/47 [00:44<00:28,  1.59s/it]

‚úÖ Updated: data/telugu/train-00034-of-00046.parquet


Processing files:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 30/47 [00:46<00:28,  1.70s/it]

‚úÖ Updated: data/telugu/train-00045-of-00046.parquet


Processing files:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 31/47 [00:47<00:26,  1.65s/it]

‚úÖ Updated: data/telugu/train-00020-of-00046.parquet


Processing files:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 32/47 [00:49<00:24,  1.63s/it]

‚úÖ Updated: data/telugu/train-00001-of-00046.parquet


Processing files:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 33/47 [00:51<00:22,  1.59s/it]

‚úÖ Updated: data/telugu/train-00005-of-00046.parquet


Processing files:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 34/47 [00:53<00:22,  1.73s/it]

‚úÖ Updated: data/telugu/train-00041-of-00046.parquet


Processing files:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 35/47 [00:54<00:21,  1.79s/it]

‚úÖ Updated: data/telugu/train-00024-of-00046.parquet


Processing files:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 36/47 [00:56<00:18,  1.68s/it]

‚úÖ Updated: data/telugu/train-00030-of-00046.parquet


Processing files:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 37/47 [00:57<00:14,  1.48s/it]

‚úÖ Updated: data/telugu/train-00011-of-00046.parquet


Processing files:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 38/47 [00:59<00:15,  1.77s/it]

‚úÖ Updated: data/telugu/train-00035-of-00046.parquet


Processing files:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 39/47 [01:00<00:12,  1.57s/it]

‚úÖ Updated: data/telugu/train-00014-of-00046.parquet


Processing files:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 40/47 [01:02<00:10,  1.43s/it]

‚úÖ Updated: data/telugu/train-00000-of-00046.parquet


Processing files:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 41/47 [01:03<00:08,  1.36s/it]

‚úÖ Updated: data/telugu/train-00021-of-00046.parquet


Processing files:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 42/47 [01:05<00:07,  1.47s/it]

‚úÖ Updated: data/telugu/train-00044-of-00046.parquet


Processing files:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 43/47 [01:07<00:06,  1.63s/it]

‚úÖ Updated: data/telugu/train-00025-of-00046.parquet


Processing files:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 44/47 [01:09<00:05,  1.74s/it]

‚úÖ Updated: data/telugu/train-00040-of-00046.parquet


Processing files:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 45/47 [01:10<00:03,  1.57s/it]

‚úÖ Updated: data/telugu/train-00004-of-00046.parquet


Processing files:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 46/47 [01:11<00:01,  1.42s/it]

‚úÖ Updated: data/telugu/train-00010-of-00046.parquet


Processing files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 47/47 [01:12<00:00,  1.55s/it]

‚úÖ Updated: data/telugu/train-00031-of-00046.parquet





In [164]:
# print the max file number for all files
for root, _, files in os.walk(input_dir):
    for file in files:
        if file.endswith(".parquet"):
            file_path = os.path.join(root, file)
            print(f"\nProcessing: {file_path}")

            try:
                df = pd.read_parquet(file_path)
                print(f"Max file number in {file}: {df['file'].max()}")
            except Exception as e:
                print(f"Error processing {file}: {e}")


Processing: data/telugu/train-00003-of-00046.parquet
Max file number in train-00003-of-00046.parquet: 1050

Processing: data/telugu/train-00022-of-00046.parquet
Max file number in train-00022-of-00046.parquet: 929

Processing: data/telugu/train-00036-of-00046.parquet
Max file number in train-00036-of-00046.parquet: 603

Processing: data/telugu/train-00008-of-00046.parquet
Max file number in train-00008-of-00046.parquet: 1025

Processing: data/telugu/train-00029-of-00046.parquet
Max file number in train-00029-of-00046.parquet: 825

Processing: data/telugu/train-00017-of-00046.parquet
Max file number in train-00017-of-00046.parquet: 1076

Processing: data/telugu/train-00013-of-00046.parquet
Max file number in train-00013-of-00046.parquet: 1018

Processing: data/telugu/train-00032-of-00046.parquet
Max file number in train-00032-of-00046.parquet: 837

Processing: data/telugu/train-00018-of-00046.parquet
Max file number in train-00018-of-00046.parquet: 962

Processing: data/telugu/train-00

In [169]:
# Rename all parquet files in data/telugu from 1.parquet to n.parquet
import os
directory = "data/telugu"
files = [f for f in os.listdir(directory) if f.endswith('.parquet')]
for index, file in enumerate(sorted(files), start=1):
    old_path = os.path.join(directory, file)
    new_filename = f"{index}.parquet"
    new_path = os.path.join(directory, new_filename)
    os.rename(old_path, new_path)
    print(f"Renamed: {old_path} -> {new_path}")

Renamed: data/telugu/train-00000-of-00046.parquet -> data/telugu/1.parquet
Renamed: data/telugu/train-00001-of-00046.parquet -> data/telugu/2.parquet
Renamed: data/telugu/train-00002-of-00046.parquet -> data/telugu/3.parquet
Renamed: data/telugu/train-00003-of-00046.parquet -> data/telugu/4.parquet
Renamed: data/telugu/train-00004-of-00046.parquet -> data/telugu/5.parquet
Renamed: data/telugu/train-00005-of-00046.parquet -> data/telugu/6.parquet
Renamed: data/telugu/train-00006-of-00046.parquet -> data/telugu/7.parquet
Renamed: data/telugu/train-00007-of-00046.parquet -> data/telugu/8.parquet
Renamed: data/telugu/train-00008-of-00046.parquet -> data/telugu/9.parquet
Renamed: data/telugu/train-00009-of-00046.parquet -> data/telugu/10.parquet
Renamed: data/telugu/train-00010-of-00046.parquet -> data/telugu/11.parquet
Renamed: data/telugu/train-00011-of-00046.parquet -> data/telugu/12.parquet
Renamed: data/telugu/train-00012-of-00046.parquet -> data/telugu/13.parquet
Renamed: data/telugu/

In [1]:
import os
import pandas as pd
from glob import glob
from tqdm import tqdm

# Paths
parquet_dir = "data/telugu"
output_dir = os.path.join(parquet_dir, "audio")
os.makedirs(output_dir, exist_ok=True)

# Get all parquet files
parquet_files = glob(os.path.join(parquet_dir, "*.parquet"))

saved_count = 0

for parquet_file in parquet_files:
    parquet_basename = os.path.splitext(os.path.basename(parquet_file))[0]

    print(f"üìÑ Processing: {parquet_file}")
    try:
        df = pd.read_parquet(parquet_file)
    except Exception as e:
        print(f"‚ö†Ô∏è Could not read {parquet_file}: {e}")
        continue

    # Check for necessary columns
    if not {'file', 'segment', 'audio_filepath'}.issubset(df.columns):
        print(f"‚ö†Ô∏è Skipping {parquet_file} ‚Äî missing required columns.")
        continue

    # Iterate over each row
    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Saving from {parquet_basename}"):
        try:
            file_id = int(row['file'])
            segment_id = int(row['segment'])

            audio_data = row['audio_filepath']
            if isinstance(audio_data, dict) and 'bytes' in audio_data:
                audio_bytes = audio_data['bytes']
                filename = f"{parquet_basename}-{file_id}-{segment_id}.wav"
                output_path = os.path.join(output_dir, filename)

                if not os.path.exists(output_path):
                    with open(output_path, 'wb') as f:
                        f.write(audio_bytes)
                    saved_count += 1
            else:
                print(f"‚ö†Ô∏è Missing 'bytes' in audio_filepath for file={file_id}, segment={segment_id}")
        except Exception as e:
            print(f"‚ùå Failed to save audio for file={file_id}, segment={segment_id}: {e}")

print(f"\n‚úÖ Done. Saved {saved_count} audio files to '{output_dir}'")

üìÑ Processing: data/telugu/8.parquet


Saving from 8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 8865.54it/s]


üìÑ Processing: data/telugu/34.parquet


Saving from 34: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 8049.49it/s]


üìÑ Processing: data/telugu/24.parquet


Saving from 24: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 8176.68it/s]


üìÑ Processing: data/telugu/46.parquet


Saving from 46: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 8483.12it/s]


üìÑ Processing: data/telugu/1.parquet


Saving from 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4777/4777 [00:00<00:00, 7731.14it/s]


üìÑ Processing: data/telugu/12.parquet


Saving from 12: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 8062.95it/s]


üìÑ Processing: data/telugu/13.parquet


Saving from 13: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 5477.59it/s]


üìÑ Processing: data/telugu/9.parquet


Saving from 9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 8198.76it/s]


üìÑ Processing: data/telugu/25.parquet


Saving from 25: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 8626.73it/s]


üìÑ Processing: data/telugu/35.parquet


Saving from 35: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6518.17it/s]


üìÑ Processing: data/telugu/47.parquet


Saving from 47: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2629/2629 [00:00<00:00, 7732.44it/s]


üìÑ Processing: data/telugu/11.parquet


Saving from 11: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 8683.32it/s]


üìÑ Processing: data/telugu/2.parquet


Saving from 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4777/4777 [00:00<00:00, 7316.35it/s]


üìÑ Processing: data/telugu/18.parquet


Saving from 18: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6253.90it/s]


üìÑ Processing: data/telugu/45.parquet


Saving from 45: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:01<00:00, 4473.27it/s]


üìÑ Processing: data/telugu/37.parquet


Saving from 37: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6178.61it/s]


üìÑ Processing: data/telugu/27.parquet


Saving from 27: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6856.13it/s]


üìÑ Processing: data/telugu/19.parquet


Saving from 19: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 7545.56it/s]


üìÑ Processing: data/telugu/44.parquet


Saving from 44: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6805.12it/s]


üìÑ Processing: data/telugu/26.parquet


Saving from 26: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6519.93it/s]


üìÑ Processing: data/telugu/36.parquet


Saving from 36: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:01<00:00, 4234.18it/s]


üìÑ Processing: data/telugu/10.parquet


Saving from 10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6659.78it/s]


üìÑ Processing: data/telugu/3.parquet


Saving from 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4777/4777 [00:00<00:00, 7199.32it/s]


üìÑ Processing: data/telugu/15.parquet


Saving from 15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6519.32it/s]


üìÑ Processing: data/telugu/6.parquet


Saving from 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4777/4777 [00:00<00:00, 7263.07it/s]


üìÑ Processing: data/telugu/41.parquet


Saving from 41: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6705.49it/s]


üìÑ Processing: data/telugu/33.parquet


Saving from 33: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 7524.90it/s]


üìÑ Processing: data/telugu/23.parquet


Saving from 23: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 7112.11it/s]


üìÑ Processing: data/telugu/40.parquet


Saving from 40: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6534.72it/s]


üìÑ Processing: data/telugu/22.parquet


Saving from 22: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6494.43it/s]


üìÑ Processing: data/telugu/32.parquet


Saving from 32: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 7016.36it/s]


üìÑ Processing: data/telugu/14.parquet


Saving from 14: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6974.40it/s]


üìÑ Processing: data/telugu/7.parquet


Saving from 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 7272.24it/s]


üìÑ Processing: data/telugu/30.parquet


Saving from 30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6635.09it/s]


üìÑ Processing: data/telugu/20.parquet


Saving from 20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 5832.46it/s]


üìÑ Processing: data/telugu/42.parquet


Saving from 42: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 5595.06it/s]


üìÑ Processing: data/telugu/29.parquet


Saving from 29: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6363.54it/s]


üìÑ Processing: data/telugu/39.parquet


Saving from 39: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6369.33it/s]


üìÑ Processing: data/telugu/5.parquet


Saving from 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4777/4777 [00:00<00:00, 6587.37it/s]


üìÑ Processing: data/telugu/16.parquet


Saving from 16: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6542.89it/s]


üìÑ Processing: data/telugu/38.parquet


Saving from 38: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6204.49it/s]


üìÑ Processing: data/telugu/28.parquet


Saving from 28: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6158.33it/s]


üìÑ Processing: data/telugu/4.parquet


Saving from 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4777/4777 [00:00<00:00, 6616.85it/s]


üìÑ Processing: data/telugu/17.parquet


Saving from 17: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 7125.63it/s]


üìÑ Processing: data/telugu/21.parquet


Saving from 21: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 7547.77it/s]


üìÑ Processing: data/telugu/31.parquet


Saving from 31: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:00<00:00, 6570.31it/s]


üìÑ Processing: data/telugu/43.parquet


Saving from 43: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4776/4776 [00:01<00:00, 3793.21it/s]


‚úÖ Done. Saved 222331 audio files to 'data/telugu/audio'





In [24]:
import pandas as pd
from IPython.display import Audio, display
import io
import base64
import ast

def maxFileCount(parquet_file):
    parquet_file = f"data/telugu/{parquet_file}.parquet"
    try:
        df = pd.read_parquet(parquet_file)
        if 'file' in df.columns:
            return df['file'].max()
        print(f"‚ö†Ô∏è 'file' column not found in {parquet_file}")
    except Exception as e:
        print(f"‚ùå Error reading {parquet_file}: {e}")
    return None

def getFile(parquet_file=1, file_number=1):
    parquet_file = f"data/telugu/{parquet_file}.parquet"
    try:
        df = pd.read_parquet(parquet_file)
        if {'file', 'unsanitized_normalized'}.issubset(df.columns):
            return df[df['file'] == file_number]['unsanitized_normalized'].tolist()
        print(f"‚ö†Ô∏è Required columns not found in {parquet_file}")
    except Exception as e:
        print(f"‚ùå Error reading {parquet_file}: {e}")
    return []

def getAudio(parquet_file=1, file_number=1, segment_number=1):
    parquet_file = f"data/telugu/{parquet_file}.parquet"
    try:
        df = pd.read_parquet(parquet_file)
        if {'file', 'segment', 'audio_filepath', 'samples'}.issubset(df.columns):
            row = df[(df['file'] == file_number) & (df['segment'] == segment_number)]
            if not row.empty:
                audio_data = row.iloc[0]['audio_filepath']
                if isinstance(audio_data, dict) and 'bytes' in audio_data:
                    return audio_data['bytes']
                print(f"‚ö†Ô∏è 'bytes' key not found in audio_filepath")
            else:
                print(f"‚ö†Ô∏è No matching file={file_number}, segment={segment_number}")
        else:
            print(f"‚ö†Ô∏è Required columns not found in {parquet_file}")
    except Exception as e:
        print(f"‚ùå Error reading {parquet_file}: {e}")
    return None

def play(audioStr, autoplay=True):
    audio_player = Audio(audioStr, autoplay=autoplay, rate=48000)
    display(audio_player)

def getMetadata(parquet_file=1, file_number=1, segment_number=1, columns=['lang', 'scenario', 'task_name', 'gender', 'age_group', 'area', 'district', 'state']):
    parquet_file = f"data/telugu/{parquet_file}.parquet"
    try:
        df = pd.read_parquet(parquet_file)
        if {'file', 'segment'}.issubset(df.columns) and all(col in df.columns for col in columns):
            row = df[(df['file'] == file_number) & (df['segment'] == segment_number)]
            if not row.empty:
                metadata = {col: row.iloc[0][col] for col in columns}
                return metadata
            print(f"‚ö†Ô∏è No matching file={file_number}, segment={segment_number}")
        else:
            print(f"‚ö†Ô∏è Required columns not found in {parquet_file}")
    except Exception as e:
        print(f"‚ùå Error reading {parquet_file}: {e}")
    return {}

In [25]:
parquet = 1
file = 2
print("Total Audio Files:", maxFileCount(parquet))
print("Total Segments", len(getFile(parquet, file)))

for segment in getFile(parquet, file):
    print(segment)
    play(getAudio(parquet, file, getFile(parquet, file).index(segment)+1))
    print(getMetadata(parquet, file, getFile(parquet, file).index(segment)+1))

Total Audio Files: 1064
Total Segments 1
‡∞®‡±á‡∞®‡±Å ‡∞®‡∞æ‡∞≤‡±Å‡∞ó‡±Å ‡∞é‡∞®‡∞∞‡±ç‡∞ú‡∞æ‡∞Ø‡∞∞‡±ç‡∞∏‡±ç [energiors] ‡∞Ü‡∞≤‡±ç‡∞ï‡∞≤‡±Ä‡∞®‡±ç [alkaline] ‡∞¨‡±ç‡∞Ø‡∞æ‡∞ü‡∞∞‡±Ä‡∞≤‡±Å ‡∞Ü‡∞∞‡±ç‡∞°‡∞∞‡±ç [order] ‡∞ö‡±á‡∞Ø‡∞æ‡∞≤‡∞ø ‡∞¶‡∞æ‡∞®‡±ç‡∞®‡∞ø ‡∞∑‡∞æ‡∞™‡∞ø‡∞Ç‡∞ó‡±ç [shopping] ‡∞≤‡∞ø‡∞∏‡±ç‡∞ü‡±Å‡∞ï‡∞ø ‡∞ö‡±á‡∞∞‡±ç‡∞ö‡±Å‡∞§‡∞æ‡∞µ‡∞æ


{'lang': 'te', 'scenario': 'Read', 'task_name': 'Bigbasket Commands', 'gender': 'Male', 'age_group': '60+', 'area': 'Rural', 'district': 'West Godavari', 'state': 'Andhra Pradesh'}
