In [11]:
def convert_to_hms(duration):
    """Format a duration given in seconds as ``HH:MM:SS.mmm``.

    The value is first rounded to whole milliseconds using fixed-point
    formatting, then split into hours, minutes, seconds and milliseconds
    with integer arithmetic, so no floating-point remainder can leak into
    the result.

    Args:
        duration: Duration in seconds (int or float). Negative values are
            rendered with a leading ``-`` followed by the formatted magnitude.

    Returns:
        A string such as ``"00:05:14.681"``. The hours field is zero-padded
        to at least two digits and grows as needed.

    Raises:
        ValueError: If ``duration`` is NaN or infinite (it has no
            fixed-point decimal representation to split).
    """
    # Round the magnitude to milliseconds; working on abs() keeps the sign out
    # of the integer/fraction split (otherwise "-1.500" would be read as
    # -1 s + 500 ms, i.e. -0.5 s).
    whole_part, fraction_part = f"{abs(duration):.3f}".split('.')
    total_ms = int(whole_part) * 1000 + int(fraction_part)

    # Break the millisecond count down with exact integer arithmetic
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    # Only show a sign when the rounded value is really below zero
    sign = '-' if duration < 0 and total_ms else ''

    # Format the result as hour:minute:second.millisecond
    return f"{sign}{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"


In [1]:
import boto3

# S3 client plus the bucket / prefix whose objects are listed
s3 = boto3.client('s3')
bucket_name = 'monlam.ai.stt'
folder_name = 'wav16k/'

# Walk every result page under the prefix and take the last path component of
# each object key
paginator = s3.get_paginator('list_objects_v2')
base_names = (
    obj['Key'].rsplit('/', 1)[-1]
    for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_name)
    for obj in page.get('Contents', [])
)

# Keys ending in "/" yield an empty name (folder placeholders) and are dropped
file_names = [base_name for base_name in base_names if base_name]

# Persist the collected names, one per line
with open('s3_file_list.txt', 'w') as f:
    f.writelines(f"{name}\n" for name in file_names)

print(f"Total files found: {len(file_names)}")


Total files found: 1636705


In [1]:
import pandas as pd
import re
from tqdm import tqdm

# Destination of the per-segment table written further down
output_csv = 'audio_data/audio_segments_1.csv'
# Source listing of audio files; the parsing loop reads its `file_name` column
# (update with your actual file name)
input_csv = 'wav16k.csv'

# Read the listing into a pandas DataFrame
df = pd.read_csv(input_csv)

In [2]:
# Number of rows in the loaded file listing
len(df)

1636705

In [3]:
# One independent, empty list per output column
(
    file_names,
    segment_audio_file_names,
    segment_audio_durations,
    total_audio_segment_lengths,
    department_names,
) = ([] for _unused in range(5))

# Maps a base file name to the summed duration of its segments
total_durations = dict()

In [4]:
# Regular expression pattern to parse segment file names.
# Capture groups:
#   1: "STT_" followed by two word characters (used as the department name)
#   2: remainder of the base file name (matched lazily)
#   3: a run of digits that the parsing loop does not read
#   4: segment start, 5: segment end (the digits around "_to_")
# The parsing loop divides (group 5 - group 4) by 1000 to obtain seconds,
# i.e. it treats both values as milliseconds.
pattern = re.compile(r"^(STT_\w{2})(.+?)_(\d+)_(\d+)_to_(\d+)\.wav$")

In [5]:
# Alternate pattern for the non-matching file names (shape: STT_XX<5 digits>_<sec>.<ms>-<sec>.<ms>.wav); uncomment to use it
#pattern = re.compile(r"^(STT_\w{2})(\d{5})_(\d+)\.(\d+)-(\d+)\.(\d+)\.wav$")


In [11]:
"""
Run this code (with the alternate pattern from the previous cell enabled) to get the details of the non-matching files.
non_matching_files = []

# Process each file name
for file_name in tqdm(df['file_name'], desc="Processing files", unit="file"):
    match = pattern.match(file_name)
    if match:
        department_name = match.group(1)  # Department name
        base_file_name = f"{department_name}{match.group(2)}"  # Base file name
        start_ms = int(match.group(3)) * 1000 + int(match.group(4))
        end_ms = int(match.group(5)) * 1000 + int(match.group(6))

        # Calculate segment duration in seconds and milliseconds
        segment_duration_ms = end_ms - start_ms
        segment_duration = segment_duration_ms / 1000.0  # Convert to seconds

        # Update the total duration for the file
        if base_file_name not in total_durations:
            total_durations[base_file_name] = 0
        total_durations[base_file_name] += segment_duration

        # Append data to the lists
        file_names.append(base_file_name)
        segment_audio_file_names.append(file_name)
        segment_audio_durations.append(segment_duration)
        total_audio_segment_lengths.append(total_durations[base_file_name])
        department_names.append(department_name)
    else:
        # Add the non-matching file name to the list
        non_matching_files.append(file_name)

"""

Processing files: 100%|██████████| 362854/362854 [00:00<00:00, 457919.30file/s]


In [6]:
# Reset every accumulator in this cell (not only `non_matching_files`) so that
# re-running it starts from a clean slate instead of appending duplicate rows
# and double-counting the per-file totals.
file_names = []
segment_audio_file_names = []
segment_audio_durations = []
total_audio_segment_lengths = []
department_names = []
total_durations = {}
non_matching_files = []

# Process each file name
for file_name in tqdm(df['file_name'], desc="Processing files", unit="file"):
    match = pattern.match(file_name)
    if not match:
        # Keep names that do not fit the pattern so they can be saved separately
        non_matching_files.append(file_name)
        continue

    department_name = match.group(1)  # "STT_" plus two characters, e.g. STT_AB
    base_file_name = f"{department_name}{match.group(2)}"  # Base file name
    # Group 3 is not needed here; groups 4 and 5 are the segment start and end
    start_ms = int(match.group(4))
    end_ms = int(match.group(5))

    # Calculate segment duration in seconds
    segment_duration = (end_ms - start_ms) / 1000.0

    # Update the running total duration for the base file
    total_durations[base_file_name] = total_durations.get(base_file_name, 0) + segment_duration

    # Append data to the lists (one entry per matching segment file)
    file_names.append(base_file_name)
    segment_audio_file_names.append(file_name)
    segment_audio_durations.append(segment_duration)
    total_audio_segment_lengths.append(total_durations[base_file_name])
    department_names.append(department_name)



Processing files: 100%|██████████| 1636705/1636705 [00:02<00:00, 675310.55file/s]


In [7]:
non_matching_csv = 'audio_data/non_matching_files.csv'
# If there are non-matching files, save them to a separate CSV file
if non_matching_files:
    non_matching_df = pd.DataFrame(non_matching_files, columns=['file_name'])
    non_matching_df.to_csv(non_matching_csv, index=False)
    print(f"Non-matching file names have been saved to {non_matching_csv}", len(non_matching_files))

# The segments CSV (`output_csv`) is only written by the cell that builds
# `output_df`, so no success message for it is printed here.


Non-matching file names have been saved to non_matching_files.csv 362854
CSV file audio_segments.csv created successfully.


In [8]:
# Column name -> accumulated values for the per-segment table
segment_columns = {
    'file_name': file_names,
    'segment_audio_file_name': segment_audio_file_names,
    'segment_audio_duration': segment_audio_durations,
    'department_name': department_names,
}
output_df = pd.DataFrame(segment_columns)

# Write the table to `output_csv` without the index column
output_df.to_csv(output_csv, index=False)

print(f"CSV file {output_csv} created successfully.")

CSV file audio_segments.csv created successfully.


In [9]:
# Number of segment rows that matched the pattern
len(output_df)

1273851

In [12]:
# Turn the per-file totals into a table ordered by file name, with each total
# rendered as text by convert_to_hms
df_total_durations = (
    pd.DataFrame(list(total_durations.items()), columns=['file_name', 'total_audio_segment_length'])
    .sort_values(by='file_name')
    .assign(
        total_audio_segment_length=lambda frame: frame['total_audio_segment_length'].apply(convert_to_hms)
    )
)

# Output CSV file
# NOTE(review): this rebinds `output_csv`, which earlier pointed at the
# segments CSV; re-running the segments cell afterwards would write to this path.
output_csv = 'audio_data/audio_durations_1.csv'

# Write the table without the index column
df_total_durations.to_csv(output_csv, index=False)

print(f"Total audio durations CSV file {output_csv} created successfully.")

Total audio durations CSV file total_audio_durations_1.csv created successfully.


In [21]:
# Load a per-file total-duration table.
# NOTE(review): this rebinds `df`, which earlier held the file listing, and the
# file read here is not the path written by the durations cell
# ('audio_data/audio_durations_1.csv') — confirm which file is intended.
df = pd.read_csv('total_audio_duration.csv')

In [22]:
# Preview the first rows of the loaded durations table
df.head()

Unnamed: 0,file_name,total_audio_segment_length
0,STT_AB00001,00:05:14.681
1,STT_AB00002,00:03:01.387
2,STT_AB00003,00:09:27.663
3,STT_AB00004,00:02:15.507
4,STT_AB00005,00:10:31.105


In [23]:
# Pattern for base file names: group 1 is "STT_" plus two word characters,
# group 2 is the non-empty remainder of the name.
# NOTE: this rebinds `pattern`, which earlier held the five-group segment-file
# regex; the segment-parsing cell reads groups 4 and 5, so re-running that cell
# after this one would fail.
pattern = re.compile(r"^(STT_\w{2})(.+?)$")

In [24]:
# Compiled once and kept private so the function does not depend on whichever
# regex the module-level `pattern` variable happens to hold when it is called.
_DEPARTMENT_PATTERN = re.compile(r"^(STT_\w{2})(.+?)$")


def extract_department_name(file_name):
    """Return the department prefix of a file name, or None.

    The prefix is "STT_" followed by two word characters (e.g. ``STT_CS``)
    and must be followed by at least one more character.

    Args:
        file_name: The file name to inspect. Non-string values (for example
            NaN from an empty CSV cell) are treated as "no department".

    Returns:
        The matched prefix, or None when the value is not a string or does
        not match.
    """
    # Missing cells reach .apply() as float NaN; a regex match on them would raise
    if not isinstance(file_name, str):
        return None
    match = _DEPARTMENT_PATTERN.match(file_name)
    return match.group(1) if match else None

In [25]:
# Derive the department of every row from its file name
df['dept'] = df['file_name'].map(extract_department_name)

In [26]:
# Preview the first rows, now including the derived `dept` column
df.head()

Unnamed: 0,file_name,total_audio_segment_length,dept
0,STT_AB00001,00:05:14.681,STT_AB
1,STT_AB00002,00:03:01.387,STT_AB
2,STT_AB00003,00:09:27.663,STT_AB
3,STT_AB00004,00:02:15.507,STT_AB
4,STT_AB00005,00:10:31.105,STT_AB


In [29]:
# Split the table on the `dept` column
# NOTE(review): rows whose `dept` is None are presumably left out by groupby's
# default handling of missing keys — confirm this is intended.
grouped = df.groupby('dept')

# Write one CSV per department into dept_files/ (the directory is not created here)
for dept_name, dept_rows in grouped:
    file_name = f"dept_files/{dept_name}.csv"
    dept_rows.to_csv(file_name, index=False)
    print(f"Created file: {file_name}")

Created file: dept_files/STT_AB.csv
Created file: dept_files/STT_CS.csv
Created file: dept_files/STT_HS.csv
Created file: dept_files/STT_MV.csv
Created file: dept_files/STT_NS.csv
Created file: dept_files/STT_NW.csv
Created file: dept_files/STT_PC.csv
Created file: dept_files/STT_TT.csv
