In [None]:
# Extract the words, filtering by the 587 words from the Kell et al. paper.
# This generates only around 7,000 clips.
import subprocess
import os
import pandas as pd
from pydub import AudioSegment

def load_ids_and_words(csv_file):
    """
    Read a header-less CSV and split it into file IDs and target words.

    :param csv_file: Path to a CSV file that has no header row.
    :return: A tuple ``(file_ids, target_words)``. ``file_ids`` is a list of the
             distinct values found from the third column onward, with missing
             cells dropped; ``target_words`` is the first column as a list
             (one entry per row).
    """
    table = pd.read_csv(csv_file, header=None)

    # Flatten every cell from the third column onward into one 1-D array.
    id_cells = table.iloc[:, 2:].values.ravel('K')

    file_ids = []
    for candidate in pd.unique(id_cells):
        # Empty cells are read as NaN; they are recognised by their string form.
        if str(candidate) != 'nan':
            file_ids.append(candidate)

    target_words = table.iloc[:, 0].tolist()
    return file_ids, target_words

def find_audio_files(root_path, file_ids):
    """
    Perform an exhaustive search for all occurrences of specified file IDs.

    A ``.WAV`` file is reported only when a ``.WRD`` file with the same ID
    exists in the same directory.

    :param root_path: The root directory that is walked recursively.
    :param file_ids: An iterable of file IDs to search for. It is consumed exactly
                     once, so lists, sets and one-shot iterators all work.
    :return: A dictionary mapping each file ID that was found to the list of
             matching ``.WAV`` paths. IDs without any match are omitted.
    """
    # Materialise the IDs once as dictionary keys. The dictionary then serves
    # both as the result container and as an O(1) membership test, instead of
    # scanning the caller's sequence for every file on disk.
    found_files = {file_id: [] for file_id in file_ids}

    # Walk through all subdirectories below root_path
    for root, _, files in os.walk(root_path):
        file_set = set(files)  # Fast lookup of sibling files in this directory

        for file in files:
            # Cheap extension check first; only .WAV files are candidates
            if not file.endswith('.WAV'):
                continue

            file_id = file.split('.')[0]  # Text before the first dot
            if file_id not in found_files:
                continue

            # Keep the audio file only if its .WRD companion sits next to it
            if file_id + '.WRD' in file_set:
                found_files[file_id].append(os.path.join(root, file))

    # Remove entries that were not found at all
    return {key: val for key, val in found_files.items() if val}



def extract_audio_clip(wrd_file, wav_file, target_word, output_folder,
                       sample_rate=16000, padding=0.1):
    """
    Cut the first occurrence of a word out of a recording using SoX.

    The word boundaries are read from ``wrd_file``, whose lines are expected to
    look like ``<start_sample> <end_sample> <word>``. The clip is widened by
    ``padding`` seconds on both sides, clamped to the start and end of the audio.

    :param wrd_file: Path to the word-alignment file.
    :param wav_file: Path to the audio file the alignment refers to.
    :param target_word: Word to look for (exact match against the third field).
    :param output_folder: Directory that receives the extracted clip.
    :param sample_rate: Sample rate in Hz used to convert sample offsets to seconds.
    :param padding: Seconds of context added before and after the word.
    :return: Path of the written clip, or ``None`` when the word is absent, the
             alignment file cannot be read or parsed, the audio duration cannot
             be determined, or SoX fails.
    """
    # Locate the first line whose third field is the target word.
    try:
        with open(wrd_file, 'r', encoding='latin1') as file:
            for line in file:
                parts = line.strip().split()
                if len(parts) >= 3 and parts[2] == target_word:
                    start_sample = int(parts[0])
                    end_sample = int(parts[1])
                    break
            else:
                return None  # Target word not found
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file; ValueError: non-numeric sample offsets.
        # (latin1 decodes every byte, so a decoding error cannot occur here.)
        print(f"Failed to read WRD file {wrd_file}. Error: {e}")
        return None

    # Convert sample points to seconds and pad the start, never going below 0
    start_sec = max(0, start_sample / sample_rate - padding)

    # Fetch the total duration of the audio file to ensure end_sec does not exceed it
    try:
        result = subprocess.run(['sox', '--i', '-D', wav_file], text=True, capture_output=True, check=True)
        total_duration_sec = float(result.stdout.strip())
    except (subprocess.CalledProcessError, OSError, ValueError) as e:
        # OSError covers a missing sox executable; ValueError covers unparsable output
        print(f"Failed to get duration of audio file. Error: {e}")
        return None

    end_sec = min(end_sample / sample_rate + padding, total_duration_sec)
    duration_sec = end_sec - start_sec
    if duration_sec <= 0:
        # The alignment points past the end of the audio; nothing to cut
        print(f"Failed to extract audio segment. Error: empty clip for '{target_word}' in {wav_file}")
        return None

    # Construct the output file path: <word>_<parent folder>_<audio file name>
    folder_name = os.path.basename(os.path.dirname(wav_file))
    output_filename = f"{target_word}_{folder_name}_{os.path.basename(wav_file)}"
    output_path = os.path.join(output_folder, output_filename)

    # Build sox command to trim the audio (trim takes a start and a length)
    command = [
        'sox', wav_file, output_path, 'trim', str(start_sec), str(duration_sec)
    ]

    # Execute sox command
    try:
        subprocess.run(command, check=True)
        print(f"Successfully extracted {output_path}")
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Failed to extract audio segment. Error: {e}")
        return None

    return output_path

# Parameters
csv_file_path = 'D:\\DNN\\Training_data\\TIMIT\\high_freq_words.csv'
base_path = 'D:\\DNN\\Training_data\\TIMIT\\raw\\timit\\data'  # Updated base path
output_folder = 'D:\\DNN\\Training_data\\TIMIT\\Extracted_Clips_new'

# Load file IDs and target words from CSV
file_ids, target_words = load_ids_and_words(csv_file_path)

# Ensure the output directory exists
os.makedirs(output_folder, exist_ok=True)

# Find and process files
found_files = find_audio_files(base_path, file_ids)
print(found_files)

for file_id, paths in found_files.items():
    for path in paths:
        # Swap only the extension. str.replace('.WAV', '.WRD') would rewrite
        # every occurrence of '.WAV' in the path, including directory names.
        wrd_file = os.path.splitext(path)[0] + '.WRD'
        print(wrd_file)
        if not os.path.exists(wrd_file):
            continue

        # Try every target word against this recording; words that do not
        # occur in it make extract_audio_clip return None and are skipped.
        for target_word in target_words:
            extracted_clip_path = extract_audio_clip(wrd_file, path, target_word, output_folder)
            if extracted_clip_path:
                print(f"Extracted clip saved as: {extracted_clip_path}")


In [None]:
import os
import pandas as pd
import subprocess

# Define paths
root_dir = "D:\\DNN\\Training_data\\TIMIT\\raw\\timit\\data"  # Change this to your actual data folder
output_audio_dir = "D:\\DNN\\Training_data\\TIMIT\\Extracted_Clips_all_words"  # Folder for extracted clips
os.makedirs(output_audio_dir, exist_ok=True)

# Dictionary to count word occurrences
word_counts = {}

# Loop through all WRD files in the directory recursively
for root, _, files in os.walk(root_dir):
    for file in files:
        if not file.endswith('.WRD'):
            continue

        wrd_file = os.path.join(root, file)
        # Swap only the extension; str.replace would also touch directory names
        wav_file = os.path.splitext(wrd_file)[0] + '.WAV'
        if not os.path.exists(wav_file):
            continue

        # Fetch the total duration once per recording (it does not change from
        # word to word) so clips can be clamped to the end of the audio.
        try:
            result = subprocess.run(['sox', '--i', '-D', wav_file], text=True, capture_output=True, check=True)
            total_duration_sec = float(result.stdout.strip())
        except (subprocess.CalledProcessError, ValueError):
            print(f"Error retrieving duration for {wav_file}")
            total_duration_sec = None  # Words are still counted, but not extracted

        # Pieces of the output name that are the same for every word in this file
        folder_name = os.path.basename(os.path.dirname(wav_file))
        wav_name = os.path.basename(wav_file)

        with open(wrd_file, "r", encoding="latin1") as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) != 3:
                    continue
                start_sample, end_sample, word = int(parts[0]), int(parts[1]), parts[2]

                # Update word count
                word_counts[word] = word_counts.get(word, 0) + 1

                if total_duration_sec is None:
                    continue

                # Convert sample points to seconds (assuming 16000 Hz sample rate)
                start_sec = max(0, start_sample / 16000 - 0.05)  # 50ms padding before
                end_sec = min(end_sample / 16000 + 0.05, total_duration_sec)  # 50ms padding after, clamped

                # Construct the output file path
                # NOTE(review): a word that occurs more than once in the same
                # recording maps to the same output path, so presumably only the
                # last clip survives while word_counts counts every occurrence —
                # confirm this is intended.
                output_filename = f"{word}_{folder_name}_{wav_name}"
                output_path = os.path.join(output_audio_dir, output_filename)

                # Extract the audio clip using SoX (trim takes a start and a length)
                command = ['sox', wav_file, output_path, 'trim', str(start_sec), str(end_sec - start_sec)]
                try:
                    subprocess.run(command, check=True)
                    print(f"Extracted: {output_path}")
                except subprocess.CalledProcessError:
                    print(f"Failed to extract audio segment for {word} in {wav_file}")

# Convert word frequency counts to a DataFrame and save as CSV (written into root_dir)
word_count_df = pd.DataFrame(list(word_counts.items()), columns=["Word", "Count"])
word_count_df.to_csv(os.path.join(root_dir, "word_frequencies.csv"), index=False)

print("Processing complete. Extracted all words with 50ms padding and saved word frequencies.")


In [None]:
# This script can loop through all of the TIMIT dataset and generate the clean word excerpts.
# This can create more than 33,000 clips.
import os
import pandas as pd
import shutil

# Define directories
source_dir = r"D:\DNN\Training_data\TIMIT\Extracted_Clips_all_words"
destination_dir = r"D:\DNN\Training_data\TIMIT\Extracted_Clips_training"

# Ensure the destination directory exists
os.makedirs(destination_dir, exist_ok=True)

# Load the word frequencies CSV
# NOTE(review): the extraction cell in this notebook writes word_frequencies.csv
# into its root_dir (...\raw\timit\data), not into this folder — confirm the
# file has been copied here or update the path.
csv_path = r"D:\DNN\Training_data\TIMIT\word_frequencies.csv"  # Update if needed
# keep_default_na=False: by default pandas reads words such as "null" or "nan"
# as missing values, which would silently drop them from the selection.
df = pd.read_csv(csv_path, keep_default_na=False)

# Extract words that are **4 or more characters long**
df_filtered = df[df["Word"].str.len() >= 4]  # Changed to `>= 4`
wanted_words = set(df_filtered["Word"])

# Move corresponding files
# The directory is listed once. A file is selected when the text in front of
# one of its underscores is exactly a wanted word, which is the same condition
# as file.startswith(word + "_") for some wanted word, without re-listing the
# directory for every word.
moved_files = []
for file in os.listdir(source_dir):
    underscore_positions = (i for i, ch in enumerate(file) if ch == "_")
    if any(file[:i] in wanted_words for i in underscore_positions):
        src_path = os.path.join(source_dir, file)
        dst_path = os.path.join(destination_dir, file)
        shutil.move(src_path, dst_path)
        moved_files.append(file)

print(f"Moved {len(moved_files)} files to {destination_dir}")

