In [1]:
import io
import os 
import librosa
import soundfile as sf
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
tqdm.pandas()

def write_records_to_json(records, filename):
    """
    Serialize a records dictionary to a JSON file, overwriting any existing file.

    Args:
      records: A dictionary where keys are speaker IDs and values are file paths.
      filename: Path of the JSON file to create or overwrite.
    """
    with open(filename, 'w') as json_handle:
        json.dump(records, json_handle)

def read_records_from_json(filename):
    """
    Load a records dictionary from a JSON file.

    Args:
      filename: Path of the JSON file to parse.

    Returns:
      The parsed JSON content (a dictionary of records).
    """
    with open(filename, 'r') as json_handle:
        records = json.load(json_handle)
    return records

def increase_volume(input_file, output_file, gain_factor=2, sr=22050):
    """
    Amplify an audio file and save the result to a new file.

    Every sample is multiplied by ``gain_factor`` and then limited to the
    range [-1.0, 1.0] before being written out.

    Parameters:
    input_file (str): Path to the input audio file.
    output_file (str): Path to save the louder audio file. Missing parent
        directories are created; a bare file name (no directory part) is
        written to the current working directory.
    gain_factor (float): Factor by which to multiply the samples. Default is 2.
    sr (int): Sample rate passed to ``librosa.load``. Default is 22050.
        The rate returned by the loader is the one used for writing, so
        ``sr=None`` (native rate, per the librosa docs) also works.
    """
    # Load the audio; keep the sample rate reported by the loader so the
    # written file always matches the loaded samples.
    y, loaded_sr = librosa.load(input_file, sr=sr)

    # Amplify, then hard-limit to [-1.0, 1.0]. Samples pushed beyond that
    # range are flattened at the limit (i.e. they are clipped), which keeps
    # the data in range but can distort loud passages.
    y_louder = np.clip(y * gain_factor, -1.0, 1.0)

    # Create parent directories only when the path actually has a directory
    # component: os.makedirs("") raises FileNotFoundError.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save the louder audio
    sf.write(output_file, y_louder, loaded_sr)

# Mapping of speaker ID -> path of that speaker's CSV table.
speaker_metadata = read_records_from_json("metadata/speaker_metadata.json")

# Load every speaker table (first CSV column becomes the index) and keep the
# first rows of each in stat_df for quick inspection.
speakers_datasets = {}
stat_df = {}
for speaker_id, speaker_path in speaker_metadata.items():
    frame = pd.read_csv(speaker_path, index_col=0)
    speakers_datasets[speaker_id] = frame
    stat_df[speaker_id] = frame.head()

In [10]:
for speaker_id in speaker_metadata.keys():
    source_paths = speakers_datasets[speaker_id]["audio_filepath"]
    # Each amplified copy is written under "processed/" followed by the
    # original path of the audio file.
    source_paths.progress_apply(
        lambda src: increase_volume(src, "processed/{}".format(src))
    )

100%|██████████| 17426/17426 [29:08<00:00,  9.97it/s]
100%|██████████| 12921/12921 [18:15<00:00, 11.80it/s]
100%|██████████| 23696/23696 [31:41<00:00, 12.46it/s]
100%|██████████| 49250/49250 [1:05:40<00:00, 12.50it/s]
100%|██████████| 32903/32903 [41:33<00:00, 13.20it/s] 


In [20]:
# Sanity check: every listed audio file should have a counterpart under
# "processed/". Prints, per speaker, how many files are listed and how many
# of the processed copies exist on disk.
for speaker_id in speaker_metadata.keys():
    audio_filepath = "processed/" + speakers_datasets[speaker_id]["audio_filepath"]
    count_e = audio_filepath.progress_apply(os.path.exists).sum()
    # Separate the fields: the two counts used to be printed back to back
    # with no delimiter, making the output unreadable.
    print("{}: {} listed, {} found".format(speaker_id, audio_filepath.shape[0], count_e))

100%|██████████| 17426/17426 [00:42<00:00, 407.49it/s]


1742617426


100%|██████████| 12921/12921 [00:29<00:00, 430.90it/s]


1292112921


100%|██████████| 23696/23696 [00:53<00:00, 443.49it/s]


2369623696


100%|██████████| 49250/49250 [02:32<00:00, 323.34it/s]


4925049250


100%|██████████| 32903/32903 [01:17<00:00, 425.54it/s]

3290332903





In [25]:
def read_text(filepath):
    """Return the entire content of the text file at ``filepath`` as one string."""
    with open(filepath, "r") as text_handle:
        content = text_handle.read()
    return content
    

# Attach the transcript text of every row as a new "texts" column.
for speaker_id in speaker_metadata.keys():
    frame = speakers_datasets[speaker_id]
    texts = frame["txt_filepath"].progress_apply(read_text)
    frame["texts"] = texts

100%|██████████| 17426/17426 [00:00<00:00, 40689.82it/s]
100%|██████████| 12921/12921 [00:00<00:00, 40149.26it/s]
100%|██████████| 23696/23696 [00:00<00:00, 38922.89it/s]
100%|██████████| 49250/49250 [00:10<00:00, 4842.38it/s] 
100%|██████████| 32903/32903 [00:27<00:00, 1203.54it/s]


In [28]:
# Remove the helper columns from every speaker table (in place).
for speaker_id in speaker_metadata.keys():
    frame = speakers_datasets[speaker_id]
    for column in ("txt_filepath", "missing"):
        del frame[column]

In [33]:
# Write one manifest CSV per speaker, without the index column.
manifest_template = "metadata/{}_manifest.csv"
for speaker_id in speaker_metadata.keys():
    filename = manifest_template.format(speaker_id)
    speakers_datasets[speaker_id].to_csv(filename, index=False)

In [34]:
# Repoint each speaker entry at its manifest CSV path. Only existing keys
# are reassigned, so the mapping keeps its size while being iterated.
manifest_template = "metadata/{}_manifest.csv"
for speaker_id in speaker_metadata.keys():
    filename = manifest_template.format(speaker_id)
    speaker_metadata[speaker_id] = filename

In [15]:
# Count how many processed audio files exist for the speaker currently bound
# to `speaker_id` (the variable is whatever an earlier loop left behind).
# The call was truncated to `.su`, which raises AttributeError; the recorded
# output (a single count) matches `.sum()`.
("processed/" + speakers_datasets[speaker_id]["audio_filepath"]).progress_apply(lambda x: os.path.exists(x)).sum()

32903

In [3]:
speakers_datasets = {}
for speaker_id in speaker_metadata.Speaker_ID:
    wavs = ! find $speaker_id/ -type f -name *.wav
    speakers_datasets[speaker_id] = pd.Series(wavs)

In [4]:
import os
# For each speaker, compare the number of listed audio files with how many of
# them, and how many of their derived transcript paths, exist on disk.
for speaker_id in speaker_metadata.Speaker_ID:
    wav_paths = speakers_datasets[speaker_id]
    file_count = len(wav_paths)
    checked_exists_audio = wav_paths.apply(os.path.exists).sum()
    # Transcript path = audio path with ".wav" -> ".txt" and "Audio" -> "Transcripts".
    txt_paths = wav_paths.apply(lambda x: x.replace(".wav", ".txt").replace("Audio", "Transcripts"))
    checked_exists_text = txt_paths.apply(os.path.exists).sum()
    print("checked {} {} {} {}".format(file_count, checked_exists_audio, checked_exists_text, speaker_id))
    all_present = file_count == checked_exists_audio == checked_exists_text
    if not all_present:
        print("finded missing files {}".format(speaker_id))


checked 17426 17426 17426 F1
checked 12946 12946 12921 F2
finded missing files F2
checked 23696 23696 23696 F3
checked 49250 49250 49250 M1
checked 32903 32903 32903 M2


In [5]:
def _transcript_path(wav_path):
    """Derive the transcript path: ".wav" -> ".txt" and "Audio" -> "Transcripts"."""
    return wav_path.replace(".wav", ".txt").replace("Audio", "Transcripts")


# Turn each speaker's list of audio paths into a two-column table.
for speaker_id in speaker_metadata.Speaker_ID:
    df = pd.DataFrame(speakers_datasets[speaker_id], columns=["audio_filepath"])
    df["txt_filepath"] = df["audio_filepath"].apply(_transcript_path)
    speakers_datasets[speaker_id] = df

In [6]:
def _pair_on_disk(row):
    """True when both the audio file and the transcript file of a row exist."""
    return os.path.exists(row.audio_filepath) and os.path.exists(row.txt_filepath)


# Flag rows where at least one of the two files is absent.
for speaker_id in speaker_metadata.Speaker_ID:
    frame = speakers_datasets[speaker_id]
    both_present = frame.apply(_pair_on_disk, axis=1)
    frame["missing"] = ~both_present

In [9]:
# Print the number of rows flagged as missing, one line per speaker.
for speaker_id in speaker_metadata.Speaker_ID:
    missing_count = speakers_datasets[speaker_id]["missing"].sum()
    print(missing_count)
    

0
0
0
0
0


In [10]:
# Display the F1 speaker table (audio path, transcript path, missing flag)
# as a visual sanity check of the frames built above.
speakers_datasets['F1']

Unnamed: 0,audio_filepath,txt_filepath,missing
0,F1/Audio/inform_3356_11.wav,F1/Transcripts/inform_3356_11.txt,False
1,F1/Audio/inform_2803_08.wav,F1/Transcripts/inform_2803_08.txt,False
2,F1/Audio/inform_1163_032.wav,F1/Transcripts/inform_1163_032.txt,False
3,F1/Audio/inform_1960_022.wav,F1/Transcripts/inform_1960_022.txt,False
4,F1/Audio/inform_2022_07.wav,F1/Transcripts/inform_2022_07.txt,False
...,...,...,...
17421,F1/Audio/inform_805_04.wav,F1/Transcripts/inform_805_04.txt,False
17422,F1/Audio/inform_3407_12.wav,F1/Transcripts/inform_3407_12.txt,False
17423,F1/Audio/inform_800_02.wav,F1/Transcripts/inform_800_02.txt,False
17424,F1/Audio/inform_1129_01.wav,F1/Transcripts/inform_1129_01.txt,False


In [11]:
# Print the (rows, columns) shape of every speaker table.
for speaker_id in speaker_metadata.Speaker_ID:
    frame_shape = speakers_datasets[speaker_id].shape
    print(speaker_id, frame_shape)

F1 (17426, 3)
F2 (12921, 3)
F3 (23696, 3)
M1 (49250, 3)
M2 (32903, 3)


In [13]:
# Save the table of missing files for speaker F2.
# NOTE(review): `df_missing` is not defined in any cell visible in this
# notebook — presumably it held the F2 rows whose "missing" flag was True.
# TODO: restore the cell that builds it so this survives Restart & Run All.
df_missing.to_csv("metadata/F2_missing_files.csv")

In [14]:
# Keep the first rows of each speaker table for quick inspection.
stat_df = {
    speaker_id: speakers_datasets[speaker_id].head()
    for speaker_id in speaker_metadata.Speaker_ID
}