In [2]:
from pydub import AudioSegment
from pydub.playback import play
import re
from pathlib import Path
import typing as T
import numpy as np
import librosa as lr
import soundfile as sf
import jsonlines 

In [5]:

def split_audio(
    segment: "AudioSegment",
    output_dir: str,
    duration_ms: int = 5120,
    extension: str = "wav",
    min_length_ms: int = 1000
) -> T.List[T.Dict[str, T.Any]]:
    """
    Slice an audio segment into consecutive clips and export them to disk.

    The segment is cut every ``duration_ms`` milliseconds starting at 0. Only
    the final clip can be shorter than ``duration_ms``; it is dropped when it
    is shorter than ``min_length_ms``. Zero-length clips are never written.

    Clips are named ``clip_<n>.<extension>``, where ``n`` continues from the
    number of ``*.<extension>`` files already present in ``output_dir``, so
    repeated calls into the same directory do not overwrite each other as
    long as the directory only contains clips written by this function.

    Args:
        segment: audio to slice; must support ``len()`` (milliseconds),
            slicing by milliseconds and ``export(path, format=...)``.
        output_dir: directory receiving the clips; created if missing.
        duration_ms: nominal clip length in milliseconds; must be positive.
        extension: file extension, also passed to ``export`` as the format.
        min_length_ms: clips shorter than this are skipped.

    Returns:
        One dict per exported clip with keys ``start_ms`` and ``end_ms``
        (positions relative to the start of ``segment``) and ``file_name``
        (path of the exported file as a string).

    Raises:
        ValueError: if ``duration_ms`` is not positive.
    """
    if duration_ms <= 0:
        raise ValueError(f"duration_ms must be positive, got {duration_ms}")

    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    # Continue numbering after the clips already present in the directory.
    num_clips_existing = sum(1 for _ in output_dir_path.glob(f'*.{extension}'))

    # len() of a pydub segment is its duration in whole milliseconds.
    segment_duration_ms = len(segment)

    meta_data_list = []
    # range() stops before segment_duration_ms, so a segment whose length is an
    # exact multiple of duration_ms does not produce an empty trailing clip.
    for clip_start_ms in range(0, segment_duration_ms, duration_ms):
        clip_end_ms = min(clip_start_ms + duration_ms, segment_duration_ms)
        act_duration_ms = clip_end_ms - clip_start_ms
        if act_duration_ms < min_length_ms:
            continue

        clip = segment[clip_start_ms:clip_end_ms]

        clip_index = num_clips_existing + len(meta_data_list)
        clip_path = output_dir_path / f"clip_{clip_index}.{extension}"
        clip.export(clip_path, format=extension)

        meta_data_list.append({
            'start_ms': clip_start_ms,
            'end_ms': clip_end_ms,
            'file_name': str(clip_path),
        })
    return meta_data_list
    

def load(path: str):
    """Read the audio file at ``path`` and return it converted to a single channel."""
    mono_segment = AudioSegment.from_file(path).set_channels(1)
    return mono_segment

def update_dict_list(list_dicts: T.List[T.Dict[str, str]], update_dict: T.Dict[str, str]):
    """
    Copy every key/value pair of ``update_dict`` into each dict of ``list_dicts``.

    The dicts are modified in place (existing keys are overwritten) and the
    same list object is returned for convenience.
    """
    for entry in list_dicts:
        for key, value in update_dict.items():
            entry[key] = value
    return list_dicts

def write_metadata(mdata: T.List[T.Dict[str, T.Any]], output_dir: str) -> Path:
    """
    Write the metadata records to ``<output_dir>/meta_data.jsonl``.

    Each dict in ``mdata`` becomes one JSON line. An existing file is
    overwritten. ``output_dir`` is created if it does not exist yet, so the
    call also succeeds when no clip has been exported.

    Returns:
        Path of the written JSON Lines file.
    """
    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    mdata_path = output_dir_path / 'meta_data.jsonl'
    with jsonlines.open(mdata_path, 'w') as writer:
        writer.write_all(mdata)
    return mdata_path



This script will create the audio data set and write out the metadata. 

For each file we: 
1. split into segments of between 1000 and 5120 ms. 
2. offset the start of the file by 1000, 2000 and 3000 ms and repeat the same splitting (only for files long enough to still yield a clip after the largest offset). 
3. change the pitch by ...

In [10]:
# Source recordings. Sorted so that clip numbering is reproducible between runs
# (rglob yields files in filesystem order).
files = sorted(Path('./processed/').rglob('*.wav'))

# Clip length bounds, in milliseconds.
out_max_duration_ms = 5120
out_min_duration_ms = 1000

# Data augmentation: amounts (ms) trimmed from the start of each file before re-splitting.
offsets_ms = np.array([1, 2, 3])*1000

out_dir = './audio'
# Empty the output directory so clip numbering restarts at 0 on every run.
# Only files are removed: Path.unlink() raises on a directory.
for path in Path(out_dir).glob('*'):
    if path.is_file() or path.is_symlink():
        path.unlink()

all_metadata = []

for file in files:
    # Metadata shared by every clip cut from this file; the parent folder name is used as the style label.
    mdata = {'source_file': str(file), 'prompt': f'Solo Guzheng in the {file.parent.stem} style', 'augmentation': None, 'augmentation_param': None, 'augmentation_value': None}
    segment = load(file)

    # Un-augmented clips.
    clips_mdata = split_audio(segment, output_dir=out_dir, duration_ms=out_max_duration_ms, min_length_ms=out_min_duration_ms)
    clips_mdata = update_dict_list(clips_mdata, mdata)
    all_metadata.extend(clips_mdata)

    # Offset augmentation, only when the file still yields at least one clip
    # of the minimum length after the largest offset.
    if len(segment) > out_min_duration_ms + max(offsets_ms):
        for offset in offsets_ms:
            offset = int(offset)  # plain int: numpy integers are not JSON-serialisable
            # Separate dict per offset instead of mutating the shared `mdata`.
            offset_mdata = {**mdata, 'augmentation': 'offset', 'augmentation_param': 'offset_ms', 'augmentation_value': offset}
            segment_offset = segment[offset:]

            # NOTE: start_ms/end_ms of these clips are relative to the offset
            # segment, not to the start of the source file.
            clips_mdata = split_audio(segment_offset, output_dir=out_dir, duration_ms=out_max_duration_ms, min_length_ms=out_min_duration_ms)
            clips_mdata = update_dict_list(clips_mdata, offset_mdata)
            all_metadata.extend(clips_mdata)

write_metadata(all_metadata, output_dir=out_dir)


PosixPath('audio/meta_data.jsonl')

In [44]:
# Inspect the metadata of the most recently split segment.
# NOTE(review): the saved output below looks stale — it lists a 199 ms clip
# (shorter than the 1000 ms minimum) and has no 'augmentation' keys; re-run
# after "Restart & Run All" or drop this cell.
clips_mdata

[{'start_ms': 0,
  'end_ms': 5120,
  'file_name': 'audio/clip_0.wav',
  'source_file': 'processed/Alienated/alienated 2 靖沐第一次錄音.wav',
  'prompt': 'Solo Guzheng in the Alienated style'},
 {'start_ms': 5120,
  'end_ms': 10240,
  'file_name': 'audio/clip_1.wav',
  'source_file': 'processed/Alienated/alienated 2 靖沐第一次錄音.wav',
  'prompt': 'Solo Guzheng in the Alienated style'},
 {'start_ms': 10240,
  'end_ms': 10439,
  'file_name': 'audio/clip_2.wav',
  'source_file': 'processed/Alienated/alienated 2 靖沐第一次錄音.wav',
  'prompt': 'Solo Guzheng in the Alienated style'}]

In [30]:
# Scratch experiment on dict.update semantics.
# NOTE(review): `x` and `y` are not defined in any visible cell (leftover
# kernel state), so this fails on a fresh kernel — define them here or remove
# these scratch cells.
x.update(y)

In [31]:
x

{'1': 1, '2': 2, '3': 3, '4': 4}

In [32]:
# Change `y` after it was merged into `x`: the outputs that follow show `x`
# keeps the old value, i.e. dict.update copied the value rather than linking to `y`.
y['3'] = 'three'

In [33]:
x

{'1': 1, '2': 2, '3': 3, '4': 4}

In [34]:
y

{'3': 'three', '4': 4}