In [11]:
import pandas as pd
import librosa
import os

In [12]:
# Load the song metadata table; the file uses ';' as its field separator.
results_csv = './data/results_wav.csv'
df = pd.read_csv(results_csv, sep=';')

# Leave the preview as the cell's last expression so it renders as a table.
df.head()

Unnamed: 0,title,region,keyword,duration,url,path,wav_path
0,KICIR KICIR - Lagu dan Tari Nusantara - Lagu Anak,Jakarta,Kicir-Kicir asal Jakarta,3.54,https://www.youtube.com/watch?v=0wTJULU2REc&pp...,datasets/songs/kicir_kicir_lagu_dan_tari_nusan...,datasets/wav_songs/kicir_kicir_lagu_dan_tari_n...
1,senam kicir kicir lomba 2024,Jakarta,Kicir-Kicir asal Jakarta,3.35,https://www.youtube.com/watch?v=NzWTp-eX02g&pp...,datasets/songs/senam_kicir_kicir_lomba_2024.mp3,datasets/wav_songs/senam_kicir_kicir_lomba_202...
2,Kicir Kicir Jakarta,Jakarta,Kicir-Kicir asal Jakarta,3.52,https://www.youtube.com/watch?v=jMynoFKskhc&pp...,datasets/songs/kicir_kicir_jakarta.mp3,datasets/wav_songs/kicir_kicir_jakarta.wav
3,ONDEL ONDEL | Lagu Daerah DKI Jakarta - Betawi...,jakarta,Ondel Ondel asal jakarta,3.39,https://www.youtube.com/watch?v=wardyOl-EHo&pp...,datasets/songs/ondel_ondel_lagu_daerah_dki_jak...,datasets/wav_songs/ondel_ondel_lagu_daerah_dki...
4,ONDEL ONDEL 💞 LAGU DAERAH DKI JAKARTA | ONDEL ...,jakarta,Ondel Ondel asal jakarta,4.44,https://www.youtube.com/watch?v=_X39qcf41ZU&pp...,datasets/songs/ondel_ondel_lagu_daerah_dki_jak...,datasets/wav_songs/ondel_ondel_lagu_daerah_dki...


In [13]:
def get_duration(file_path):
    """
    Load an audio file with librosa and report its sample rate and duration.

    Args:
        file_path: Path of the audio file to inspect, or a missing value
            (None, NaN, pd.NA).

    Returns:
        A ``(sample_rate, duration)`` tuple with the duration in seconds, or
        ``(None, None)`` when ``file_path`` is missing.
    """
    # Empty cells in a pandas column reach this function as NaN, not None,
    # so test for every missing-value flavour before touching the file.
    if pd.isna(file_path):
        print('skipping file path is None')
        return None, None

    # NOTE: librosa.load is called without `sr`, so the audio is resampled to
    # librosa's default rate; the value reported below is that rate, not
    # necessarily the rate stored in the file.
    y, sr = librosa.load(file_path)
    duration = librosa.get_duration(y=y, sr=sr)
    print(f'sample rate: {sr}, duration: {duration}')
    return sr, duration

In [14]:
# Measure every WAV listed in the table; each result is a (sample_rate, duration) pair.
measured = df['wav_path'].map(get_duration)

# Transpose the sequence of pairs into one tuple per output column.
sample_rates, durations_sec = zip(*measured)
df['sample_rate'] = sample_rates
df['duration_sec'] = durations_sec


sample rate: 22050, duration: 233.89460317460316
sample rate: 22050, duration: 214.8774603174603
sample rate: 22050, duration: 231.29396825396825
sample rate: 22050, duration: 218.17469387755102
sample rate: 22050, duration: 283.39954648526077
sample rate: 22050, duration: 235.7986394557823
sample rate: 22050, duration: 233.08190476190475
sample rate: 22050, duration: 105.62757369614512


In [15]:
# Summary statistics for the numeric columns, shown as the cell output.
summary_stats = df.describe()
summary_stats

Unnamed: 0,duration,sample_rate,duration_sec
count,8.0,8.0,8.0
mean,3.35,22050.0,219.518549
std,0.837257,0.0,50.525268
min,1.46,22050.0,105.627574
25%,3.38,22050.0,217.350385
50%,3.53,22050.0,232.187937
75%,3.545,22050.0,234.370612
max,4.44,22050.0,283.399546


In [None]:
# `waveshow` lives in the librosa.display submodule; import it explicitly so
# the cell does not depend on librosa loading its submodules lazily.
import librosa.display
import matplotlib.pyplot as plt

# Draw one waveform figure per song listed in the table.
for title, song_path in zip(df['title'], df['wav_path']):
    # Rows without a WAV path cannot be plotted; skip them instead of
    # letting librosa.load fail on a missing value.
    if pd.isna(song_path):
        continue

    y, sr = librosa.load(song_path)

    # Use an explicit Figure/Axes pair rather than the implicit pyplot state.
    fig, ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=ax)
    ax.set_title(f"Waveform of {title}")
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude")
    plt.show()

    # Release the figure so a long table does not accumulate open figures.
    plt.close(fig)

In [17]:
import os
import re
from pydub import AudioSegment

def sanitize_filename(filename):
    """
    Return ``filename`` with every character that is invalid in a file or
    folder name deleted (nothing is substituted in its place).
    """
    # Character class of the symbols to strip: < > : " / \ | ? *
    forbidden_chars = r'[<>:"/\\|?*]'
    cleaned = re.sub(forbidden_chars, '', filename)
    return cleaned

def split_audio(wav_path, output_dir, segment_duration=30):
    """
    Split a WAV file into consecutive fixed-length segments and save each one.

    Only whole segments are written: trailing audio shorter than
    ``segment_duration`` is discarded. Segments are exported to ``output_dir``
    as ``<stem>segment<k>.wav`` with ``k`` starting at 1, where ``<stem>`` is
    the input file name without its extension.

    Args:
        wav_path: Path of the WAV file to split.
        output_dir: Directory that receives the segments; created if missing.
        segment_duration: Length of each segment in seconds. Must be positive.

    Raises:
        ValueError: If ``segment_duration`` is not greater than zero.
    """
    # Reject unusable lengths up front, before loading audio or creating
    # directories, instead of failing later with a ZeroDivisionError.
    if segment_duration <= 0:
        raise ValueError(
            f"segment_duration must be positive, got {segment_duration}"
        )

    audio = AudioSegment.from_wav(wav_path)

    # len(audio) is in milliseconds; convert to seconds.
    total_duration = len(audio) / 1000

    # Floor division: an incomplete final segment is intentionally dropped.
    num_segments = int(total_duration // segment_duration)

    os.makedirs(output_dir, exist_ok=True)

    # Strip only the trailing extension. str.replace('.wav', '') would also
    # remove a '.wav' that happens to appear in the middle of the name.
    stem = os.path.splitext(os.path.basename(wav_path))[0]

    for i in range(num_segments):
        # Slice boundaries are expressed in milliseconds.
        start_time = i * segment_duration * 1000
        end_time = (i + 1) * segment_duration * 1000
        segment = audio[start_time:end_time]

        segment_file = os.path.join(output_dir, f"{stem}segment{i + 1}.wav")
        segment.export(segment_file, format="wav")
        print(f"Saved segment {i + 1} at: {segment_file}")

In [None]:
input_folder = "datasets/wav_songs"
output_folder = "datasets/cut_wav_songs"

# Gather the names of the WAV files sitting directly in the input folder.
wav_files = []
for entry in os.listdir(input_folder):
    if entry.endswith('.wav'):
        wav_files.append(entry)

# Cut every song into 30-second pieces, one output sub-folder per song.
for wav_file in wav_files:
    wav_path = os.path.join(input_folder, wav_file)

    # The sub-folder is named after the file stem with invalid characters removed.
    file_stem = os.path.splitext(wav_file)[0]
    sanitized_title = sanitize_filename(file_stem)
    output_dir = os.path.join(output_folder, sanitized_title)

    split_audio(wav_path, output_dir, segment_duration=30)

datasets/wav_songs/senam_kicir_kicir_lomba_2024.wav
datasets/wav_songs/lagu_manuk_dadali.wav
datasets/wav_songs/kicir_kicir_jakarta.wav
datasets/wav_songs/lagu_daerah_manuk_dadali_dan_lirik_apa_ya_makna_lagu_daerah_kita.wav
datasets/wav_songs/ondel_ondel_lagu_daerah_dki_jakarta_betawi_budaya_indonesia_dongeng_kita.wav
datasets/wav_songs/aty_surya_manuk_dadali.wav
datasets/wav_songs/kicir_kicir_lagu_dan_tari_nusantara_lagu_anak.wav
datasets/wav_songs/ondel_ondel_lagu_daerah_dki_jakarta_ondel_ondel_betawi_remix.wav
