In [27]:
import numpy as np
import librosa as lr
import soundfile as sf
import pandas as pd

import os

In [28]:
# Root folder of the source recordings; each split lives in its own sub-folder.
DATASET_DIR = "../../extracted_audio"
# Root folder the fixed-length chunks are written to (one sub-folder per split).
DEST_DIR = "../../audio_chunks"
# Folder holding the per-split label CSVs; the chunk summary CSVs are saved here too.
DATAINFO_DIR = "../../dataset_info"

In [None]:
# Target length of every chunk, in seconds (7 minutes).
CHUNK_SIZE = 60 * 7
# The same length expressed in samples, at 22050 samples per second.
CHUNK_SAMPLES = 22050 * CHUNK_SIZE

def _split_into_chunks(audio, chunk_samples, min_tail_samples):
    """Cut a 1-D signal into equal-length chunks of ``chunk_samples`` samples.

    Only the final chunk can come out short. A short tail of
    ``min_tail_samples`` samples or fewer is dropped; a longer one is
    zero-padded on the right up to ``chunk_samples``.
    """
    chunks = []
    for start_idx in range(0, audio.shape[0], chunk_samples):
        chunk = audio[start_idx:start_idx + chunk_samples]
        missing = chunk_samples - chunk.shape[0]
        if missing > 0:
            # Short tail: too little signal to be worth keeping -> drop it.
            if chunk.shape[0] <= min_tail_samples:
                continue
            chunk = np.pad(chunk, (0, missing), "constant")
        chunks.append(chunk)
    return chunks


def chunk_audio(split_type, chunk_seconds=None, min_tail_seconds=3 * 60):
    """Split every ``.wav`` file of a split into fixed-length chunks on disk.

    Reads ``{DATASET_DIR}/{split_type}/*.wav`` and writes the chunks to
    ``{DEST_DIR}/{split_type}/<stem>_<i>.wav`` at the file's own sample rate.
    Labels come from ``{DATAINFO_DIR}/{split_type}_split_new.csv``.

    Chunk lengths are derived from each file's actual sample rate (the audio
    is loaded with ``sr=None``), so a chunk always lasts ``chunk_seconds``
    regardless of the rate the recording was stored at.

    Parameters
    ----------
    split_type : str
        Name of the split sub-folder / CSV prefix (e.g. ``"train"``).
    chunk_seconds : float, optional
        Chunk duration in seconds. Defaults to the module-level ``CHUNK_SIZE``.
    min_tail_seconds : float, optional
        A final partial chunk of this duration or less is dropped; a longer
        one is zero-padded to a full chunk. Defaults to 3 minutes.

    Returns
    -------
    list of dict
        One ``{"file", "num_chunks", "label"}`` entry per processed file,
        where ``num_chunks`` is the number of chunk files actually written.

    Raises
    ------
    IndexError
        If a ``.wav`` file has no matching row in the label CSV.
    ValueError
        If the chunk duration resolves to zero samples.
    """
    if chunk_seconds is None:
        chunk_seconds = CHUNK_SIZE

    src_dir = f"{DATASET_DIR}/{split_type}"
    dest_dir = f"{DEST_DIR}/{split_type}"
    os.makedirs(dest_dir, exist_ok=True)

    # Per-track labels for this split.
    df = pd.read_csv(f"{DATAINFO_DIR}/{split_type}_split_new.csv")

    stats = []
    for file in sorted(os.listdir(src_dir)):
        if not file.endswith(".wav"):
            continue

        # NOTE(review): the lookup compares Participant_ID to the full file
        # name (extension included) -- confirm this matches the CSV contents.
        labels = df.loc[df["Participant_ID"] == file, "PHQ_Binary"].values
        if len(labels) == 0:
            raise IndexError(
                f"no row with Participant_ID == {file!r} in the {split_type} label CSV"
            )
        label = labels[0]

        # sr=None keeps the native sample rate, so sizes must be computed from it.
        audio, sr = lr.load(f"{src_dir}/{file}", sr=None)
        chunk_samples = int(round(chunk_seconds * sr))
        if chunk_samples <= 0:
            raise ValueError(
                f"chunk_seconds={chunk_seconds!r} gives an empty chunk at sr={sr!r}"
            )
        min_tail_samples = int(round(min_tail_seconds * sr))

        chunks = _split_into_chunks(audio, chunk_samples, min_tail_samples)

        # Save chunks
        stem = file[:-4]  # strip the ".wav" extension
        for i, chunk in enumerate(chunks):
            sf.write(f"{dest_dir}/{stem}_{i}.wav", chunk, sr)

        # Count only chunks that were kept, so the stats match the files on disk.
        stats.append({"file": file, "num_chunks": len(chunks), "label": label})

    return stats

In [None]:
# Chunk every recording of the "train" split; `stats` holds one summary dict per file.
stats = chunk_audio("train")

# Persist the per-file summary in the dataset-info folder.
# Note: this rebinds the notebook-level names `stats` and `df`.
df = pd.DataFrame(stats)
df.to_csv(f"{DATAINFO_DIR}/train_chunks.csv", index=False)

In [53]:
# Chunk every recording of the "dev" split; `stats` holds one summary dict per file.
stats = chunk_audio("dev")

# Persist the per-file summary in the dataset-info folder.
# Note: this rebinds the notebook-level names `stats` and `df`.
df = pd.DataFrame(stats)
df.to_csv(f"{DATAINFO_DIR}/dev_chunks.csv", index=False)

In [54]:
# Chunk every recording of the "test" split; `stats` holds one summary dict per file.
stats = chunk_audio("test")

# Persist the per-file summary in the dataset-info folder.
# Note: this rebinds the notebook-level names `stats` and `df`.
df = pd.DataFrame(stats)
df.to_csv(f"{DATAINFO_DIR}/test_chunks.csv", index=False)

In [38]:
def audio_size(split_type):
    """Return the duration, in minutes, of every ``.wav`` file in a split.

    Prints the number of entries found in ``{DATASET_DIR}/{split_type}``
    (all entries, not only ``.wav`` files), then loads each ``.wav`` file at
    its native sample rate and converts its length to minutes using that
    file's own rate.

    Parameters
    ----------
    split_type : str
        Name of the split sub-folder (e.g. ``"train"``).

    Returns
    -------
    numpy.ndarray
        1-D float array with one duration per ``.wav`` file, ordered by file
        name. Empty when the folder contains no ``.wav`` files.
    """
    split_dir = f"{DATASET_DIR}/{split_type}"
    files = sorted(os.listdir(split_dir))
    print(len(files))

    minutes = []
    for file in files:
        if not file.endswith(".wav"):
            continue
        audio, sr = lr.load(f"{split_dir}/{file}", sr=None)
        # Convert with this file's own rate: samples -> seconds -> minutes.
        minutes.append(audio.shape[0] / sr / 60)

    return np.array(minutes, dtype=float)

In [39]:
# Per-recording durations (in minutes) for the "train" split; the call also prints the folder's entry count.
sizes = audio_size("train")

275


In [40]:
# Mean recording duration over the split, in minutes.
sizes.mean()

8.377490480052522