In [2]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.effects import normalize
from vscode_audio import Audio
from IPython.display import Audio as ipAudio
import matplotlib.pyplot as plt
import noisereduce as nr
import librosa as lr
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
RATE = 16000
TOP_DB = 25

In [4]:
def audiosegment_to_librosawav(audiosegment):    
    channel_sounds = audiosegment.split_to_mono()
    samples = [s.get_array_of_samples() for s in channel_sounds]

    fp_arr = np.array(samples).T.astype(np.float32)
    fp_arr /= np.iinfo(samples[0].typecode).max
    fp_arr = fp_arr.reshape(-1)

    return fp_arr


def librosa_to_audiosegment(filename):
    y, sr = lr.load(filename, sr=RATE)
    y, _ = lr.effects.trim(y, top_db=TOP_DB) # trim leading and trailing silence
    y = nr.reduce_noise(y, sr=sr) # noise reduction
    # convert from float to uint16
    y = np.array(y * (1<<15), dtype=np.int16)
    audio_segment = AudioSegment(
        y.tobytes(), 
        frame_rate=sr,
        sample_width=y.dtype.itemsize, 
        channels=1
    )
    return audio_segment

In [5]:
Ravdess_Path='../Datasets/Ravdess/'
ravdess=[]
for directory in os.listdir(Ravdess_Path):
    actors=os.listdir(os.path.join(Ravdess_Path,directory))
    for wav in actors:
        emotion=wav.partition('.wav')[0].split('-')
        emotion_number=int(emotion[2])
        ravdess.append((emotion_number,os.path.join(Ravdess_Path,directory,wav)))
Ravdess_df=pd.DataFrame.from_dict(ravdess)
Ravdess_df.rename(columns={0:'emotion',1:'sound'},inplace=True)
Ravdess_df['emotion'].replace({1:'neutral', 2:'neutral', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'},inplace=True)
Ravdess_df.head()

Unnamed: 0,emotion,sound
0,neutral,../Datasets/Ravdess/Actor_08/03-01-02-01-01-01...
1,sad,../Datasets/Ravdess/Actor_08/03-01-04-02-02-02...
2,neutral,../Datasets/Ravdess/Actor_08/03-01-02-01-02-01...
3,happy,../Datasets/Ravdess/Actor_08/03-01-03-01-02-01...
4,fear,../Datasets/Ravdess/Actor_08/03-01-06-01-02-02...


In [5]:
# # ds = 'all_en'
# # feat_ex_technique = 'formants'

# # Defines ratios, w.r.t. whole dataset.
# ratio_train = 0.7
# ratio_val = 0.15
# ratio_test = 0.15

# def get_splits(d):
#     # Produces test split.
#     remaining, test = train_test_split(
#         d, test_size=ratio_test, stratify=d['emotion'])

#     # Adjusts val ratio, w.r.t. remaining dataset.
#     ratio_remaining = 1 - ratio_test
#     ratio_val_adjusted = ratio_val / ratio_remaining

#     # Produces train and val splits.
#     train, val = train_test_split(
#         remaining, test_size=ratio_val_adjusted, stratify=remaining['emotion'])
        
#     return train, val, test

In [6]:
# train, val, test = get_splits(Ravdess_df)
# train.shape, val.shape, test.shape

((1008, 2), (216, 2), (216, 2))

In [6]:
output_folder = '../Datasets/custom/'

def save_as_chunks(df: pd.DataFrame):
    for i, row in tqdm(df.iterrows(), total=len(df)):
        og = librosa_to_audiosegment(row['sound'])
        norm = normalize(og, headroom=5.0)
        chunks = split_on_silence(norm, min_silence_len=20, silence_thresh=norm.dBFS-16)
        emo = row['emotion']
        
        for j, chunk in enumerate(chunks):
            chunk.export(f'{output_folder}{emo}_{i}_{j}.wav', format="wav")

In [7]:
save_as_chunks(Ravdess_df)

100%|██████████| 1440/1440 [03:05<00:00,  7.77it/s]


100%|██████████| 216/216 [00:36<00:00,  5.95it/s]
100%|██████████| 216/216 [00:33<00:00,  6.36it/s]
