In [1]:
import os
import random
import time

import librosa
import numpy as np
import soundfile as sf
from IPython.display import Audio
from pydub import AudioSegment
#from pydub.playback import play
#import sounddevice as sd

## Check device index

In [2]:
import pyaudio

# Print the index and name of every audio device PyAudio reports, so that a
# suitable input device index can be chosen for the recording configuration.
p = pyaudio.PyAudio()

try:
    for i in range(p.get_device_count()):
        device_info = p.get_device_info_by_index(i)
        print(f"Device index: {i}, Device name: {device_info['name']}")
finally:
    # Always release the PyAudio instance, even if a device query fails.
    p.terminate()

Device index: 0, Device name: Microsoft Sound Mapper - Input
Device index: 1, Device name: Microphone (High Definition Aud
Device index: 2, Device name: Microsoft Sound Mapper - Output
Device index: 3, Device name: M237WDP (NVIDIA High Definition
Device index: 4, Device name: BenQ GW2470 (NVIDIA High Defini
Device index: 5, Device name: Primary Sound Capture Driver
Device index: 6, Device name: Microphone (High Definition Audio Device)
Device index: 7, Device name: Primary Sound Driver
Device index: 8, Device name: M237WDP (NVIDIA High Definition Audio)
Device index: 9, Device name: BenQ GW2470 (NVIDIA High Definition Audio)
Device index: 10, Device name: BenQ GW2470 (NVIDIA High Definition Audio)
Device index: 11, Device name: M237WDP (NVIDIA High Definition Audio)
Device index: 12, Device name: Microphone (High Definition Audio Device)
Device index: 13, Device name: Microphone (HD Audio Microphone)
Device index: 14, Device name: Output (NVIDIA High Definition Audio)
Device index: 15,

# Recording audio dataset
## Definitions

In [None]:
# Sample rate of the recordings in Hz (44100 Hz is a common default; 16000 Hz is used here)
SAMPLE_RATE = 16000
# Duration of each recording in seconds
DURATION = 1
# Classes to record
CLASSES = ["void", "on", "off", "scene1", "scene2", "scene3", "ambient", "ambient_theme", "light_on"]
# Directory that receives the recorded samples (created if it does not exist yet)
SAMPLES_DIR = "audio_samples"
os.makedirs(SAMPLES_DIR, exist_ok=True)

# Number of recordings taken per class
SAMPLE_COUNT = 20
# Index of the input device (see the output of the "Check device index" cell)
DEVICE_INDEX = 1
# Number of input channels
CHANNELS = 1
# Frames requested from the input stream per read
CHUNK = 1024
# Sample format of the stream; the recording code decodes the bytes as int16
FORMAT = pyaudio.paInt16

## A) Function - record and collect samples
- Collect all in one folder
- Samples are named "class_name_1.wav", "class_name_2.wav" ...

In [None]:
# Audio data collection function
# All in one Folder:
def record_audio(p, SAMPLE_RATE, FORMAT, CHANNELS, CHUNK, DEVICE_INDEX, DURATION, CLASSES):
    """Record ``SAMPLE_COUNT`` clips per class and save them all into ``SAMPLES_DIR``.

    Files are written as ``<class>_<sample index>.wav``. ``SAMPLE_COUNT`` and
    ``SAMPLES_DIR`` are read from the notebook's global configuration.

    Args:
        p: PyAudio instance used to open the input stream. It is terminated
            before this function returns (also on error), so it cannot be
            reused afterwards.
        SAMPLE_RATE: Sampling rate in Hz, used for capture and for the WAV file.
        FORMAT: PyAudio sample format. The raw bytes are decoded as int16, so
            this has to be ``pyaudio.paInt16``.
        CHANNELS: Number of input channels.
        CHUNK: Maximum number of frames requested per ``stream.read`` call.
        DEVICE_INDEX: Index of the input device.
        DURATION: Length of each clip in seconds.
        CLASSES: Iterable of class names to record, in order.
    """
    # Exact clip length in frames. Counting whole CHUNKs only
    # (int(SAMPLE_RATE / CHUNK * DURATION)) would silently drop the remainder
    # and produce clips shorter than DURATION.
    total_frames = int(SAMPLE_RATE * DURATION)

    try:
        for cls in CLASSES:
            print("Recording class: ", cls)

            for sample in range(SAMPLE_COUNT):
                print(f"Recording sample no.: {sample} / {SAMPLE_COUNT}")
                # Short pause so the speaker can get ready for the next clip
                time.sleep(0.5)

                stream = p.open(format=FORMAT,
                                channels=CHANNELS,
                                rate=SAMPLE_RATE,
                                input=True,
                                frames_per_buffer=CHUNK,
                                input_device_index=DEVICE_INDEX)
                frames = []
                try:
                    remaining = total_frames
                    while remaining > 0:
                        num_frames = min(CHUNK, remaining)
                        data = stream.read(num_frames)
                        frames.append(np.frombuffer(data, dtype=np.int16))
                        remaining -= num_frames
                finally:
                    # Release the input stream even if reading failed
                    stream.stop_stream()
                    stream.close()

                record = np.concatenate(frames, axis=0)
                if CHANNELS > 1:
                    # The stream delivers interleaved samples; give soundfile
                    # the (frames, channels) layout it expects.
                    record = record.reshape(-1, CHANNELS)

                filename = os.path.join(SAMPLES_DIR, f"{cls}_{sample}.wav")
                sf.write(filename, record, SAMPLE_RATE)
                print("Saved at: ", filename)
    finally:
        p.terminate()

    print("Finished recording.")


In [6]:
# Create the PyAudio instance and run the recording session for all classes.
# record_audio terminates the instance itself once it is done.
p = pyaudio.PyAudio()

record_audio(
    p,
    SAMPLE_RATE=SAMPLE_RATE,
    FORMAT=FORMAT,
    CHANNELS=CHANNELS,
    CHUNK=CHUNK,
    DEVICE_INDEX=DEVICE_INDEX,
    DURATION=DURATION,
    CLASSES=CLASSES,
)


Saved at:  audio_samples\void_11.wav
Recording sample no.: 12 / 20
Saved at:  audio_samples\void_12.wav
Recording sample no.: 13 / 20
Saved at:  audio_samples\void_13.wav
Recording sample no.: 14 / 20
Saved at:  audio_samples\void_14.wav
Recording sample no.: 15 / 20
Saved at:  audio_samples\void_15.wav
Recording sample no.: 16 / 20
Saved at:  audio_samples\void_16.wav
Recording sample no.: 17 / 20
Saved at:  audio_samples\void_17.wav
Recording sample no.: 18 / 20
Saved at:  audio_samples\void_18.wav
Recording sample no.: 19 / 20


Saved at:  audio_samples\void_19.wav
Recording class:  on
Recording sample no.: 0 / 20


## A) Audio Augmentation
- Take the original samples and multiply them by applying augmentation methods.
- Reads the samples from the common folder and writes the augmented ones to a separate folder.

In [None]:

# Directory holding the original recordings (input of the augmentation)
SAMPLES_DIR = "audio_samples"
# Directory that receives the augmented copies
AUG_SAMPLES_DIR = "augmented_audio_samples"
# Number of augmented variants generated per original recording
NUM_AUGMENTED = 25

In [None]:
# Running counter per class, used to number the augmented files
class_counts = {}

os.makedirs(AUG_SAMPLES_DIR, exist_ok=True)

for file in sorted(os.listdir(SAMPLES_DIR)):
    # Only WAV recordings are augmented; skip sub-folders and other files
    if not file.lower().endswith(".wav"):
        continue

    # Load file at its native sample rate
    audio_file = os.path.join(SAMPLES_DIR, file)
    print(audio_file)
    y, sr = librosa.load(audio_file, sr=None)

    # Files are named "<class>_<index>.wav". Split on the LAST underscore so
    # that class names containing underscores ("ambient_theme", "light_on")
    # are not truncated and merged into another class.
    class_name = os.path.splitext(file)[0].rsplit("_", 1)[0]
    class_counts.setdefault(class_name, 0)

    for _ in range(NUM_AUGMENTED):
        # Pick one augmentation at random for this variant
        method = random.choice(["pitch", "stretch", "noise", "db"])

        if method == "pitch":
            steps = random.randint(-1, 1)
            augmented = librosa.effects.pitch_shift(y, sr=sr, n_steps=steps)

        elif method == "stretch":
            rate = random.uniform(0.9, 1.1)  # Random rate for time stretching
            augmented = librosa.effects.time_stretch(y, rate=rate)

        elif method == "noise":
            noise = np.random.normal(0, 0.01, len(y))  # Gaussian noise
            augmented = y + noise

        elif method == "db":
            audio_segment = AudioSegment.from_wav(audio_file)
            db_change = random.randint(-10, 10)  # Random change in volume (dB)
            augmented_segment = audio_segment + db_change
            augmented = np.array(augmented_segment.get_array_of_samples())

        class_counts[class_name] += 1
        new_file = os.path.join(AUG_SAMPLES_DIR, f"{class_name}_{class_counts[class_name]}.wav")
        # Write with the sample rate the file was loaded at, not a global constant
        sf.write(new_file, augmented, sr)

## Generate metadata.csv (optional) 

In [None]:
import os
import pandas as pd

# Define the directory that contains your audio files
audio_dir = 'audio_samples'

# Get a sorted list of the WAV files in the directory (sub-folders and other
# files are ignored so they do not end up as rows in the metadata)
audio_files = sorted(
    name for name in os.listdir(audio_dir) if name.lower().endswith('.wav')
)

# Full path of every audio file
file_paths = [os.path.join(audio_dir, name) for name in audio_files]

# Files are named "<class>_<index>.wav". Split on the LAST underscore so that
# class names containing underscores ("ambient_theme", "light_on") keep their
# full name instead of being cut at the first underscore.
labels = [os.path.splitext(name)[0].rsplit('_', 1)[0] for name in audio_files]

# Create a DataFrame from the lists
df = pd.DataFrame({
    'file_name': file_paths,
    'transcription': labels
})

# Write the DataFrame to a CSV file
df.to_csv('metadata.csv', index=False)

## Train Test Split

In [None]:
import os
import shutil
import numpy as np

# Folder with the (augmented) audio files and the root of the split dataset
AUDIO_FILES = "augmented_audio_samples"
DATA_DIR = "data"

all_files = sorted(
    name for name in os.listdir(AUDIO_FILES) if name.lower().endswith(".wav")
)

# Group the files by class. Files are named "<class>_<index>.wav"; splitting on
# the LAST underscore keeps multi-word classes ("ambient_theme", "light_on")
# intact. Grouping by the exact label (instead of a startswith() prefix test)
# also prevents "ambient" from picking up the "ambient_theme" files.
files_by_class = {}
for file in all_files:
    label = os.path.splitext(file)[0].rsplit("_", 1)[0]
    files_by_class.setdefault(label, []).append(file)

# All classes in a list (without duplicates)
classes = sorted(files_by_class)

# Create train, test dirs
os.makedirs(os.path.join(DATA_DIR, "train"), exist_ok=True)
os.makedirs(os.path.join(DATA_DIR, "test"), exist_ok=True)

# NOTE(review): the input folder holds augmented variants, and the file names
# do not record which original recording a variant came from. Variants of the
# same recording can therefore land in both train and test - consider
# splitting the originals before augmenting.
for cls in classes:

    # Get all files for a class
    class_files = files_by_class[cls]

    # Shuffle files, train test split (80 % / 20 %)
    np.random.shuffle(class_files)
    split_index = int(0.8 * len(class_files))
    train_files = class_files[:split_index]
    test_files = class_files[split_index:]

    # Create train, test dirs for each class
    class_train_dir = os.path.join(DATA_DIR, "train", cls)
    class_test_dir = os.path.join(DATA_DIR, "test", cls)
    os.makedirs(class_train_dir, exist_ok=True)
    os.makedirs(class_test_dir, exist_ok=True)

    # Copy files to train, test dirs
    for file in train_files:
        shutil.copy(os.path.join(AUDIO_FILES, file), class_train_dir)
    for file in test_files:
        shutil.copy(os.path.join(AUDIO_FILES, file), class_test_dir)

# ******************************

## B) Function - record and collect samples
- Putting every class in its separate folder

In [None]:
# Audio data collection function
# Sorting in separate folders:
def record_audio(p, SAMPLE_RATE, FORMAT, CHANNELS, CHUNK, DEVICE_INDEX, DURATION, CLASSES):
    """Record ``SAMPLE_COUNT`` clips per class into one sub-folder per class.

    Each clip is trimmed with ``librosa.effects.trim(top_db=15)`` and written
    to ``SAMPLES_DIR/<class>/<class>_<sample index>.wav``. ``SAMPLE_COUNT``
    and ``SAMPLES_DIR`` are read from the notebook's global configuration.

    Args:
        p: PyAudio instance used to open the input stream. It is terminated
            before this function returns (also on error), so it cannot be
            reused afterwards.
        SAMPLE_RATE: Sampling rate in Hz, used for capture and for the WAV file.
        FORMAT: PyAudio sample format. The raw bytes are decoded as int16, so
            this has to be ``pyaudio.paInt16``.
        CHANNELS: Number of input channels.
        CHUNK: Maximum number of frames requested per ``stream.read`` call.
        DEVICE_INDEX: Index of the input device.
        DURATION: Length of each clip in seconds (before trimming).
        CLASSES: Iterable of class names to record, in order.
    """
    # Exact clip length in frames. Counting whole CHUNKs only
    # (int(SAMPLE_RATE / CHUNK * DURATION)) would silently drop the remainder
    # and produce clips shorter than DURATION.
    total_frames = int(SAMPLE_RATE * DURATION)

    try:
        for cls in CLASSES:
            # makedirs also creates SAMPLES_DIR itself when it is missing
            class_dir = os.path.join(SAMPLES_DIR, cls)
            os.makedirs(class_dir, exist_ok=True)
            print("Recording class: ", cls)

            for sample in range(SAMPLE_COUNT):
                print(f"Recording sample no.: {sample} / {SAMPLE_COUNT}")
                # Short pause so the speaker can get ready for the next clip
                time.sleep(0.5)

                stream = p.open(format=FORMAT,
                                channels=CHANNELS,
                                rate=SAMPLE_RATE,
                                input=True,
                                frames_per_buffer=CHUNK,
                                input_device_index=DEVICE_INDEX)
                frames = []
                try:
                    remaining = total_frames
                    while remaining > 0:
                        num_frames = min(CHUNK, remaining)
                        data = stream.read(num_frames)
                        frames.append(np.frombuffer(data, dtype=np.int16))
                        remaining -= num_frames
                finally:
                    # Release the input stream even if reading failed
                    stream.stop_stream()
                    stream.close()

                record = np.concatenate(frames, axis=0)
                if CHANNELS > 1:
                    # The stream delivers interleaved samples: (frames, channels)
                    record = record.reshape(-1, CHANNELS)

                # librosa operates on floating-point audio; scale the int16
                # PCM samples to [-1, 1) before trimming.
                record = record.astype(np.float32) / 32768.0

                # librosa puts time on the last axis, soundfile on the first;
                # .T is a no-op for mono (1-D) clips.
                trimmed, _ = librosa.effects.trim(record.T, top_db=15)

                filename = os.path.join(class_dir, f"{cls}_{sample}.wav")
                sf.write(filename, trimmed.T, SAMPLE_RATE)
                print("Saved at: ", filename)
    finally:
        p.terminate()

    print("Finished recording.")


## B) Audio augmentation

In [None]:
# Augment the recordings stored in one sub-folder per class.
# Running counter per class, used to number the augmented files
class_counts = {}

for dirpath, _dirnames, filenames in os.walk(SAMPLES_DIR):
    rel_dir = os.path.relpath(dirpath, SAMPLES_DIR)
    if rel_dir == os.curdir:
        # Files directly inside SAMPLES_DIR have no class folder; skip them
        # instead of filing them under the name of the root folder.
        continue

    wav_files = sorted(name for name in filenames if name.lower().endswith(".wav"))
    if not wav_files:
        continue

    # The class is given by the folder the recording is stored in
    class_name = os.path.basename(dirpath)
    class_counts.setdefault(class_name, 0)

    # Mirror the folder structure below AUG_SAMPLES_DIR
    new_dir = os.path.join(AUG_SAMPLES_DIR, rel_dir)
    os.makedirs(new_dir, exist_ok=True)

    for file in wav_files:
        # Load file at its native sample rate
        audio_file = os.path.join(dirpath, file)
        y, sr = librosa.load(audio_file, sr=None)

        for _ in range(NUM_AUGMENTED):
            # Pick one augmentation at random for this variant
            method = random.choice(["pitch", "stretch", "noise", "db"])

            if method == "pitch":
                steps = random.randint(-1, 1)
                augmented = librosa.effects.pitch_shift(y, sr=sr, n_steps=steps)

            elif method == "stretch":
                rate = random.uniform(0.9, 1.1)  # Random rate for time stretching
                augmented = librosa.effects.time_stretch(y, rate=rate)

            elif method == "noise":
                noise = np.random.normal(0, 0.01, len(y))  # Gaussian noise
                augmented = y + noise

            elif method == "db":
                audio_segment = AudioSegment.from_wav(audio_file)
                db_change = random.randint(-10, 10)  # Random change in volume (dB)
                augmented_segment = audio_segment + db_change
                augmented = np.array(augmented_segment.get_array_of_samples())

            class_counts[class_name] += 1
            # Name the file after the folder's class so multi-word classes
            # ("ambient_theme", "light_on") are not cut at the first underscore.
            new_file = os.path.join(new_dir, f"{class_name}_{class_counts[class_name]}.wav")
            # Write with the sample rate the file was loaded at
            sf.write(new_file, augmented, sr)

In [None]:
import os
import pandas as pd

def produce_metadata(audio_dir=None, output_path='metadata.csv'):
    """Write a CSV that lists every file in the class sub-folders with its label.

    The label of a file is the name of the folder that directly contains it.
    Files located directly in ``audio_dir`` (outside any class folder) are
    skipped.

    Args:
        audio_dir: Root folder holding one sub-folder per class. Defaults to
            the notebook-level ``SAMPLES_DIR``.
        output_path: Destination of the CSV file with the columns
            ``filepath`` and ``label``.
    """
    if audio_dir is None:
        audio_dir = SAMPLES_DIR

    metadata = {
        "filepath": [],
        "label": []
    }

    root = os.path.normpath(audio_dir)

    for dirpath, dirnames, filenames in os.walk(audio_dir):
        # Visit folders in a stable order so the CSV is reproducible
        dirnames.sort()

        # Compare by value: "is not" tests object identity, which is not a
        # reliable way to recognise the root folder.
        if os.path.normpath(dirpath) == root:
            continue

        # basename handles both "/" and the platform's own separator, whereas
        # splitting on "/" returns the whole path on Windows.
        label = os.path.basename(dirpath)
        for f in sorted(filenames):
            file_path = os.path.join(dirpath, f)
            metadata["filepath"].append(file_path)
            metadata["label"].append(label)

    df = pd.DataFrame(metadata)

    # Write the DataFrame to a CSV file
    df.to_csv(output_path, index=False)