# 1. Import Libraries

In [1]:
# !pip install opencv-python

In [2]:
# !pip install librosa

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from glob import glob
import librosa
import cv2
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings('ignore')

# 2. Read Data

In [4]:
def get_img(voice_data, sampling_data, mode):
    """Turn a 1-D audio signal into a 2-D time-frequency image.

    Parameters
    ----------
    voice_data : np.ndarray
        Audio samples, e.g. the first value returned by ``librosa.load``.
    sampling_data : int or float
        Sampling rate of ``voice_data`` (the second value returned by
        ``librosa.load``).
    mode : str
        Which representation to compute:

        * ``'spec'``  - STFT magnitude converted to dB relative to its maximum.
        * ``'mel'``   - mel spectrogram converted to dB relative to its maximum.
        * ``'chrom'`` - chromagram computed from the STFT magnitude.

    Returns
    -------
    np.ndarray
        The 2-D array produced by the selected librosa transform.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported names. Previously this case
        returned ``None`` silently and only failed later in the caller.
    """
    if mode == 'spec':
        magnitude = np.abs(librosa.stft(voice_data))
        return librosa.amplitude_to_db(magnitude, ref=np.max)

    if mode == 'mel':
        # Audio is passed by keyword (recent librosa releases reject it
        # positionally) and the real sampling rate is forwarded so the mel
        # filter bank matches the signal.
        mel = librosa.feature.melspectrogram(y=voice_data, sr=sampling_data)
        # NOTE(review): melspectrogram appears to return a *power* spectrogram
        # by default, in which case power_to_db would be the matching
        # conversion. amplitude_to_db is kept so regenerated features stay
        # numerically compatible with earlier runs - confirm before changing.
        return librosa.amplitude_to_db(mel, ref=np.max)

    if mode == 'chrom':
        magnitude = np.abs(librosa.stft(voice_data))
        # Use the function's own argument; the previous code read a global
        # named `sampling_rate`, which only existed if the loading loop had
        # already run in the same kernel.
        return librosa.feature.chroma_stft(S=magnitude, sr=sampling_data)

    raise ValueError(
        f"Unknown mode {mode!r}; expected one of 'spec', 'mel', 'chrom'."
    )

In [5]:
# Spoken-command labels to load; each one is also used as a folder name
# under dataset/ when the audio files are globbed.
classes = [
    'go',
    'no',
    'stop',
    'yes',
]

In [6]:
# Representation passed to get_img(): 'spec', 'mel' or 'chrom'.
mode = 'mel'

# Every image is resized to (height, width) and then flattened, so each
# sample ends up as a row of width*height values.
width = 256
height = 32

# Collect rows in Python lists and stack once at the end. Calling np.vstack
# inside the loop re-copies the whole array on every file, which is
# quadratic in the number of samples.
feature_rows = []
label_rows = []

for _class in tqdm(classes):
    # glob() returns files in an OS-dependent order; sorting makes the row
    # order of the resulting dataset reproducible across runs and machines.
    sound_path = sorted(glob('dataset/' + _class + '/*'))
    for path in tqdm(sound_path):
        voice_data, sampling_rate = librosa.load(path)
        img = get_img(voice_data, sampling_rate, mode)
        # cv2 takes dsize as (width, height); the result has shape (height, width).
        img = cv2.resize(img, dsize=(width, height))
        feature_rows.append(img.reshape(-1))
        label_rows.append(_class)

# X: (n_samples, width*height) float64; y: (n_samples, 1) column of labels.
# The reshape also yields correctly shaped empty arrays when no files were found.
X = np.asarray(feature_rows, dtype=np.float64).reshape(-1, width*height)
y = np.asarray(label_rows).reshape(-1, 1)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2372 [00:00<?, ?it/s]

  0%|          | 0/2375 [00:00<?, ?it/s]

  0%|          | 0/2380 [00:00<?, ?it/s]

  0%|          | 0/2377 [00:00<?, ?it/s]

# 3. Save to CSV

In [7]:
# One column per flattened pixel: pixel_0 ... pixel_{width*height - 1}.
n_pixels = width * height
columns = [f'pixel_{i}' for i in range(n_pixels)]

# Feature matrix first, then the class label as the final column.
data = pd.DataFrame(data=X, columns=columns)
data['label'] = y

# Write without the index so the file holds only pixel columns and the label.
csv_path = 'voice_command_dataset.csv'
data.to_csv(csv_path, index=False)