In [1]:
# !pip install librosa
# !pip install opencv-python

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from glob import glob
import librosa
import cv2
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    OrdinalEncoder, 
    OneHotEncoder, 
    StandardScaler, 
    MinMaxScaler
)

## ========== Sound ==========

In [3]:
def get_img(voice_data, sampling_data, mode):
    if mode == 'spec':
        stft = np.abs(librosa.core.spectrum.stft(voice_data))
        return librosa.amplitude_to_db(stft, ref=np.max)
    elif mode == 'mel':
        stft = np.abs(librosa.feature.melspectrogram(voice_data))
        return librosa.amplitude_to_db(stft, ref=np.max)
    elif mode == 'chrom':
        stft = np.abs(librosa.core.spectrum.stft(voice_data))
        return librosa.feature.chroma_stft(S=stft, sr=sampling_rate)

In [4]:
classes = ['cat', 'dog']

In [5]:
mode = 'mel'

width = 256
height = 32

X = np.empty([0, width*height])
y = np.empty([0, 1])

for _class in tqdm(classes):
    sound_path = glob('sound_dataset/' + _class + '/*')
    for path in tqdm(sound_path):
        voice_data, sampling_rate = librosa.load(path)
        img = get_img(voice_data, sampling_rate, mode)
        img = cv2.resize(img, dsize=(width, height))
        img = img.reshape(1, -1)
        X = np.vstack([X, img])
        y = np.vstack([y, _class])

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

 -0.00169399] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
 -0.00019849] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
 -0.00012315] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
 -0.01206966] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.me

  0%|          | 0/10 [00:00<?, ?it/s]

  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
  5.1710312e-03  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
 -0.01079411] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
  0.00950398] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
  stft = np.abs(librosa.feature.melspectrogram(voice_data))
  0.   

In [6]:
X

array([[-80.        , -80.        , -80.        , ..., -80.        ,
        -80.        , -80.        ],
       [-77.60639954, -77.60639954, -77.26318359, ..., -80.        ,
        -80.        , -80.        ],
       [-80.        , -80.        , -66.35562134, ..., -80.        ,
        -80.        , -80.        ],
       ...,
       [-80.        , -80.        , -80.        , ..., -80.        ,
        -80.        , -80.        ],
       [-79.99999237, -79.99999237, -79.99999237, ..., -79.99999237,
        -79.99999237, -79.99999237],
       [-80.        , -80.        , -80.        , ..., -80.        ,
        -80.        , -80.        ]])

In [7]:
y

array([['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['dog'],
       ['dog'],
       ['dog'],
       ['dog'],
       ['dog'],
       ['dog'],
       ['dog'],
       ['dog'],
       ['dog'],
       ['dog']], dtype='<U32')