# 専門学生時代に扱った音楽データセットの分類に挑戦

In [None]:
import numpy as np
import sys
sys.path.append("..")
from matplotlib import pyplot as plt
import pandas as pd
import IPython.display as ipd
import librosa
import setuptools
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import  to_categorical # ワンホットエンコード化
from tensorflow.keras.preprocessing.sequence import pad_sequences # データの長さを揃える

In [15]:
# Load the training data: directory of audio files plus a CSV whose first
# column (used as the index) holds the file names.
train_data_dir ="./audio_dataset_3class/train/"
train_df = pd.read_csv("./audio_dataset_3class/train.csv", index_col=0)

# Load the test data, laid out the same way as the training data.
test_data_dir ="./audio_dataset_3class/test/"
test_df = pd.read_csv("./audio_dataset_3class/test.csv", index_col=0)

In [20]:
# Preview the first rows of the training metadata (file name -> label).
train_df.head()

Unnamed: 0_level_0,label
fname,Unnamed: 1_level_1
969b4f60.wav,Cello
3e2bddda.wav,Cello
54bb57af.wav,Cello
9d59a719.wav,Applause
05f2c2a6.wav,Clarinet


In [21]:
# Count how many training files belong to each label to check class balance.
train_df["label"].value_counts()

label
Clarinet    130
Cello       125
Applause     61
Name: count, dtype: int64

In [27]:
# Load a single audio clip: the first training file labelled "Applause".
data, rate = librosa.load(train_data_dir + train_df[train_df["label"] == "Applause"].index[0])
# Show the sampling rate librosa returned for the clip.
print(rate)
# Play back the loaded audio.
ipd.Audio(data = data, rate = rate)

22050


In [31]:
# Inspect the loaded clip's array shape and its raw sample values.
data.shape, data

((124803,),
 array([-2.8487491e-05, -3.8222566e-05, -3.6145775e-05, ...,
        -2.9815617e-05, -2.1903921e-05, -1.1638074e-05], dtype=float32))

In [54]:
# Sampling rate passed to librosa (as `sr`) when the dataset files are loaded.
sampling_rate = 8000

# Every clip is cut or padded to a fixed duration of 3 seconds.
audio_duration = 3
# Number of samples in one fixed-length clip.
audio_length = sampling_rate * audio_duration

# Load audio data from a list of file names.
def _load_files(data_dir, filenames):
    """Load each named audio file with librosa at the module's sampling rate.

    Args:
        data_dir: Directory prefix. It is concatenated directly with each
            file name, so it must already end with a path separator.
        filenames: Iterable of audio file names found under ``data_dir``.

    Returns:
        A list with one loaded audio array per file name, in input order.
        The sampling rate returned by librosa is discarded because every
        file is loaded with ``sr=sampling_rate``.
    """
    return [
        librosa.core.load(data_dir + name, sr=sampling_rate, res_type="kaiser_fast")[0]
        for name in filenames
    ]

# _load_files(train_data_dir, train_df.index)

# Data preprocessing
def create_audio_dataset(train_df, test_df, train_data_dir, test_data_dir, label_dict):
    """Build fixed-length, standardized audio tensors and one-hot labels.

    Each file listed in the index of ``train_df`` / ``test_df`` is loaded via
    ``_load_files``, padded or truncated to ``audio_length`` samples,
    standardized with a ``StandardScaler`` and given a trailing channel axis.

    Args:
        train_df: DataFrame whose index holds the training file names and
            whose ``"label"`` column holds the class name of each file.
        test_df: Same layout as ``train_df`` for the test split.
        train_data_dir: Directory prefix of the training audio files.
        test_data_dir: Directory prefix of the test audio files.
        label_dict: Mapping from class name to integer class id.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test)`` where each ``X`` is a
        float64 array of shape ``(n_files, audio_length, 1)`` and each ``Y``
        is the one-hot encoding of the labels with ``len(label_dict)`` classes.

    Raises:
        KeyError: If a label in either DataFrame is missing from ``label_dict``.
    """

    def _to_fixed_length(waveforms):
        # Zero-pad at the front / drop samples from the front so that every
        # clip has exactly ``audio_length`` samples.
        padded = pad_sequences(
            waveforms,
            dtype="float32",
            maxlen=audio_length,
            padding="pre",
            truncating="pre",
            value=0.0,
        )
        # Scale in float64 so the returned arrays keep the float64 dtype the
        # previous list-based implementation produced.
        return padded.astype(np.float64)

    def _one_hot(labels):
        # Map class names to integer ids, then one-hot encode them.
        class_ids = np.array([label_dict[label] for label in labels], dtype=int)
        return to_categorical(class_ids, num_classes=len(label_dict))

    # Load the audio files of both splits.
    train_fixed = _to_fixed_length(_load_files(train_data_dir, train_df.index))
    test_fixed = _to_fixed_length(_load_files(test_data_dir, test_df.index))

    # Fit the scaler on the training split only: using test samples to
    # estimate the mean/variance would leak test information into the
    # preprocessing. The test split is transformed with the training statistics.
    scaler = StandardScaler().fit(train_fixed)

    # Append a channel axis: (n_files, audio_length) -> (n_files, audio_length, 1).
    X_train = scaler.transform(train_fixed)[..., np.newaxis]
    X_test = scaler.transform(test_fixed)[..., np.newaxis]

    # Build the one-hot label matrices.
    Y_train = _one_hot(train_df["label"])
    Y_test = _one_hot(test_df["label"])

    return X_train, Y_train, X_test, Y_test


In [55]:
# Map each class name to the integer class id used for one-hot encoding.
audio_label_dict = {"Cello": 0, "Clarinet": 1, "Applause": 2}
# Build the preprocessed train/test tensors and their one-hot labels.
X_train, Y_train, X_test, Y_test = create_audio_dataset(train_df, test_df, train_data_dir, test_data_dir, audio_label_dict)

In [59]:
# Check the shape of the whole training tensor and of a single sample.
X_train.shape, X_train[0].shape

((316, 24000, 1), (24000, 1))