In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Dataset download: https://drive.google.com/open?id=1TfHtRoX73yIJd40QNLEhYzgH_wlhQSLz
# Copy the dataset archive from the mounted Drive into /content/
!cp "/content/drive/MyDrive/Colab Notebooks/note_Axross/tr-music-dataset.rar" /content/
# Extract the archive (relative path: assumes the working directory is /content - TODO confirm)
!unrar x tr-music-dataset.rar

In [None]:
# Library imports

# Feature extraction and data preprocessing
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from PIL import Image
import pathlib
import csv
from tqdm import tqdm

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Keras
import keras
from keras.preprocessing.image import ImageDataGenerator

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Build data.csv: one row of averaged acoustic features per track. This table
# is the input of the 1-D (feature-vector) mood classifier.
header = ['filename', 'chroma_stft', 'rmse', 'spectral_centroid',
          'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
header += [f'mfcc{i}' for i in range(1, 21)]
header.append('label')

moods = 'angry happy relax sad'.split()

# The file is opened once for the whole run and csv.writer receives real field
# lists. Joining the fields with spaces and calling .split() would break any
# filename containing a space into several columns and shift the whole row.
with open('data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    for g in moods:
        for filename in tqdm(os.listdir(f'./tr-music-dataset/{g}')):
            songname = f'./tr-music-dataset/{g}/{filename}'
            # Only the first 30 seconds, downmixed to mono, are analysed.
            y, sr = librosa.load(songname, mono=True, duration=30)
            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            # librosa >= 0.7.0 provides feature.rms (older releases: feature.rmse).
            rmse = librosa.feature.rms(y=y)
            spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
            spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
            rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            zcr = librosa.feature.zero_crossing_rate(y)
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
            # Each feature is reduced to its overall mean; the MFCC matrix
            # contributes one mean per row (columns mfcc1..mfcc20).
            row = [filename,
                   np.mean(chroma_stft), np.mean(rmse), np.mean(spec_cent),
                   np.mean(spec_bw), np.mean(rolloff), np.mean(zcr)]
            row.extend(np.mean(e) for e in mfcc)
            row.append(g)
            writer.writerow(row)

In [None]:
# Inspect the feature table: read the first 28 columns of data.csv and keep
# only the rows whose label is one of the known moods.
data = (
    pd.read_csv('data.csv', usecols=range(28))
    .loc[lambda frame: frame['label'].isin(moods)]
    .reset_index(drop=True)
)
data.head()

In [None]:
# Check the dimensions (rows, columns) of the feature table
data.shape

In [None]:
# Remove the column that is not a feature (the track filename)
data = data.drop(columns=['filename'])

In [None]:
# Prepare the targets: the last column holds the mood label, which is encoded
# as an integer class id.
encoder = LabelEncoder()
genre_list = data.iloc[:, -1]
y = encoder.fit_transform(genre_list)

In [None]:
# Standardize the features (every column except the trailing label).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# in the next cell, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
X = scaler.fit_transform(
    np.array(data.iloc[:, :-1], dtype=float)
)

In [None]:
# Split into training (80%) and test (20%) sets.
# A fixed random_state makes the split, and the metrics computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Define the classifier: three ReLU hidden layers followed by a 4-way softmax
from keras import models
from keras import layers

model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(4, activation='softmax'),
])

In [None]:
# Compile the model: Adam optimizer, sparse categorical cross-entropy
# (targets are integer class ids), accuracy as the reported metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the model; the held-out split doubles as the validation data
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=20,
    epochs=20,
    validation_data=(X_test, y_test),
)

In [None]:
# Inference sample: https://drive.google.com/file/d/1hHKyQsjOPDijk89OM_bZwv4MrQQAV7U2/view?usp=sharing

# Extract the acoustic features of the inference track and write them to
# test.csv, using the same header as the training table.
filename = 'test.mp3'
y, sr = librosa.load(filename, mono=True, duration=30)
chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
rmse = librosa.feature.rms(y=y)
spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
zcr = librosa.feature.zero_crossing_rate(y)
mfcc = librosa.feature.mfcc(y=y, sr=sr)

# Fields are passed to csv.writer as a list rather than a space-joined string,
# so the row stays intact whatever characters the filename contains.
row = [filename,
       np.mean(chroma_stft), np.mean(rmse), np.mean(spec_cent),
       np.mean(spec_bw), np.mean(rolloff), np.mean(zcr)]
row.extend(np.mean(e) for e in mfcc)
# The track has no known label. An explicit empty field keeps the row the same
# width as the header (which ends with 'label') instead of leaving it ragged.
row.append('')

with open('test.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerow(row)

In [None]:
# Inference with the trained feature-vector model
test = pd.read_csv('test.csv')
test = test.drop(['filename'],axis=1)

# Scale with the scaler that was fitted on the training features. Fitting a
# new StandardScaler on this single row would subtract the row from itself and
# feed an all-zero vector to the model, whatever the audio contains.
# The result gets its own name so the training matrix X is not overwritten.
X_infer = scaler.transform(np.array(test.iloc[:, :-1], dtype=float))

predictions = model.predict(X_infer)
print(predictions)
# Index of the most probable class (ids follow the LabelEncoder ordering)
np.argmax(predictions[0])

In [None]:
# Spectrograms: render the first 5 seconds of every track as an image.
# Within each mood, every 5th file goes to img_data/test and the others to
# img_data/train.
cmap = plt.get_cmap('inferno')

plt.figure(figsize=(10,10))
moods = 'angry happy relax sad'.split()
for g in tqdm(moods):
    for split_name in ('train', 'test'):
        os.makedirs(f'img_data/{split_name}/{g}', exist_ok=True)
    for i, filename in enumerate(os.listdir(f'./tr-music-dataset/{g}'), start=1):
        songname = f'./tr-music-dataset/{g}/{filename}'
        y, sr = librosa.load(songname, mono=True, duration=5)
        plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB');
        plt.axis('off');
        # Drop the last three characters (the extension) and any remaining dots
        image_stem = filename[:-3].replace(".", "")
        target_split = 'test' if i % 5 == 0 else 'train'
        plt.savefig(f'img_data/{target_split}/{g}/{image_stem}.png')
        # Clear the shared figure before drawing the next track
        plt.clf()

In [None]:
# Training-image preprocessing: rescale pixels to [0, 1] and apply random
# rotation, shifts and horizontal flips.
# NOTE(review): train_datagen is assigned again in the following cell without
# these augmentation options - confirm which configuration is intended.
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
)
train_datagen = ImageDataGenerator(rescale=1.0 / 255, **augmentation_options)

In [None]:
# Build the directory-backed generators that feed images to the CNN
from tensorflow.keras.applications import VGG16
from tensorflow.keras import models, optimizers
from keras.layers import Input, Dense, Dropout, Activation, Flatten
from keras.preprocessing.image import load_img,ImageDataGenerator
from keras.callbacks import CSVLogger , ModelCheckpoint

image_size = 224
batch_size = 20

train_dir = 'img_data/train/'
test_dir = 'img_data/test/'

# Both generators only rescale pixel values to [0, 1]
train_datagen = ImageDataGenerator(rescale=1.0 / 255)
validation_datagen = ImageDataGenerator(rescale=1.0 / 255)

# Options shared by the training and validation flows
flow_options = dict(
    target_size=(image_size, image_size),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=True,
)

train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)

validation_generator = validation_datagen.flow_from_directory(test_dir, **flow_options)

In [None]:
# Fine-tuning: build the classifier on top of the ImageNet-pretrained VGG16
# convolutional base (without its original classification head).
vgg_conv = VGG16(weights='imagenet', include_top=False,
                 input_shape=(image_size, image_size, 3))
# Freeze every layer except the last four, so only those four and the new head
# are updated during training. Setting trainable = True here would be a no-op,
# because layers are trainable by default and nothing would be frozen.
for layer in vgg_conv.layers[:-4]:
    layer.trainable = False

model = models.Sequential()
model.add(vgg_conv)

# New classification head: flatten -> dense -> dropout -> 4-way softmax
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4, activation='softmax'))

model.summary()

# `learning_rate` replaces the deprecated `lr` keyword of the optimizer.
model.compile(optimizer=optimizers.SGD(learning_rate=1e-4, momentum=0.9),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the CNN and save the resulting model.
# Model.fit accepts generators directly; fit_generator is deprecated.
# With the generators' batch size of 20, 16 steps draw 320 training images per
# epoch and 4 validation steps draw 80 validation images.
hist = model.fit(
    train_generator,
    steps_per_epoch=16,
    epochs=20,
    verbose=1,
    validation_data=validation_generator,
    validation_steps=4
)

model.save('music_classification.h5')

In [None]:
# Inference

# Create the spectrogram image for the inference track (first 5 seconds)
filename = 'test.mp3'
cmap = plt.get_cmap('inferno')
plt.figure(figsize=(10,10))

output_dir = pathlib.Path('img_data')
output_dir.mkdir(parents=True, exist_ok=True)

y, sr = librosa.load(filename, mono=True, duration=5)
plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB');
plt.axis('off');
# Drop the last three characters (the extension) and any remaining dots
image_stem = filename[:-3].replace(".", "")
plt.savefig(f'img_data/{image_stem}.png')
plt.clf()

In [None]:
# Inference with the fine-tuned CNN
# preprocess_input stays imported but is intentionally not applied (see below).
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing.image import load_img, img_to_array

filename = 'img_data/test.png'
image = load_img(filename, target_size=(224, 224))
image = img_to_array(image)
# Add the leading batch dimension expected by model.predict
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# Apply the same preprocessing as training: the generators only rescale pixels
# by 1/255. VGG's preprocess_input would put the input on a different scale
# from the one the network was trained on.
image = image / 255.0
predictions = model.predict(image)
# Index of the most probable class
np.argmax(predictions[0])