In [54]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import MaxPooling2D, Conv2D, Dense, Flatten
from keras import layers, models
import pandas as pd
import numpy as np

import librosa
import librosa.display
import matplotlib.pyplot as plt

from pathlib import Path

In [55]:
duration = 20
offset = 10

In [56]:
def load_csv():
    data_csv = pd.read_csv("data.csv", skipinitialspace=True)
    genre = data_csv[["track id", "genre"]]
    labels = data_csv.drop(
        columns=["mood", "age", "gender", "disliked", "liked", "mother tongue", "genre"]
    )  # unnecessary columns
    labels = labels.groupby("track id").mean()
    for column in labels.columns:
        labels[column] = labels[column].apply(lambda x: 1 if (x >= 0.5) else 0)
        print(labels[column].value_counts())
        print()
    genre_cleared = genre.drop_duplicates().set_index("track id")
    y = labels
    y["genre"] = genre_cleared["genre"]
    y = pd.get_dummies(y)
    return y


y = load_csv()

0    397
1      3
Name: amazement, dtype: int64

0    381
1     19
Name: solemnity, dtype: int64

0    373
1     27
Name: tenderness, dtype: int64

0    354
1     46
Name: nostalgia, dtype: int64

0    311
1     89
Name: calmness, dtype: int64

0    366
1     34
Name: power, dtype: int64

0    317
1     83
Name: joyful_activation, dtype: int64

0    355
1     45
Name: tension, dtype: int64

0    379
1     21
Name: sadness, dtype: int64



In [57]:
def padding(array, xx, yy):
    """
    :param array: numpy array
    :param xx: desired height
    :param yy: desirex width
    :return: padded array
    """

    h = array.shape[0]
    w = array.shape[1]
    a = max((xx - h) // 2, 0)
    aa = max(0, xx - a - h)
    b = max(0, (yy - w) // 2)
    bb = max(yy - b - w, 0)
    return np.pad(array, pad_width=((a, aa), (b, bb)), mode="constant")

In [68]:
librosa.get_duration(filename='audio/classical/40.mp3')

34.963356009070296

In [59]:
def get_spec(y):
    spec = np.abs(librosa.stft(y, n_fft=256, hop_length=512))
    return spec

In [69]:
def load_audio():
    x_list = []
    audio_path = Path('audio')
    genres = audio_path.glob("*")
    for genre in genres:
        for i in range(1,101,1):
            offset = 10
            duration = 25
            print(f"{genre}/{i}.mp3")
            real_duration = int(librosa.get_duration(filename=f"{genre}/{i}.mp3"))
            if duration + offset >= real_duration:
               offset = 0
        
            y, sr = librosa.load((f"{genre}/{i}.mp3"), duration=duration, offset=offset)
            #y, sr = librosa.load((f"{genre}/{i}.mp3"), duration=duration)    
            x = get_spec(y)
            x = padding(x, 129, 1000)
            x_list.append(np.expand_dims(x, 2))
    return x_list

x = load_audio()

audio\classical/1.mp3
audio\classical/2.mp3
audio\classical/3.mp3
audio\classical/4.mp3
audio\classical/5.mp3
audio\classical/6.mp3
audio\classical/7.mp3
audio\classical/8.mp3
audio\classical/9.mp3
audio\classical/10.mp3
audio\classical/11.mp3
audio\classical/12.mp3
audio\classical/13.mp3
audio\classical/14.mp3
audio\classical/15.mp3
audio\classical/16.mp3
audio\classical/17.mp3
audio\classical/18.mp3
audio\classical/19.mp3
audio\classical/20.mp3
audio\classical/21.mp3
audio\classical/22.mp3
audio\classical/23.mp3
audio\classical/24.mp3
audio\classical/25.mp3
audio\classical/26.mp3
audio\classical/27.mp3
audio\classical/28.mp3
audio\classical/29.mp3
audio\classical/30.mp3
audio\classical/31.mp3
audio\classical/32.mp3
audio\classical/33.mp3
audio\classical/34.mp3
audio\classical/35.mp3
audio\classical/36.mp3
audio\classical/37.mp3
audio\classical/38.mp3
audio\classical/39.mp3
audio\classical/40.mp3
audio\classical/41.mp3
audio\classical/42.mp3
audio\classical/43.mp3
audio\classical/44.m

In [88]:
input_shape = (128, 1077, 1)
CNNmodel = models.Sequential()
CNNmodel.add(layers.Conv2D(32, (3, 3), activation="relu", input_shape=input_shape))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Dropout(0.2))
CNNmodel.add(layers.Conv2D(64, (3, 3), activation="relu"))
CNNmodel.add(layers.MaxPooling2D((2, 2)))
CNNmodel.add(layers.Dropout(0.2))
CNNmodel.add(layers.Conv2D(64, (3, 3), activation="relu"))
CNNmodel.add(layers.Flatten())
CNNmodel.add(layers.Dense(64, activation="relu"))
CNNmodel.add(layers.Dropout(0.2))
CNNmodel.add(layers.Dense(32, activation="relu"))
CNNmodel.add(layers.Dense(13, activation="sigmoid"))

In [89]:
CNNmodel.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=["accuracy"],
)

In [97]:
x

[array([[[5.8722947e-02],
         [4.4591323e-02],
         [1.4723657e-02],
         ...,
         [7.6974779e-02],
         [1.9434549e-02],
         [2.7214454e-02]],
 
        [[7.9949498e-02],
         [3.5212100e-01],
         [2.0547669e-01],
         ...,
         [5.3371780e-02],
         [3.0544216e-02],
         [8.9730948e-02]],
 
        [[1.6232872e-01],
         [7.1832037e-01],
         [3.5663542e-01],
         ...,
         [9.2396095e-02],
         [2.3568599e-01],
         [1.2071292e-01]],
 
        ...,
 
        [[3.3040321e-03],
         [3.1987549e-06],
         [1.5253228e-06],
         ...,
         [4.6358622e-07],
         [4.6368827e-06],
         [1.2768753e-06]],
 
        [[2.9019646e-03],
         [1.9376093e-06],
         [7.9424359e-07],
         ...,
         [1.7943950e-06],
         [1.9828015e-06],
         [7.9890691e-07]],
 
        [[2.7668267e-03],
         [1.4273154e-06],
         [1.0461198e-06],
         ...,
         [4.2766658e-07],
  

In [80]:
y.shape

(400, 13)

In [87]:
history = CNNmodel.fit(
    x=xd,
    y=y,
    epochs=100,
    validation_split=0.2
)

Epoch 1/100


ValueError: in user code:

    File "e:\coding\github\audio-music-emotion-classification\venv\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "e:\coding\github\audio-music-emotion-classification\venv\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "e:\coding\github\audio-music-emotion-classification\venv\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "e:\coding\github\audio-music-emotion-classification\venv\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "e:\coding\github\audio-music-emotion-classification\venv\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "e:\coding\github\audio-music-emotion-classification\venv\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_3" is incompatible with the layer: expected shape=(None, 128, 1077, 1), found shape=(32, 129, 1077, 1)


In [None]:
g = CNNmodel.predict(xd)

