In [1]:
import numpy as np
import librosa 
import h5py
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import pandas as pd

In [21]:
# открывать h5 файлы надо специальным классом
with h5py.File('../files/train.h5') as f:
    chunks_shape = f["chunks"].shape
    print(f"Loaded {chunks_shape[0]} records, {chunks_shape[1]} "
          f"chunks of length {chunks_shape[2]} each")
    # выберем только первые 2 фрагмента из каждой аудиозаписи
    train_2c_x = np.ndarray((chunks_shape[0], 2, chunks_shape[2]), dtype=float)
    for i, record in enumerate(f["chunks"]):
        train_2c_x[i] = record[0:2, :]
    # выгружаем метки классов
    raw_train_y = np.array(f["classes"])

Loaded 2628 records, 5 chunks of length 4410 each


In [22]:
# закодируем классы числами 0-3
class_encoding = {v: i for i, v in enumerate(np.unique(raw_train_y))}
print("Class encoding:", class_encoding)
# создадим функцию, которая будет кодировать классы для np.ndarray
encode_y = np.vectorize(class_encoding.get)
# закодируем считанные `y`
train_y = encode_y(raw_train_y)

Class encoding {b'Sound_Drum': 0, b'Sound_Guitar': 1, b'Sound_Piano': 2, b'Sound_Violin': 3}


In [23]:
# генератор функций, которая будет извлекать нужные фичи из кусочков

def create_feature_mapper(n_mfcc: int, n_contrast: int, n_chroma: int):
    def extract_features(chunks: np.ndarray) -> np.ndarray:
        length = n_mfcc + n_contrast + n_mfcc
        n_chunks = chunks.shape[0]
        x = np.ndarray((n_chunks, length))
        for i, chunk in enumerate(chunks):
            c = 0
            if n_mfcc > 0:
                x[i,:n_mfcc] = np.mean(
                    librosa.feature.mfcc(y=chunk, n_mfcc=n_mfcc), 
                    1)
                c += n_mfcc
            if n_contrast > 1:
                x[i, c : c + n_contrast] = np.mean(
                    librosa.feature.spectral_contrast(y=chunk, n_bands=n_contrast - 1),
                    1)
                c += n_contrast
            if n_chroma > 0:
                x[i, c : c + n_chroma] = np.mean(
                    librosa.feature.chroma_cens(y=chunk, n_chroma=n_chroma),
                    1)
        return x.reshape(length * n_chunks)
    return extract_features

In [50]:
mapper1 = create_feature_mapper(10, 7, 6)

In [28]:
train1 = np.array(list(map(mapper1, train_2c_x)))
train1

  return pitch_tuning(


array([[-2.67013083e+02,  2.64743504e+00,  5.98910195e+01, ...,
         2.24494835e-01,  2.24685239e-01,  2.24906156e-01],
       [-3.99005107e+02,  1.87495548e+02, -3.17588595e+01, ...,
         2.24494835e-01,  2.24685239e-01,  2.24906156e-01],
       [-3.12242263e+02,  1.76721839e+02, -3.85841875e+01, ...,
         1.84834065e-02,  1.75508422e-02,  1.66043270e-02],
       ...,
       [-3.11975553e+02,  2.41897188e+01, -1.09888666e+02, ...,
         5.66833063e-01,  5.66870817e-01,  5.66904186e-01],
       [-6.53860950e+02,  1.12662655e+02,  5.13715328e+01, ...,
         2.33843785e-01,  2.33746484e-01,  2.33651006e-01],
       [-5.19174017e+02,  3.46728566e+01, -2.40452995e+01, ...,
         7.91321065e-02,  7.91975249e-02,  7.92984824e-02]])

In [30]:
with h5py.File('../files/test.h5') as f:
    chunks_shape = f["chunks"].shape
    print(f"Loaded {chunks_shape[0]} records, {chunks_shape[1]} "
          f"chunks of length {chunks_shape[2]} each")
    # выберем только первые 2 фрагмента из каждой аудиозаписи
    test_2c_x = np.ndarray((chunks_shape[0], 2, chunks_shape[2]), dtype=float)
    for i, record in enumerate(f["chunks"]):
        test_2c_x[i] = record[0:2, :]
    # выгружаем метки классов
    raw_test_y = np.array(f["classes"])
test_y = encode_y(raw_test_y)

Loaded 80 records, 5 chunks of length 4410 each


In [31]:
test1 = np.array(list(map(mapper1, test_2c_x)))

  return pitch_tuning(


In [49]:
rfc = RandomForestClassifier(n_estimators=300, random_state=42)
rfc.fit(train1, train_y)
predicted_y = rfc.predict(test1)
sum(predicted_y == test_y) / len(test_y)

0.825

In [48]:
svc = svm.SVC(random_state=42)
svc.fit(train1, train_y)
predicted_y = svc.predict(test1)
sum(predicted_y == test_y) / len(test_y)

0.7625

In [53]:
kmeans = KMeans(n_clusters=4).fit(train1)
kmeans.labels_



array([0, 3, 0, ..., 0, 1, 3], dtype=int32)

In [15]:
from itertools import permutations

def cluster_accuracy(clusters, gt, n_classes):
    accuracies = []
    for p in permutations(range(n_classes)):
        mapped = np.vectorize({i: x for i, x in enumerate(p)}.get)(clusters)
        accuracies.append(sum(mapped == gt))
    return max(accuracies) / len(gt)

In [61]:
cluster_accuracy(kmeans.labels_, train_y, 4)

0.4341704718417047

In [63]:
with h5py.File('../files/mapped_data1.h5', 'w') as f:
    # train_x и test_x - выделенные фичи (вычисленные коэффициенты), train_y и test_y - метки 
    f.create_dataset("train_x", data=train1)
    f.create_dataset("train_y", data=train_y)
    f.create_dataset("test_x", data=test1)
    f.create_dataset("test_y", data=test_y)
    f.attrs['n_mfcc'] = 10
    f.attrs['n_contrast'] = 7
    f.attrs['n_chroma'] = 6
    f.attrs['n_chunks'] = 2

In [7]:
with h5py.File('../files/mapped_data1.h5') as f:
    train1  = np.array(f["train_x"])
    train_y = np.array(f["train_y"])
    test1   = np.array(f["test_x"])
    test_y  = np.array(f["test_y"])

In [5]:
from sklearn import preprocessing, metrics

In [8]:
scaler = preprocessing.StandardScaler()
scaled_x = scaler.fit_transform(train1)

In [13]:
kmeans = KMeans(n_clusters=4).fit(scaled_x)
kmeans.labels_



array([2, 1, 3, ..., 3, 0, 3], dtype=int32)

In [16]:
cluster_accuracy(kmeans.labels_, train_y, 4)

0.6301369863013698

In [18]:
pd.crosstab(kmeans.labels_, train_y)

col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,258,135,9,1
1,25,303,438,7
2,397,104,51,6
3,20,158,30,686
